├── .gitignore ├── AlgorithmsComparator.py ├── data_management ├── DataLoader.py ├── StreamGenerator.py └── __init__.py ├── demo.py ├── drift_detection_methods ├── __init__.py └── spc.py ├── ensemble_methods ├── DWM.py ├── SEA.py ├── __init__.py ├── ddd.py └── online_bagging.py ├── offline_methods ├── OfflineAlgorithmsWrapper.py └── __init__.py └── training_windows_methods ├── AdaptiveSVC.py ├── __init__.py └── test_AdaptiveSVC.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Bibliography of the project 2 | bibliography/ 3 | 4 | # Datasets 5 | data/ 6 | 7 | # Figures 8 | figures/ 9 | 10 | # Jupyter notebooks 11 | notebooks/ 12 | .ipynb_checkpoints/ 13 | *.ipynb 14 | 15 | # PyCharm directory 16 | .idea/ 17 | 18 | # Pycache 19 | __pycache__/* 20 | *.pyc 21 | -------------------------------------------------------------------------------- /AlgorithmsComparator.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | class AlgorithmsComparator: 10 | def __init__(self, algorithms, stream_generator): 11 | """ Constructor of AlgorithmsComparator 12 | 13 | :param algorithms: iterable of tuples (algorithm_name, algorithm) 14 | :param stream_generator: instance of StreamGenerator 15 | """ 16 | 17 | self.algorithms = algorithms 18 | self.stream_generator = stream_generator 19 | 20 | self.predictions = dict() 21 | self.accuracies = defaultdict(list) 22 | self.precisions = defaultdict(list) 23 | self.recalls = defaultdict(list) 24 | self.f1_scores = defaultdict(list) 25 | self.X = None 26 | self.y = None 27 | 28 | self.time_to_update = defaultdict(list) 29 | self.time_to_predict = defaultdict(list) 30 | 31 | def _set_batch(self, X, y): 32 | """ Set X and y for current batch""" 33 | self.X = X 34 | self.y = y 35 | 36 | def _update_algorithms(self): 37 | """ Update algorithms with self.X and self.y """ 38 | for algorithm_name, algorithm in self.algorithms: 39 | print("\t\tAlgorithm {} ... ".format(algorithm_name), end="", flush=True) 40 | start_timer = time.clock() 41 | algorithm.update(self.X, self.y) 42 | end_timer = time.clock() 43 | time_to_update = end_timer - start_timer 44 | self.time_to_update[algorithm_name].append(time_to_update) 45 | print("OK. Time to update on this batch: {0:.3f} seconds".format(time_to_update)) 46 | 47 | def _predict_algorithms(self): 48 | """ Make the predictions for each algorithm on self.X""" 49 | for algorithm_name, algorithm in self.algorithms: 50 | print("\t\tAlgorithm {} ... ".format(algorithm_name), end="", flush=True) 51 | start_timer = time.clock() 52 | self.predictions[algorithm_name] = algorithm.predict(self.X) 53 | end_timer = time.clock() 54 | time_to_predict = end_timer - start_timer 55 | self.time_to_predict[algorithm_name].append(time_to_predict) 56 | print("OK. 
Time to predict on this batch: {0:.3f} seconds".format(time_to_predict)) 57 | 58 | def _evaluate_algorithms(self): 59 | """ Evaluate the performance of the algorithms on current batch""" 60 | for algorithm_name, algorithm in self.algorithms: 61 | # compute metrics 62 | accuracy = accuracy_score(self.y, self.predictions[algorithm_name]) 63 | precision = precision_score(self.y, self.predictions[algorithm_name]) 64 | recall = recall_score(self.y, self.predictions[algorithm_name]) 65 | f1 = f1_score(self.y, self.predictions[algorithm_name]) 66 | 67 | # add scores to dictionaries 68 | self.accuracies[algorithm_name].append(accuracy) 69 | self.precisions[algorithm_name].append(precision) 70 | self.recalls[algorithm_name].append(recall) 71 | self.f1_scores[algorithm_name].append(f1) 72 | 73 | def _plot(self, show_plot): 74 | """ Create the different plots """ 75 | # create 2*2 subplots 76 | fig, ax = plt.subplots(2, 2, figsize=(15, 10)) 77 | accuracy_fig = ax[0, 0] 78 | precision_fig = ax[0, 1] 79 | recall_fig = ax[1, 0] 80 | f1_fig = ax[1, 1] 81 | 82 | for algorithm_name, algorithm in self.algorithms: 83 | accuracy_fig.plot(self.accuracies[algorithm_name], label=algorithm_name) 84 | precision_fig.plot(self.precisions[algorithm_name], label=algorithm_name) 85 | recall_fig.plot(self.recalls[algorithm_name], label=algorithm_name) 86 | f1_fig.plot(self.f1_scores[algorithm_name], label=algorithm_name) 87 | 88 | # set title 89 | accuracy_fig.set_title("Accuracies over time") 90 | precision_fig.set_title("Precisions over time") 91 | recall_fig.set_title("Recalls over time") 92 | f1_fig.set_title("F1 scores over time") 93 | 94 | # locate legend 95 | accuracy_fig.legend(loc="best") 96 | precision_fig.legend(loc="best") 97 | recall_fig.legend(loc="best") 98 | f1_fig.legend(loc="best") 99 | 100 | # set figures' limits 101 | # accuracy_fig.set_ylim(0.7, 1.0) 102 | # precision_fig.set_ylim(0.7, 1.0) 103 | # recall_fig.set_ylim(0.7, 1.0) 104 | # f1_fig.set_ylim(0.7, 1.0) 105 | 106 | # set x-axis labels 107 | accuracy_fig.set_xlabel("Batch number") 108 | precision_fig.set_xlabel("Batch number") 109 | recall_fig.set_xlabel("Batch number") 110 | f1_fig.set_xlabel("Batch number") 111 | 112 | # save figure 113 | plt.savefig("figures/plots_{}.png".format(time.strftime("%Y%m%d_%H%M%S")), format="png") 114 | 115 | # show plot if needed 116 | if show_plot: 117 | plt.show() 118 | 119 | def plot_comparison(self, batch_size, stream_length=1e8, show_plot=True): 120 | """ Main method of AlgorithmsComparator: Simulate data stream and plot the performances of each algorithm""" 121 | print("\nLet is begin plot_comparison", end="\n" * 2) 122 | # first training on historical data 123 | X_train, y_train = self.stream_generator.get_historical_data() 124 | print("Historical Data") 125 | self._set_batch(X_train, y_train) 126 | print("\tUpdate on historical data") 127 | self._update_algorithms() 128 | 129 | # simulate data streaming 130 | for batch_nb, (X, y) in enumerate( 131 | self.stream_generator.generate(batch_size=batch_size, stream_length=stream_length)): 132 | print("Batch #{}".format(batch_nb)) 133 | # set current batch's X and y 134 | self._set_batch(X, y) 135 | 136 | # predict current batch, evaluate the performances and update the algorithms 137 | print("\tPrediction #{}".format(batch_nb)) 138 | self._predict_algorithms() 139 | print("\tEvaluation #{}".format(batch_nb)) 140 | self._evaluate_algorithms() 141 | print("\tUpdate #{}".format(batch_nb)) 142 | self._update_algorithms() 143 | 144 | print("Mean time to update") 
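        # The loop above follows a prequential (test-then-train) protocol: each
        # batch is first predicted and scored, and only then fed to every
        # algorithm's update(). The figures printed below are per-batch times
        # averaged over the whole run.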
145 | for algorithm_name, _ in self.algorithms: 146 | print( 147 | "\t{0}: {1:.3f} seconds".format(algorithm_name, np.mean(np.array(self.time_to_update[algorithm_name])))) 148 | 149 | print("Mean time to predict") 150 | for algorithm_name, _ in self.algorithms: 151 | print("\t{0}: {1:.3f} seconds".format(algorithm_name, 152 | np.mean(np.array(self.time_to_predict[algorithm_name])))) 153 | 154 | # make the plots 155 | self._plot(show_plot) 156 | -------------------------------------------------------------------------------- /data_management/DataLoader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler 8 | 9 | HEADER_NAMES = { 10 | 'SEA': [ 11 | 'attribute_1', 12 | 'attribute_2', 13 | 'attribute_3', 14 | 'label' 15 | ], 16 | 'KDD': [ 17 | 'duration', 18 | 'protocol_type', 19 | 'service', 20 | 'flag', 21 | 'src_bytes', 22 | 'dst_bytes', 23 | 'land', 24 | 'wrong_fragment', 25 | 'urgent', 26 | 'hot', 27 | 'num_failed_logins', 28 | 'logged_in', 29 | 'num_compromised', 30 | 'root_shell', 31 | 'su_attempted', 32 | 'num_root', 33 | 'num_file_creations', 34 | 'num_shells', 35 | 'num_access_files', 36 | 'num_outbound_cmds', 37 | 'is_host_login', 38 | 'is_guest_login', 39 | 'count', 40 | 'srv_count', 41 | 'serror_rate', 42 | 'srv_serror_rate', 43 | 'rerror_rate', 44 | 'srv_rerror_rate', 45 | 'same_srv_rate', 46 | 'diff_srv_rate', 47 | 'srv_diff_host_rate', 48 | 'dst_host_count', 49 | 'dst_host_srv_count', 50 | 'dst_host_same_srv_rate', 51 | 'dst_host_diff_srv_rate', 52 | 'dst_host_same_src_port_rate', 53 | 'dst_host_srv_diff_host_rate', 54 | 'dst_host_serror_rate', 55 | 'dst_host_srv_serror_rate', 56 | 'dst_host_rerror_rate', 57 | 'dst_host_srv_rerror_rate', 58 | 'label' 59 | ], 60 | } 61 | 62 | 63 | class DataLoader: 64 | def __init__(self, data_path, percentage_historical_data=0.2): 65 | self.data_path = data_path 66 | self.percentage_historical_data = percentage_historical_data 67 | self.X = None 68 | self.y = None 69 | self.X_historical = None 70 | self.y_historical = None 71 | self.list_classes = None 72 | 73 | def return_data(self): 74 | """ 75 | The data which is used for the streaming part emulation. 76 | :return: Tuple X and y 77 | """ 78 | return self.X, self.y 79 | 80 | def return_historical_data(self): 81 | """ 82 | The historical used for training the model before going online. 83 | :return: 84 | """ 85 | return self.X_historical, self.y_historical 86 | 87 | def split_data(self): 88 | """ 89 | Split the dataset based on the percentage given in argument (percentage_historical_data) 90 | """ 91 | number_histocal_data = int(self.percentage_historical_data * len(self.X)) 92 | self.X_historical = self.X[:number_histocal_data] 93 | self.y_historical = self.y[:number_histocal_data] 94 | self.X = self.X[number_histocal_data + 1:] 95 | self.y = self.y[number_histocal_data + 1:] 96 | 97 | def normalization(self): 98 | """ 99 | Normalized the data based on the historical data. Since we study concept drift we prefer to use a MinMax 100 | normalisation. 
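        The scaler is fit on the historical split only and then applied with the
        same parameters to the streaming split, so no information from later
        (possibly drifted) batches leaks into the offline training data.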
101 | """ 102 | mms = MinMaxScaler() 103 | self.X_historical = mms.fit_transform(self.X_historical) 104 | self.X = mms.transform(self.X) 105 | 106 | def save_data(self, path): 107 | if not os.path.exists(path): 108 | with open(self.data_path, 'wb') as data_file: 109 | data = {'X': self.X, 'y': self.y, 'X_historical': self.X_historical, 'y_historical': self.y_historical} 110 | pickle.dump(data, data_file, protocol=pickle.HIGHEST_PROTOCOL) 111 | 112 | def load_from_pickle(self): 113 | with open(self.data_path, 'rb') as data_file: 114 | data = pickle.load(data_file) 115 | self.X = data['X'] 116 | self.y = data['y'] 117 | self.X_historical = data['X_historical'] 118 | self.y_historical = data['y_historical'] 119 | 120 | def get_classes(self): 121 | return self.list_classes 122 | 123 | 124 | class SEALoader(DataLoader): 125 | def __init__(self, sea_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2): 126 | DataLoader.__init__(self, sea_data_path, percentage_historical_data=percentage_historical_data) 127 | if use_pickle_for_loading: 128 | self.load_from_pickle() 129 | else: 130 | sea_df = pd.read_csv(self.data_path, header=None, names=HEADER_NAMES['SEA']) 131 | sea_data = sea_df.values 132 | self.X = sea_data[:, 1:3] 133 | self.y = sea_data[:, -1] 134 | self.list_classes = np.unique(self.y) 135 | DataLoader.split_data(self) 136 | DataLoader.normalization(self) 137 | # Normalization 138 | mms = MinMaxScaler() 139 | self.X = mms.fit_transform(self.X) 140 | 141 | 142 | class KDDCupLoader(DataLoader): 143 | """ 144 | This data set was used in KDD Cup 1999 Competition (Frank and Asuncion, 2010). The full dataset has about five 145 | million connection records, this is a set with only 10 % of the size. The original task has 24 training attack 146 | types. The original labels of attack types are changed to label abnormal in our experiments and we keep the label 147 | normal for normal connection. This way we simplify the set to two class problem. 148 | """ 149 | def __init__(self, kdd_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2, dummies=True): 150 | ''' 151 | 152 | :param kdd_data_path: 153 | :param use_pickle_for_loading: You have registered a pickle file 154 | :param percentage_historical_data: Percentage of data to use for the historical training. 155 | :param dummies: If true convert categorical variable into dummy/indicator variables (one-hot encoded). 156 | Use dummies equal false when your learning algorithm is DecisionTree. 
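        For instance, KDDCupLoader('data/kddcup.data_10_percent', dummies=False)
        keeps the categorical columns label-encoded rather than one-hot encoded,
        which suits tree-based learners.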
157 | :return: 158 | ''' 159 | DataLoader.__init__(self, kdd_data_path, percentage_historical_data=percentage_historical_data) 160 | if use_pickle_for_loading: 161 | self.load_from_pickle() 162 | else: # TODO shorten the following lines of code 163 | kdd_df = pd.read_csv( 164 | self.data_path, 165 | index_col=False, 166 | delimiter=',', 167 | header=None, 168 | names=HEADER_NAMES['KDD'] 169 | ) 170 | # TODO (minor) : Do not load these 2 columns at first 171 | useless_features = ["num_outbound_cmds", "is_host_login"] 172 | kdd_df = kdd_df.drop(useless_features, axis=1) 173 | 174 | # Handle symbolic data 175 | symbolic = [ 176 | "protocol_type", 177 | "service", 178 | "flag", 179 | "label" 180 | ] 181 | 182 | self.symbolic_df = kdd_df[symbolic] 183 | if dummies: 184 | symbolic_df_without_label = self.symbolic_df[self.symbolic_df.columns.difference(['label'])] 185 | dummies_df = pd.get_dummies(symbolic_df_without_label) 186 | non_categorical = kdd_df[kdd_df.columns.difference(symbolic)].values 187 | # Create X 188 | self.X = np.concatenate((non_categorical, dummies_df.values), axis=1) 189 | # Create y 190 | label = self.symbolic_df['label'].values 191 | self.y = LabelEncoder().fit_transform(label) 192 | self.list_classes = np.unique(self.y) 193 | DataLoader.split_data(self) 194 | DataLoader.normalization(self) 195 | else: 196 | self.__encode_symbolic_df() 197 | kdd_df[symbolic] = self.symbolic_df 198 | self.X = kdd_df[kdd_df.columns.difference(['label'])].values 199 | self.y = kdd_df['label'].values 200 | self.list_classes = np.unique(self.y) 201 | DataLoader.split_data(self) 202 | 203 | def __encode_symbolic_df(self): 204 | self.symbolic_encoder = defaultdict(LabelEncoder) 205 | # Encode the symbolic variables 206 | self.symbolic_df = self.symbolic_df.apply(lambda x: self.symbolic_encoder[x.name].fit_transform(x)) 207 | 208 | def inverse_encode_symbolic_df(self): 209 | self.symbolic_df.apply(lambda x: self.symbolic_encoder[x.name].inverse_transform(x)) 210 | 211 | 212 | class UsenetLoader(DataLoader): 213 | ''' 214 | Text dataset, inspired by Katakis et al. (2010), is a simulation of news filtering with a concept drift related to 215 | the change of interest of a user over time. For this purpose we use the data from 20 Newsgroups (Rennie, 2008) and 216 | handle it as follows. There are six topics chosen and the simulated user in each concept is subscribed to mailing 217 | list of four of them being interested only in two. Over time the virtual user decides to unsubscribe from those 218 | groups that he was not interested in and subscribe for two new ones that he becomes interested in. The previously 219 | interesting topics become out of his main interest. The Table 1 summarizes the concepts. Note that the topics of 220 | interest are repeated to simulate recurring concepts. The original dataset is divided into train and test. Data from 221 | train appears in the first three concepts whereas data from test is in the last three (recurring) concepts. 222 | The data is preprocessed with tm (Feinerer, 2010) package for R keeping only attributes (words) longer than three 223 | letters and with minimal document frequency greater than three. Moreover, from the remaining only those that are 224 | informative are kept (entropy > 75 x 10-5 ). Attribute values are binary indicating the presence or absence of the 225 | respective word. At the end the set has 659 attributes and 5,931 examples. 
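    When the CSV is read, the textual attribute values are mapped to numbers
    (yes/t -> 1, no/f -> 0; the stray 'tt' value present in the file is treated
    as 1) before the historical/stream split is made.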
226 | ''' 227 | 228 | def __init__(self, sea_data_path, use_pickle_for_loading=False, percentage_historical_data=0.2): 229 | DataLoader.__init__(self, sea_data_path, percentage_historical_data=percentage_historical_data) 230 | if use_pickle_for_loading: 231 | self.load_from_pickle() 232 | else: 233 | usenet_df = pd.read_csv(self.data_path, header=None) 234 | d = {'no': 0., 'yes': 1., 't': 1., 'f': 0., 'tt': 1} # tt = error in the df 235 | usenet_data = usenet_df.replace(d).values 236 | self.X = usenet_data[:, :-1] 237 | self.y = usenet_data[:, -1] 238 | self.list_classes = np.unique(self.y) 239 | DataLoader.split_data(self) 240 | 241 | -------------------------------------------------------------------------------- /data_management/StreamGenerator.py: -------------------------------------------------------------------------------- 1 | class StreamGenerator: 2 | """ 3 | Emulate a stream of data for online learning algorithm 4 | """ 5 | def __init__(self, data_loader): 6 | """ 7 | Constructor of the StreamGenerator 8 | :param data_loader: Loader which inherits DataLoader 9 | """ 10 | self.data_loader = data_loader 11 | 12 | def get_historical_data(self): 13 | """ 14 | :return: A tuple X_historical_data and y_historical_data 15 | """ 16 | return self.data_loader.return_historical_data() 17 | 18 | def generate(self, stream_length=1e8, batch_size=1): 19 | """ 20 | Generator of streaming data 21 | :param stream_length: How many example do you want to see 22 | :param batch_size: batch size is one by default you can fixed it or randomized it. 23 | :return: A tuple X,y 24 | """ 25 | X, y = self.data_loader.return_data() 26 | X_length = len(X) 27 | if stream_length > X_length: 28 | stream_length = X_length 29 | 30 | for i in range(0, stream_length, batch_size): 31 | yield X[i:i + batch_size], y[i:i + batch_size] 32 | -------------------------------------------------------------------------------- /data_management/__init__.py: -------------------------------------------------------------------------------- 1 | from .DataLoader import SEALoader, KDDCupLoader, UsenetLoader 2 | from .StreamGenerator import StreamGenerator 3 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LogisticRegression, SGDClassifier 2 | from sklearn.svm import SVC 3 | from sklearn.tree import DecisionTreeClassifier 4 | 5 | from AlgorithmsComparator import AlgorithmsComparator 6 | from data_management.DataLoader import UsenetLoader, SEALoader, KDDCupLoader 7 | from data_management.StreamGenerator import StreamGenerator 8 | from drift_detection_methods.spc import DDM 9 | from ensemble_methods import SEA, DWM, OnlineBagging, DDD, DiversityWrapper 10 | from offline_methods import OfflineAlgorithmsWrapper 11 | from training_windows_methods import AdaptiveSVC 12 | 13 | # generate SEA concepts data 14 | sea_loader = SEALoader('data/sea.data', percentage_historical_data=0.2) 15 | list_classes = sea_loader.get_classes() 16 | sea_generator = StreamGenerator(sea_loader) 17 | 18 | # generate KDD data 19 | kdd_loader = KDDCupLoader('data/kddcup.data_10_percent', percentage_historical_data=0.2) 20 | list_classes = kdd_loader.get_classes() 21 | kdd_generator = StreamGenerator(kdd_loader) 22 | 23 | usenet_loader = UsenetLoader('data/usenet_recurrent3.3.data', percentage_historical_data=0.1) 24 | list_classes = usenet_loader.get_classes() 25 | usenet_generator = 
StreamGenerator(usenet_loader) 26 | 27 | # models 28 | SEA_decision_trees = SEA(10, list_classes=list_classes, 29 | base_estimator=OfflineAlgorithmsWrapper(DecisionTreeClassifier())) 30 | SEA_SVC = SEA(10, base_estimator=OfflineAlgorithmsWrapper(SVC())) 31 | adaptive_SVC = AdaptiveSVC(C=100, memory_limit=465) 32 | decision_tree = OfflineAlgorithmsWrapper(base_estimator=DecisionTreeClassifier()) 33 | 34 | # Online Bagging 35 | bagging_high_diversity = OnlineBagging(lambda_diversity=0.1, n_classes=list_classes, n_estimators=25) 36 | bagging_low_diversity = OnlineBagging(lambda_diversity=1, n_classes=list_classes, n_estimators=25) 37 | 38 | # DDD with Sea 39 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1e4} 40 | log_high_diversity = DiversityWrapper(lambda_diversity=0.1, 41 | list_classes=list_classes, 42 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 43 | log_low_diversity = DiversityWrapper(lambda_diversity=1, 44 | list_classes=list_classes, 45 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 46 | ddd_sea_log_reg = SEA 47 | p_sea_high_div = { 48 | 'base_estimator': log_high_diversity, 49 | 'n_estimators': 25, 50 | 'list_classes': list_classes 51 | } 52 | p_sea_low_div = { 53 | 'base_estimator': log_low_diversity, 54 | 'n_estimators': 25, 55 | 'list_classes': list_classes 56 | } 57 | ddd = DDD(ensemble_method=ddd_sea_log_reg, drift_detector=DDM, pl=p_sea_high_div, ph=p_sea_low_div) 58 | 59 | # DDD with online Bagging 60 | clf = OnlineBagging 61 | p_clf_high = {'lambda_diversity': 0.1, 62 | 'n_classes': list_classes, 63 | 'n_estimators': 25, 64 | 'base_estimator': SGDClassifier, 65 | 'p_estimators': {'loss': 'log'} # We cannot predict_proba with the hinge loss 66 | } 67 | p_clf_low = {'lambda_diversity': 1, 68 | 'n_classes': list_classes, 69 | 'n_estimators': 25, 70 | 'base_estimator': SGDClassifier, 71 | 'p_estimators': {'loss': 'log'} # We cannot predict_proba with the hinge loss 72 | } 73 | ddd_online_bagging = DDD(ensemble_method=clf, drift_detector=DDM, pl=p_clf_low, ph=p_clf_high) 74 | 75 | # DWM 76 | beta = 0.8 77 | theta = 0.01 78 | period = 5 79 | DWM_decision_trees = DWM(beta, theta, period, OfflineAlgorithmsWrapper(DecisionTreeClassifier())) 80 | DWM_SVC = DWM(beta, theta, period, base_estimator=OfflineAlgorithmsWrapper(SVC(probability=True))) 81 | 82 | dwm_log_high_diversity = DiversityWrapper(lambda_diversity=0.1, 83 | list_classes=list_classes, 84 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 85 | dwm_log_low_diversity = DiversityWrapper(lambda_diversity=1, 86 | list_classes=list_classes, 87 | base_estimator=LogisticRegression(**PARAM_LOG_REG)) 88 | dwm_log_reg = DWM 89 | p_dwm_high_div = { 90 | 'beta': 0.8, 91 | 'theta': 0.01, 92 | 'period': 5, 93 | 'base_estimator': dwm_log_high_diversity, 94 | 'list_classes': list_classes 95 | } 96 | p_dwm_low_div = { 97 | 'beta': 0.8, 98 | 'theta': 0.01, 99 | 'period': 5, 100 | 'base_estimator': dwm_log_low_diversity, 101 | 'list_classes': list_classes 102 | } 103 | 104 | ddd_dwm_log_reg = DDD(ensemble_method=dwm_log_reg, drift_detector=DDM, pl=p_dwm_high_div, ph=p_dwm_low_div) 105 | 106 | algorithms = [ 107 | # ("SEA (Decision Tree)", SEA_decision_trees), 108 | # ("Offline decision tree", decision_tree), 109 | # ("SEA (SVC)", SEA_SVC), 110 | # ("Adaptive SVC", adaptive_SVC), 111 | # ("Bagging low div (LogReg)", bagging_low_diversity), 112 | # ("Bagging high div (LogReg)", bagging_high_diversity), 113 | ("DDD (Online bagging)", ddd_online_bagging), 114 | ("DDD (DWM logreg)", ddd_dwm_log_reg), 115 | # ("DDD (SEA 
LogReg)", ddd) 116 | # ("DWM (Decision Tree)", DWM_decision_trees), 117 | # ("DWM (SVC)", DWM_SVC), 118 | ] 119 | 120 | #comparison of algorithms on SEA concepts 121 | # print("\nDataset: SEA concepts") 122 | # comparator = AlgorithmsComparator(algorithms, sea_generator) 123 | # comparator.plot_comparison(batch_size=3000, stream_length=48000) 124 | 125 | #comparison of algorithms on KDD dataset 126 | print("\nDataset: KDD") 127 | comparator = AlgorithmsComparator(algorithms, kdd_generator) 128 | comparator.plot_comparison(batch_size=3000, stream_length=480000) 129 | 130 | # print("\n Dataset: Usenet") 131 | # comparator = AlgorithmsComparator(algorithms, usenet_generator) 132 | # comparator.plot_comparison(batch_size=50, stream_length=6000) 133 | -------------------------------------------------------------------------------- /drift_detection_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanRxl/concept_drift/de08fa7e72b16c0620d6a548e6cee32fe445c17d/drift_detection_methods/__init__.py -------------------------------------------------------------------------------- /drift_detection_methods/spc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | ''' 4 | Detectors based on statistical process control. 5 | SPC considers learning as a process and monitors the evolution of this process. 6 | ''' 7 | 8 | 9 | class DDM: 10 | ''' 11 | This class follows the article: 12 | Gama, J., Medas, P., Castillo, G., Rodrigues, P.: Learning with drift detection. 13 | Lecture Notes in Computer Science 3171 (2004) 14 | ''' 15 | 16 | def __init__(self, verbose=False): 17 | self.verbose = verbose 18 | self.pmin = 10e8 19 | self.smin = 10e8 20 | self.t = 0 # number of examples seen 21 | self.pi = 1 # error-rate 22 | self.si = 0 # standard deviation 23 | self.psi = 10e8 24 | self.ctr = 0 25 | 26 | def reset_after_drift(self): 27 | self.pmin = 10e8 28 | self.smin = 10e8 29 | self.t = 0 # number of examples seen 30 | self.pi = 1 # error-rate 31 | self.si = 0 # standard deviation 32 | self.psi = 10e8 33 | 34 | def __update(self, y_true, y_pred): 35 | self.t += 1 # update the number of items seen 36 | good_predictions = y_pred == y_true 37 | error_rate = 1 - good_predictions 38 | self.pi += (error_rate - self.pi) / self.t 39 | self.si = np.sqrt(self.pi * (1 - self.pi) / self.t) 40 | 41 | if self.t > 30 and self.pi + self.si <= self.psi: 42 | self.pmin = self.pi 43 | self.smin = self.si 44 | self.psi = self.si + self.pi 45 | 46 | def drift_detection(self, y_true, y_pred): 47 | self.ctr += len(y_true) 48 | for yt, yp in zip(y_true, y_pred): 49 | drift = self.__drift_detection_lonely_example(yt, yp) 50 | if drift: 51 | return True 52 | return False 53 | 54 | def __drift_detection_lonely_example(self, y_true, y_pred): 55 | self.__update(y_true, y_pred) 56 | if self.t > 30 and self.pi + self.si >= self.pmin + 3 * self.smin: 57 | if self.verbose: 58 | print('Drift detected: time_step={0}'.format(self.ctr)) 59 | self.reset_after_drift() 60 | return True 61 | elif self.pmin + 2 * self.smin <= self.pi + self.si < self.pmin + 3 * self.smin: 62 | if self.verbose: 63 | print('Warning a drift may happens: time_step={0}'.format(self.ctr)) 64 | return False 65 | else: 66 | return False 67 | 68 | 69 | # TODO implement EDDM 70 | class EDDM(DDM): 71 | ''' 72 | This class is an implementation of the following algorithm: 73 | BAENA-GARCIA, Manuel, DEL CAMPO-ÁVILA, José, FIDALGO, Raúl, et 
al. 74 | Early drift detection method 75 | In : Fourth international workshop on knowledge discovery from data streams. 2006. p. 77-86. 76 | http://www.cs.upc.edu/~abifet/EDDM.pdf 77 | ''' 78 | 79 | def __init__(self, verbose=False): 80 | DDM.__init__(self, verbose=verbose) 81 | 82 | def drift_detection(self, y_true, y_pred): 83 | pass 84 | -------------------------------------------------------------------------------- /ensemble_methods/DWM.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from sklearn.metrics.classification import accuracy_score 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | 9 | class DWM: 10 | """ This class implements the DWM algorithm based on the article "Dynamic Weighted Majority: A New Ensemble Method for Tracking Concept Drift" by Jeremy Z. Kolter and Marcus A. Maloof """ 11 | 12 | def __init__(self, beta, theta, period, base_estimator=None, list_classes=None): 13 | # For noisy problems, a period parameter can be added 14 | """ Constructor of DWM 15 | 16 | :param base_estimator: instance of a classifier class (by default sklearn.tree.DecisionTreeClassifier()) 17 | :param beta : multiplier affecting the weight every time a classifier get the prediction wrong 18 | :param theta: threshold to remove the classifier from the list 19 | """ 20 | if base_estimator is None: 21 | self.base_estimator = DecisionTreeClassifier() 22 | else: 23 | self.base_estimator = base_estimator 24 | 25 | self.list_classifiers = [] 26 | self.new_classifier = None 27 | self.classifier_to_evaluate = None 28 | self.list_classes = list_classes 29 | self.weights = [] 30 | self.theta = theta 31 | self.beta = beta 32 | self.period = period 33 | self.step = 0 34 | 35 | def update(self, X, y): 36 | """ Update the ensemble of models 37 | 38 | :param X: new batch X 39 | :param y: array of labels 40 | """ 41 | self.step += 1 42 | 43 | # retrieve list of different classes if it is the first time we fit data 44 | if self.list_classes is None: 45 | self.list_classes = np.unique(y) 46 | 47 | # train new classifier 48 | self.new_classifier = deepcopy(self.base_estimator) 49 | self.new_classifier.update(X, y) 50 | 51 | # if there is not enough classifiers, add the new classfier in the ensemble 52 | if len(self.list_classifiers) == 0: 53 | self.list_classifiers.append(self.new_classifier) 54 | self.weights.append(1) 55 | # Otherwise, we lower the weights on the lower classifiers, multiplying them by beta 56 | # Once the weights are lowered, we remove the classifiers with weights under the threshold theta 57 | elif self.step > 0 and self.step % self.period == 0: 58 | # On each update, we'll use two empty lists to store the classifiers/weighs that pass the tests 59 | # Once the tests are ran on all classifiers/weights, that new list become the main one 60 | self.newlist_classifiers = [] 61 | self.newWeights = [] 62 | 63 | for clf, weight in zip(self.list_classifiers, self.weights): 64 | # If the prediction is untrue but the classifier still has enough weight, we'll keep him 65 | if np.sum(clf.predict(X) != y) > 250: 66 | if weight * (self.beta) > self.theta: 67 | self.newWeights.append(round(weight * (self.beta), 2)) 68 | self.newlist_classifiers.append(clf) 69 | else: 70 | self.newWeights.append(round(weight, 2)) 71 | self.newlist_classifiers.append(clf) 72 | 73 | self.weights = deepcopy(self.newWeights) 74 | 75 | self.list_classifiers = 
deepcopy(self.newlist_classifiers) 76 | # The step is finished by normalizing the weight vector 77 | norm = np.max(self.weights) 78 | self.weights = [weight / norm for weight in self.weights] 79 | # Now let's vote with the new weights 80 | # If the decision is still not correct, then we'll add a new classifier 81 | 82 | 83 | """ 84 | # make the prediction for each classifier 85 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 86 | 87 | # for each class, count the number of times the class is predicted 88 | nb_votes_by_class = [] 89 | for i in range(self.list_classes): 90 | nb_votes_by_class.append(0) 91 | for j in range(len(self.list_classes)): 92 | if predictions[j] == self.list_classes[i]: 93 | nb_votes_by_class[i] += self.weights[j] 94 | """ 95 | # for each example, return the class which was predicted the most 96 | # If the prediction is incorrect, then add a new classifier 97 | 98 | if np.any(self.predict(X) != y): 99 | # Train and add the new classifier 100 | self.new_classifier = deepcopy(self.base_estimator) 101 | self.new_classifier.update(X, y) 102 | self.list_classifiers.append(self.new_classifier) 103 | # Add the matching weight 104 | self.weights.append(1) 105 | """ 106 | for clf, weight in zip(self.list_classifiers, self.weights): 107 | # If the prediction is untrue but the classifier still has enough weight, we'll keep him 108 | print(self.weights, (self.beta) ** np.sum(clf.predict(X) != y), 109 | weight * ((self.beta) ** np.sum(clf.predict(X) != y))) 110 | """ 111 | 112 | """ 113 | def predict(self, X): 114 | # Make the prediction 115 | 116 | # :param X: examples to predict 117 | # :return: the prediction y_predict 118 | # make the prediction for each classifier 119 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 120 | 121 | # for each class, count the number of times the class is predicted 122 | nb_votes_by_class = [] 123 | for c in self.list_classes: 124 | nb_votes_by_class.append(0) 125 | for prediction, weight in zip(predictions, self.weights): 126 | print(prediction, c) 127 | if prediction == c: 128 | nb_votes_by_class[len(nb_votes_by_class)] += weight 129 | 130 | # for each example, return the class which was predicted the most 131 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 132 | """ 133 | 134 | def predict(self, X): 135 | """ Compute the probability of belonging to each class 136 | :param X: examples to predict 137 | :return: the probabilities, array of shape (n_examples, n_classes) 138 | """ 139 | # create empty array to retrieve 140 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 141 | 142 | # iterate over the classifiers and add the probabilities to the previous array 143 | for i, clf in enumerate(self.list_classifiers): 144 | array_probas[:, :, i] = clf.predict_proba(X) 145 | 146 | # compute and return the mean of probas computed by each classifier 147 | probs = np.average(array_probas, axis=2, weights=self.weights) 148 | return self.list_classes[np.argmax(probs, axis=1)] 149 | 150 | def predict_proba(self, X): 151 | """ Compute the probability of belonging to each class 152 | 153 | :param X: examples to predict 154 | :return: the probabilities, array of shape (n_examples, n_classes) 155 | """ 156 | # create empty array to retrieve 157 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 158 | 159 | # iterate over the classifiers and add the probabilities to the previous array 160 | for i, clf in 
enumerate(self.list_classifiers): 161 | array_probas[:, :, i] = clf.predict_proba(X) 162 | 163 | # compute and return the mean of probas computed by each classifier 164 | return np.average(array_probas, axis=2, weights=self.weights) 165 | 166 | 167 | if __name__ == "__main__": 168 | from data_management import SEALoader, StreamGenerator 169 | from sklearn.svm import SVC 170 | from offline_methods import OfflineAlgorithmsWrapper 171 | 172 | # generate data 173 | loader = SEALoader('../data/sea.data') 174 | generator = StreamGenerator(loader) 175 | 176 | # model 177 | beta = 0.5 178 | theta = 0.1 179 | period = 3 180 | clf = DWM(base_estimator=OfflineAlgorithmsWrapper(SVC(probability=True)), beta=beta, theta=theta, period=period) 181 | 182 | # record scores 183 | accuracy_results = [] 184 | 185 | for i, (X, y) in enumerate(generator.generate(batch_size=3000)): 186 | print("Batch #%d:" % i) 187 | print("update model\n") 188 | delete = i % period != 0 189 | clf.update(X, y) 190 | # predict 191 | print("predict for current X") 192 | y_predict = clf.predict(X) 193 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 194 | accuracy_results.append(accuracy_score(y, y_predict)) 195 | 196 | plt.plot(accuracy_results) 197 | plt.ylabel('Accuracy Results') 198 | plt.show() 199 | -------------------------------------------------------------------------------- /ensemble_methods/SEA.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | from sklearn.metrics.classification import accuracy_score 5 | from sklearn.tree import DecisionTreeClassifier 6 | from offline_methods import OfflineAlgorithmsWrapper 7 | 8 | 9 | class SEA: 10 | """ 11 | This class implements the SEA algorithm based on the article 12 | "A Streaming Ensemble Algorithm (SEA) for Large-Scale Classification" by W Nick Street and YongSeog Kim 13 | """ 14 | 15 | def __init__(self, n_estimators, base_estimator=None, scoring_method=None, list_classes=None): 16 | """ Constructor of SEA 17 | 18 | :param n_estimators: number of estimators in the ensemble 19 | :param base_estimator: instance of a classifier class (by default sklearn.tree.DecisionTreeClassifier()) 20 | 21 | """ 22 | if base_estimator is None: 23 | self.base_estimator = OfflineAlgorithmsWrapper(DecisionTreeClassifier()) 24 | else: 25 | self.base_estimator = base_estimator 26 | 27 | if scoring_method is None: 28 | self.scoring_method = accuracy_score 29 | else: 30 | self.scoring_method = scoring_method 31 | 32 | self.n_estimators = n_estimators 33 | 34 | self.list_classifiers = [] 35 | self.new_classifier = None 36 | self.classifier_to_evaluate = None 37 | self.list_classes = list_classes 38 | 39 | def update(self, X, y): 40 | """ Update the ensemble of models 41 | 42 | :param X: new batch X 43 | :param y: array of labels 44 | """ 45 | # retrieve list of different classes if it is the first time we fit data 46 | if self.list_classes is None: 47 | self.list_classes = np.unique(y) 48 | 49 | # train new classifier 50 | self.new_classifier = deepcopy(self.base_estimator) 51 | self.new_classifier.update(X, y) 52 | 53 | # if there is not enough classifiers, add the new classfier in the ensemble 54 | if len(self.list_classifiers) < self.n_estimators: 55 | self.list_classifiers.append(self.new_classifier) 56 | # otherwise, evaluate the (n_estimators + 1) estimators and remove the worst performing one 57 | else: 58 | if self.classifier_to_evaluate is None: 59 | self.classifier_to_evaluate = 
self.new_classifier 60 | else: 61 | # evaluate (n_estimators + 1) classifiers 62 | self.list_classifiers.append(self.classifier_to_evaluate) 63 | scores = [self.scoring_method(y, clf.predict(X)) for clf in self.list_classifiers] 64 | 65 | # remove the worst performing one 66 | self.list_classifiers.pop(int(np.argmin(scores))) 67 | 68 | def predict(self, X): 69 | """ Make the prediction 70 | 71 | :param X: examples to predict 72 | :return: the prediction y_predict 73 | """ 74 | # make the prediction for each classifier 75 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 76 | 77 | # for each class, count the number of times the class is predicted 78 | nb_votes_by_class = [] 79 | for c in self.list_classes: 80 | nb_votes_by_class.append(np.sum(predictions == c, axis=0)) 81 | 82 | # for each example, return the class which was predicted the most 83 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 84 | 85 | def predict_proba(self, X): 86 | """ Compute the probability of belonging to each class 87 | 88 | :param X: examples to predict 89 | :return: the probabilities, array of shape (n_examples, n_classes) 90 | """ 91 | # create empty array to retrieve 92 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 93 | 94 | # iterate over the classifiers and add the probabilities to the previous array 95 | for i, clf in enumerate(self.list_classifiers): 96 | array_probas[:, :, i] = clf.predict_proba(X) 97 | 98 | # compute and return the mean of probas computed by each classifier 99 | return array_probas.mean(axis=2) 100 | 101 | 102 | if __name__ == "__main__": 103 | from data_management import SEALoader, StreamGenerator 104 | from sklearn.svm import SVC 105 | from offline_methods import OfflineAlgorithmsWrapper 106 | # generate data 107 | loader = SEALoader('../data/sea.data') 108 | generator = StreamGenerator(loader) 109 | 110 | # model 111 | n_estimators = 5 112 | svc = OfflineAlgorithmsWrapper(SVC(probability=True)) 113 | clf = SEA(base_estimator=svc, n_estimators=n_estimators) 114 | 115 | for i, (X, y) in enumerate(generator.generate(batch_size=2000)): 116 | print("Batch #%d:" % i) 117 | # for the first batches, only update the model 118 | if i < n_estimators: 119 | print("update model\n") 120 | clf.update(X, y) 121 | else: 122 | # predict 123 | print("predict for current X") 124 | y_predict = clf.predict(X) 125 | # probas = clf.predict_proba(X) 126 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 127 | # after some time, labels are available 128 | print("update model\n") 129 | clf.update(X, y) 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /ensemble_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .SEA import SEA 2 | from .ddd import DDD, DiversityWrapper 3 | from .online_bagging import OnlineBagging 4 | from .DWM import DWM 5 | -------------------------------------------------------------------------------- /ensemble_methods/ddd.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | import numpy as np 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.linear_model import LogisticRegression 5 | from drift_detection_methods.spc import DDM 6 | from ensemble_methods.online_bagging import OnlineBagging 7 | 8 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1.e4} 9 | 10 | 11 | class 
DiversityWrapper: 12 | """ 13 | This is a wrapper for the learning algorithm used in ensemble methods by DDD. 14 | It allows to introduce high/low diversity during the training. 15 | Low diversity => lambda = 1 16 | High diversity => lambda =0.1 17 | """ 18 | def __init__(self, lambda_diversity=0.1, base_estimator=None, list_classes=None): 19 | """ 20 | 21 | :param lambda_diversity: Parameters of the Poisson distribution which introduce high/low diversity 22 | :param base_estimator: Estimators which is going to be used by this wrapper. 23 | :param list_classes: Number of classes to predict 24 | """ 25 | self.lambda_diversity = lambda_diversity 26 | if base_estimator is None: 27 | self.base_estimator = LogisticRegression(**PARAM_LOG_REG) 28 | else: 29 | self.base_estimator = base_estimator 30 | self.fitted = False # boolean which is True if base_estimator has been fit 31 | self.list_classes = list_classes 32 | 33 | def __create_diversity(self, X, y, lambda_diversity): 34 | """ 35 | :param X: 36 | :param y: 37 | :param lambda_diversity: 38 | :return: 39 | """ 40 | # Generate the number of time I want my classifier see the example 41 | X_training = None 42 | y_training = None 43 | while X_training is None and y_training is None: 44 | X_training = None 45 | y_training = None 46 | k = np.random.poisson(lambda_diversity, len(X)) 47 | while np.sum(k > 0): 48 | pos = np.where(k > 0) 49 | if X_training is None and y_training is None: 50 | X_training = X[pos] 51 | y_training = y[pos] 52 | else: 53 | X_pos = X[pos] 54 | y_pos = y[pos] 55 | if X_pos.shape[0] == 1: 56 | X_training = np.concatenate((X_training, X[pos].reshape((1, X[pos].shape[1]))), axis=0) 57 | else: 58 | X_training = np.concatenate((X_training, X[pos]), axis=0) 59 | y_training = np.vstack((y_training.reshape((-1, 1)), y_pos.reshape((-1, 1)))) 60 | # check if there is all classes pass to the fit methods 61 | k -= 1 62 | return X_training, y_training 63 | 64 | def __preprocess_X_and_y_fit(self, X, y): 65 | #TODO used only online algorithm with fit_partial method. 66 | """ 67 | Check if we have all the labels in the batch. 
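        If some class is absent from the Poisson-resampled batch, a dummy all-zero
        row is appended for each missing label so that the wrapped estimator sees
        every class during fit; otherwise X and y are returned unchanged (y is
        simply flattened to one dimension).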
68 | :param X: 69 | :param y: 70 | :return: 71 | """ 72 | y_values = np.unique(y) 73 | if len(y_values) == len(self.list_classes): 74 | return X, y.reshape((y.shape[0],)) 75 | else: 76 | for val in self.list_classes: 77 | if val not in y_values: 78 | X = np.concatenate((X, np.zeros((1, X.shape[1]))), axis=0) 79 | y = np.vstack((y.reshape((-1, 1)), val)) 80 | return X, y.reshape((y.shape[0],)) 81 | 82 | def update(self, X, y): 83 | """Fit the base_estimator, only if it has not been fitted already""" 84 | X_with_diversity, y_with_diversity = self.__create_diversity(X, y, self.lambda_diversity) 85 | X_with_diversity, y_with_diversity = self.__preprocess_X_and_y_fit(X_with_diversity, y_with_diversity) 86 | self.base_estimator.fit(X_with_diversity, y_with_diversity) 87 | 88 | def predict(self, X): 89 | return self.base_estimator.predict(X) 90 | 91 | def predict_proba(self, X): 92 | return self.base_estimator.predict_proba(X) 93 | 94 | class PrequentialMetrics: 95 | def __init__(self): 96 | self.acc = 1 97 | self.var = 0 98 | self.std = 0 99 | self.t = 0 # time_step 100 | self.t_drift = 0 # time step of the previous drift 101 | 102 | def update(self, y_pred, y_true, drift): 103 | """ 104 | Update the Prequential accuracy according to the section 5 of the DDD publication 105 | if drift 106 | acc(t) = acc_ex(t) 107 | else 108 | acc(t) = acc(t-1) + acc_ex(t)-acc(t-1)/(t -t_drift+1) 109 | :param y_pred: predicted labels 110 | :param y_true: real labels 111 | :param drift: A drift has been detected 112 | :return: 113 | """ 114 | 115 | number_of_time_steps = len(y_pred) # number of time steps in the batch 116 | self.t += number_of_time_steps # update the number of items seen 117 | good_predictions = np.sum(y_pred == y_true) 118 | batch_accuracy = good_predictions / number_of_time_steps 119 | 120 | if drift: 121 | self.acc = batch_accuracy 122 | self.var = self.acc * (1 - self.acc) / number_of_time_steps 123 | self.t_drift = self.t 124 | else: 125 | self.acc += (batch_accuracy - self.acc) / (self.t - self.t_drift + 1) 126 | self.var = self.acc * (1 - self.acc) / (self.t - self.t_drift + 1) 127 | 128 | self.std = np.sqrt(self.var) 129 | 130 | 131 | class DDD: 132 | def __init__(self, drift_detector=None, ensemble_method=None, W=0.1, pl=None, ph=None): 133 | ''' 134 | This class implements the DDD algorithms based on the article: 135 | MINKU, Leandro L. et YAO, Xin. DDD: A new ensemble approach for dealing with concept drift. IEEE transactions on 136 | knowledge and data engineering, 2012, vol. 24, no 4, p. 619-633. 
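        DDD maintains a low-diversity and a high-diversity ensemble. Before a drift
        is detected only the low-diversity ensemble makes the system's predictions;
        once the detector fires, the current ensembles are kept as "old" ensembles,
        fresh ones are created, and predictions become a weighted majority of the
        new low-diversity, old low-diversity and old high-diversity ensembles, each
        weighted by its prequential accuracy (the old low-diversity weight is
        further multiplied by W).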
137 | :param ensemble_method: online ensemble algorithm (LogisticRegression by default) 138 | :param drift_detector: drift detection method to use 139 | :param stream: data stream 140 | :param W: multiplier constant W for the weight of the old low diversity ensemble 141 | :param pl: parameters for ensemble learning with low diversity 142 | :param ph: parameters for ensemble learning with high diversity 143 | :param pd: parameters for drift detection method 144 | :return: 145 | ''' 146 | 147 | if drift_detector is None: 148 | self.drift_detector = DDM 149 | else: 150 | self.drift_detector = drift_detector 151 | if ensemble_method is None: 152 | self.ensemble_method = OnlineBagging 153 | else: 154 | self.ensemble_method = ensemble_method 155 | self.drift_detector = drift_detector() 156 | self.W = W 157 | self.pl = pl 158 | self.ph = ph 159 | 160 | # Parameters 161 | self.mode_before_drift = True # before drift 162 | self.drift = False 163 | self.low_diversity_learner, self.high_diversity_learner = self.__init_ensemble() 164 | self.old_low_diversity_learner = self.old_high_diversity_learner = None 165 | self.metric_ol, self.metric_oh, self.metric_nl, self.metric_nh = self.__init_metrics() 166 | self.woh = self.wol = self.wnl = 0 167 | self.y_pred = None 168 | 169 | def __weighted_majority(self, X, hnl, hol, hoh, wnl, wol, woh): 170 | ''' 171 | Weighted majority between all the learning algorithms. 172 | The new high diversity learning algorithm is not considered because it is likely to have low accuracy 173 | on the new concept. 174 | :param hnl: new low diversity learning algorithm 175 | :param hol: old low diversity learning algorithm 176 | :param hoh: old high diversity learning algorithm 177 | :param wnl: weights 178 | :param wol: weights 179 | :param woh: weights 180 | :return: 181 | ''' 182 | y_hnl = hnl.predict_proba(X) 183 | y_hol = hol.predict_proba(X) 184 | y_hoh = hoh.predict_proba(X) 185 | return self.__scores_to_single_label(wnl * y_hnl + wol * y_hol + woh * y_hoh) 186 | 187 | @staticmethod 188 | def __init_metrics(): 189 | metric_ol = PrequentialMetrics() 190 | metric_oh = PrequentialMetrics() 191 | metric_nl = PrequentialMetrics() 192 | metric_nh = PrequentialMetrics() 193 | return metric_ol, metric_oh, metric_nl, metric_nh 194 | 195 | def __init_ensemble(self): 196 | hnl = self.ensemble_method(**self.pl) # ensemble low diversity 197 | hnh = self.ensemble_method(**self.ph) # ensemble high diversity 198 | return hnl, hnh 199 | 200 | @staticmethod 201 | def __scores_to_single_label(scores): 202 | if len(scores.shape) == 1: 203 | return (scores > 0).astype(np.int) 204 | else: 205 | return scores.argmax(axis=1) 206 | 207 | def predict(self, X): 208 | # Before a drift is detected only the low ensemble is used for system prediction 209 | if self.mode_before_drift: 210 | y_pred = self.low_diversity_learner.predict(X) 211 | else: 212 | sum_acc = self.metric_nl.acc + self.metric_ol.acc * self.W + self.metric_oh.acc 213 | self.wnl = self.metric_nl.acc / sum_acc 214 | self.wol = self.metric_ol.acc * self.W / sum_acc 215 | self.woh = self.metric_oh.acc / sum_acc 216 | y_pred = self.__weighted_majority(X, self.low_diversity_learner, self.old_low_diversity_learner, 217 | self.old_high_diversity_learner, self.wnl, self.wol, self.woh) 218 | self.y_pred = y_pred 219 | return y_pred 220 | 221 | def __drift_detection(self, X, y_true): 222 | # Not done in the paper but seems to be the proper position for the update 223 | self.metric_nl.update(self.y_pred, y_true, self.drift) 224 | 
self.metric_nh.update(self.high_diversity_learner.predict(X), y_true, self.drift) 225 | if not self.mode_before_drift: 226 | self.metric_oh.update(self.old_high_diversity_learner.predict(X), y_true, self.drift) 227 | self.metric_ol.update(self.old_low_diversity_learner.predict(X), y_true, self.drift) 228 | 229 | # Boolean == True if drift detect 230 | self.drift = self.drift_detector.drift_detection(y_true, self.y_pred) 231 | 232 | if self.drift: 233 | # The old low diversity ensemble after the second drift detection can be either 234 | # the same as the old high diversity learning with low diversity 235 | # after the first detection or the ensemble corresponding 236 | # to the new low diversity after the first drift detection depending 237 | # on which of them is the most accurate. 238 | if self.mode_before_drift or (not self.mode_before_drift and self.metric_nl.acc > self.metric_oh.acc): 239 | self.old_low_diversity_learner = self.low_diversity_learner 240 | self.metric_ol = self.metric_nl # Not said in the paper but make sense. 241 | else: 242 | self.old_low_diversity_learner = self.old_high_diversity_learner 243 | self.metric_ol = self.metric_oh # Not said in the paper but make sense. 244 | 245 | # The ensemble corresponding to the high diversity is registered as old 246 | self.old_high_diversity_learner = self.high_diversity_learner 247 | self.metric_oh = self.metric_nh # Not said in the paper but make sense. 248 | 249 | # After a drift is detected new low and high diversity ensemble are created 250 | self.low_diversity_learner, self.high_diversity_learner = self.__init_ensemble() 251 | # In the paper all the metrics are set to zero. Which is impossible in the predict method we divide 252 | # by 0. 253 | _, _, self.metric_nl, self.metric_nh = self.__init_metrics() 254 | self.mode_before_drift = False # After drift 255 | # if after drift 256 | if not self.mode_before_drift: 257 | if self.metric_nl.acc > self.metric_oh.acc and self.metric_nl.acc > self.metric_ol.acc: 258 | self.mode_before_drift = True 259 | elif self.metric_oh.acc - self.metric_oh.std > self.metric_nl.acc + self.metric_nl.std \ 260 | and self.metric_oh.acc - self.metric_oh.std > self.metric_ol.acc + self.metric_ol.std: 261 | self.low_diversity_learner = deepcopy(self.old_high_diversity_learner) 262 | self.metric_nl = deepcopy(self.metric_oh) 263 | self.mode_before_drift = True 264 | 265 | def update(self, X, y_true): 266 | # If we have never done predictions we cannot detect if there was a drift. 
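        # The detector compares the labels of this batch with the predictions made
        # for it in predict(), and the prequential metrics of every active ensemble
        # are refreshed at the same time. Whatever the outcome, both current
        # ensembles are then trained on the batch; the old ensembles keep learning
        # as well while the system is in "after drift" mode.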
267 | if self.y_pred is not None: 268 | self.__drift_detection(X, y_true) 269 | self.low_diversity_learner.update(X, y_true) 270 | self.high_diversity_learner.update(X, y_true) 271 | if not self.mode_before_drift: 272 | self.old_low_diversity_learner.update(X, y_true) 273 | self.old_high_diversity_learner.update(X, y_true) 274 | 275 | if __name__ == "__main__": 276 | from data_management.StreamGenerator import StreamGenerator 277 | from data_management.DataLoader import KDDCupLoader, SEALoader 278 | from sklearn.linear_model import SGDClassifier 279 | 280 | # generate data 281 | loader = SEALoader('../data/sea.data', percentage_historical_data=0.1) 282 | generator = StreamGenerator(loader) 283 | # kdd_data_loader = KDDCupLoader('../data/kddcup.data_10_percent') 284 | # generator = StreamGenerator(kdd_data_loader) 285 | 286 | # model 287 | clf = OnlineBagging 288 | p_estimators = None 289 | n_classes = np.array(range(0, 2)) 290 | p_clf_high = {'lambda_diversity': 0.1, 291 | 'n_classes': n_classes, 292 | 'n_estimators': 25, 293 | 'base_estimator': SGDClassifier, 294 | } 295 | p_clf_low = {'lambda_diversity': 1, 296 | 'n_classes': n_classes, 297 | 'n_estimators': 25, 298 | 'base_estimator': SGDClassifier, 299 | } 300 | ddd = DDD(ensemble_method=clf, drift_detector=DDM, pl=p_clf_low, ph=p_clf_high) 301 | batch = 3000 302 | X_historical, y_historical = generator.get_historical_data() 303 | ddd.update(X_historical, y_historical) 304 | for i, (X, y_true) in enumerate(generator.generate(batch_size=batch)): 305 | y_pred = ddd.predict(X) 306 | print("Accuracy score: %0.2f" % accuracy_score(y_true, y_pred)) 307 | # after some time, labels are available 308 | print("update model\n") 309 | ddd.update(X, y_true) 310 | 311 | -------------------------------------------------------------------------------- /ensemble_methods/online_bagging.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import SGDClassifier 3 | 4 | PARAM_LOG_REG = {'solver': 'sag', 'tol': 1e-1, 'C': 1.e4} 5 | 6 | class OnlineBagging: 7 | def __init__(self, lambda_diversity=0.1, n_estimators=25, base_estimator=None, p_estimators=None, 8 | n_classes=None): 9 | ''' 10 | Online Bagging similar to offline Bagging but introduce the low and high diversity during the bagging 11 | :param lambda_diversity: low lambda diversity allows high diversity ensemble whereas high lambda_diversity 12 | induces low diversity. 13 | :param n_estimators: number of estimators for the bagging 14 | :param base_estimator: Online learning algorithm it should implements partial fit. 15 | The default value is SGDClassifer. 
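        Each of the n_estimators copies is trained with partial_fit on a
        Poisson(lambda_diversity)-resampled view of the incoming batch, which is
        what makes the ensemble more (small lambda) or less (large lambda) diverse.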
16 | :param p_estimators: Parameters of the online learning algorithms 17 | :param n_classes: number of classes you need to pass a list of classes 18 | ''' 19 | if base_estimator is None: 20 | self.base_estimator = SGDClassifier 21 | else: 22 | self.base_estimator = base_estimator 23 | 24 | self.lambda_diversity = lambda_diversity 25 | if p_estimators is not None: 26 | self.list_classifiers = [self.base_estimator(**p_estimators) for _ in range(n_estimators)] 27 | else: 28 | self.list_classifiers = [self.base_estimator() for _ in range(n_estimators)] 29 | 30 | self.list_classes = n_classes 31 | 32 | def update(self, X, y): 33 | """ Update the ensemble of models 34 | 35 | :param X: new single X 36 | :param y: new single y 37 | """ 38 | # retrieve list of different classes if it is the first time we fit data 39 | if self.list_classes is None: 40 | self.list_classes = np.unique(y) 41 | for classifier in self.list_classifiers: 42 | # Generate the number of time I want my classifier see the example 43 | k = np.random.poisson(self.lambda_diversity, len(X)) 44 | X_training = None 45 | y_training = None 46 | while np.sum(k > 0): 47 | pos = np.where(k > 0) 48 | if X_training is None and y_training is None: 49 | X_training = X[pos] 50 | y_training = y[pos] 51 | else: 52 | X_pos = X[pos] 53 | y_pos = y[pos] 54 | if X_pos.shape[0] == 1: 55 | X_training = np.concatenate((X_training, X[pos].reshape((1, X[pos].shape[1]))), axis=0) 56 | else: 57 | X_training = np.concatenate((X_training, X[pos]), axis=0) 58 | y_training = np.vstack((y_training.reshape((-1, 1)), y_pos.reshape((-1, 1)))) 59 | 60 | # check if there is all classes pass to the fit methods 61 | k -= 1 62 | if X_training is not None and y_training is not None: 63 | y_training = y_training.reshape((y_training.shape[0],)) 64 | classifier.partial_fit(X_training, y_training, self.list_classes) 65 | 66 | def predict(self, X): 67 | """ Make the prediction 68 | 69 | :param X: examples to predict 70 | :return: the prediction y_predict 71 | """ 72 | # make the prediction for each classifier 73 | predictions = np.array([clf.predict(X).tolist() for clf in self.list_classifiers]) 74 | 75 | # for each class, count the number of times the class is predicted 76 | nb_votes_by_class = [] 77 | for c in self.list_classes: 78 | nb_votes_by_class.append(np.sum(predictions == c, axis=0)) 79 | 80 | # for each example, return the class which was predicted the most 81 | return self.list_classes[np.argmax(nb_votes_by_class, axis=0)] 82 | 83 | def predict_proba(self, X): 84 | """ Compute the probability of belonging to each class 85 | 86 | :param X: examples to predict 87 | :return: the probabilities, array of shape (n_examples, n_classes) 88 | """ 89 | # create empty array to retrieve 90 | array_probas = np.zeros((len(X), len(self.list_classes), len(self.list_classifiers))) 91 | 92 | # iterate over the classifiers and add the probabilities to the previous array 93 | for i, clf in enumerate(self.list_classifiers): 94 | array_probas[:, :, i] = clf.predict_proba(X) 95 | 96 | # compute and return the mean of probas computed by each classifier 97 | return array_probas.mean(axis=2) 98 | 99 | 100 | if __name__ == "__main__": 101 | from data_management.StreamGenerator import StreamGenerator 102 | from data_management.DataLoader import SEALoader 103 | from sklearn.metrics import accuracy_score 104 | np.random.seed(3) 105 | # generate data 106 | loader = SEALoader('../data/sea.data') 107 | generator = StreamGenerator(loader) 108 | 109 | # model 110 | n_classes = np.array(range(0, 
2)) 111 | clf = OnlineBagging(base_estimator=SGDClassifier, lambda_diversity=0.1, n_classes=n_classes, n_estimators=25, 112 | p_estimators=None) 113 | X_histo, y_histo = generator.get_historical_data() 114 | clf.update(X_histo, y_histo) 115 | for i, (X, y) in enumerate(generator.generate(batch_size=50)): 116 | print("Batch #%d:" % i) 117 | # predict 118 | print("predict for current X") 119 | y_predict = clf.predict(X) 120 | # probas = clf.predict_proba(X) 121 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 122 | # after some time, labels are available 123 | print("update model\n") 124 | clf.update(X, y) 125 | -------------------------------------------------------------------------------- /offline_methods/OfflineAlgorithmsWrapper.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import ClassifierMixin 2 | 3 | 4 | class OfflineAlgorithmsWrapper: 5 | """ Wrapper on Scikit-learn classifiers, to use offline algorithms inside the project """ 6 | 7 | def __init__(self, base_estimator): 8 | """ Constructor of OfflineAlgorithmsWrapper 9 | 10 | :param base_estimator: instance of a classifier of scikit-learn (must be an instance of a subclass of sklearn.base.ClassifierMixin) 11 | """ 12 | self.base_estimator = base_estimator 13 | self._check_base_estimator() 14 | 15 | self.fitted = False # boolean which is True if base_estimator has been fit 16 | 17 | def _check_base_estimator(self): 18 | """ Raise a ValueError if base_estimator is not suitable for the project """ 19 | if not isinstance(self.base_estimator, ClassifierMixin): 20 | raise ValueError( 21 | "In constructor of OfflineAlgorithmsWrapper, base_estimator should be an instance of a subclass of sklearn.base.ClassifierMixin") 22 | 23 | def update(self, X, y): 24 | """ Fit the base_estimator, only if it has not been fitted already""" 25 | if not self.fitted: 26 | self.base_estimator.fit(X, y) 27 | self.fitted = True 28 | 29 | def predict(self, X): 30 | return self.base_estimator.predict(X) 31 | 32 | def predict_proba(self, X): 33 | return self.base_estimator.predict_proba(X) 34 | 35 | 36 | # Code example to test OfflineAlgorithmsWrapper 37 | if __name__ == "__main__": 38 | import numpy as np 39 | from data_management import SEALoader, StreamGenerator 40 | from sklearn.ensemble import RandomForestClassifier 41 | 42 | # generate data 43 | print("Get data...") 44 | loader = SEALoader('../data/sea.data') 45 | generator = StreamGenerator(loader) 46 | X_train, y_train = generator.get_historical_data() 47 | X_test, y_test = generator.generate(batch_size=1000).__next__() 48 | 49 | # create some models 50 | clf = OfflineAlgorithmsWrapper(RandomForestClassifier(n_estimators=100)) 51 | 52 | # fit models 53 | print("\nValue of self.fitted: %d" % clf.fitted) 54 | print("First call of update()...") 55 | clf.update(X_train, y_train) 56 | print("Value of self.fitted: %d" % clf.fitted) 57 | 58 | # predict 59 | print("\nPrediction of classes...") 60 | y_predict1 = clf.predict(X_test) 61 | 62 | # predict_proba 63 | print("\nPrediction of probabilities...") 64 | y_probas1 = clf.predict_proba(X_test) 65 | 66 | # try to update on X_test, y_test 67 | print("\nSecond call of update()...") 68 | clf.update(X_test, y_test) 69 | 70 | # second prediction 71 | print("\nSecond prediction of probabilities...") 72 | y_probas2 = clf.predict_proba(X_test) 73 | 74 | # comparison of probabilities computed after first and second call to update 75 | probs_are_equal = np.all(y_probas1 == y_probas2) 76 | if 
probs_are_equal: 77 | print("\nProbabilities are equal after first and second call to update") 78 | print(" -> OfflineAlgorithmsWrapper works as intended") 79 | else: 80 | print("\nProbabilities are different after first and second call to update") 81 | print(" -> OfflineAlgorithmsWrapper doesn't work as intended") 82 | -------------------------------------------------------------------------------- /offline_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .OfflineAlgorithmsWrapper import OfflineAlgorithmsWrapper -------------------------------------------------------------------------------- /training_windows_methods/AdaptiveSVC.py: -------------------------------------------------------------------------------- 1 | """ Adaptive SVC algorithm """ 2 | 3 | import numpy as np 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.svm import SVC 6 | 7 | 8 | class AdaptiveSVC: 9 | """ 10 | This class implements the adaptive SVM algorithm based on the article 11 | "Detecting Concept Drift with Support Vector Machines" by Ralf Klinkenberg and Thorsten Joachims 12 | """ 13 | def __init__(self, memory_limit=10000, **svc_kwargs): 14 | """ 15 | Adaptive SVC constructor 16 | :parameter memory_limit: maximum number of rows of data kept in memory, integer 17 | :parameter svc_kwargs: kwargs to give to the SVC classifiers, kwargs 18 | /!\ Only the linear kernel is currently supported, so kernel-related kwargs will not be handled as expected 19 | """ 20 | self.memory = {'X': list(), 'y': list()} 21 | self.memory_limit = memory_limit 22 | self.memory_current_size = 0 23 | self.windows_in_memory = 0 24 | self.previous_best_window = 0 25 | self.svc_kwargs = svc_kwargs 26 | self.classifiers = list() 27 | self.training_set_sizes = list() 28 | self.xi_alpha_estimators = list() 29 | self.predicting_classifier = None 30 | 31 | def _add_new_batch_to_memory(self, X, y, batch_size): 32 | """ 33 | Add a new batch of data to the memory 34 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 35 | :parameter y: labels of the batch of data, numpy.array, shape (n,1) 36 | :parameter batch_size: size of the batch, or len(y), integer == n 37 | """ 38 | 39 | if self.memory_current_size + batch_size > self.memory_limit: 40 | # It is not possible to store the last batch without exceeding the memory limit 41 | # So let us forget some of the oldest data 42 | number_of_data_to_forget = self.memory_current_size + batch_size - self.memory_limit 43 | oldest_batch_size = len(self.memory['X'][0]) 44 | if number_of_data_to_forget > oldest_batch_size: 45 | # More than the oldest batch has to be forgotten 46 | # Let us pop this oldest batch and remove the necessary amount of data from the second oldest batch 47 | self.memory['X'].pop(0) 48 | self.memory['y'].pop(0) 49 | self.memory_current_size -= oldest_batch_size 50 | number_of_data_to_forget -= oldest_batch_size 51 | self.windows_in_memory -= 1 52 | self.memory['X'][0] = self.memory['X'][0][number_of_data_to_forget:] 53 | self.memory['y'][0] = self.memory['y'][0][number_of_data_to_forget:] 54 | self.memory_current_size -= number_of_data_to_forget 55 | 56 | if self.memory['X'][0].size == 0: 57 | # The entire oldest batch has been forgotten and is now empty 58 | # Let us remove it from the memory 59 | self.memory['X'].pop(0) 60 | self.memory['y'].pop(0) 61 | self.windows_in_memory -= 1 62 | 63 | assert(self.memory_current_size + batch_size <= self.memory_limit) 64 | 65 | # Add the entire new batch of data 
to memory 66 | self.memory['X'].append(X) 67 | self.memory['y'].append(y) 68 | self.memory_current_size += batch_size 69 | self.windows_in_memory += 1 70 | 71 | def _svc_fit_on_window(self, window): 72 | """ 73 | Return an SVC classifier fitted on the given window, together with the size of its training set 74 | :parameter window: number of batches to use from memory for learning, 0 < integer <= self.windows_in_memory 75 | """ 76 | if window is not None: 77 | X_train = np.concatenate(self.memory['X'][-window:], axis=0) 78 | y_train = np.concatenate(self.memory['y'][-window:], axis=0) 79 | return SVC(**self.svc_kwargs, kernel='linear').fit(X_train, y_train), len(y_train) 80 | else: 81 | return None 82 | 83 | def _compute_xi_alpha_estimators(self, X, y, batch_size): 84 | xi_alpha_estimators = list() 85 | # Compute R 86 | gram_X = X.dot(X.T) 87 | diag_gram_X = np.diag(gram_X).reshape(len(gram_X), 1) 88 | R = np.max(diag_gram_X - gram_X) 89 | for classifier, training_set_size in zip(self.classifiers, self.training_set_sizes): 90 | # Compute Xi 91 | w_opt = classifier.coef_ 92 | b_opt = classifier.intercept_ 93 | xi = np.zeros(shape=(batch_size, 1)) 94 | for data_index, (X_data_index, y_data_index) in enumerate(zip(X, y)): 95 | xi[data_index] = max(1 - float(y_data_index * (w_opt.dot(X_data_index) + b_opt)), 0) 96 | # Compute alpha (the current batch occupies the last batch_size rows of the training set) 97 | alpha = np.zeros(shape=(batch_size, 1)) 98 | for support_vector_idx, alpha_coef in zip(classifier.support_, np.abs(classifier.dual_coef_[0])): 99 | alpha_idx = support_vector_idx - (training_set_size - batch_size) 100 | if alpha_idx >= 0: 101 | alpha[alpha_idx] = alpha_coef 102 | # Compute xi-alpha estimator 103 | xi_alpha_estimators.append(np.sum((2 * alpha * R + xi) >= 1).astype(int) / batch_size) 104 | return xi_alpha_estimators 105 | 106 | def _update_memory_according_to_best_window(self, window, batch_size): 107 | """ 108 | Remove unnecessary data from memory according to the chosen best window for learning 109 | :parameter window: chosen best window for learning, 0 < integer <= self.windows_in_memory 110 | :parameter batch_size: size of the current batch, integer 111 | """ 112 | if window < self.windows_in_memory: 113 | oldest_batch_size = len(self.memory['X'][0]) 114 | self.memory['X'] = self.memory['X'][-window:] 115 | self.memory['y'] = self.memory['y'][-window:] 116 | self.memory_current_size -= oldest_batch_size 117 | if window < self.windows_in_memory - 1: 118 | self.memory_current_size -= (self.windows_in_memory - 1 - window) * batch_size 119 | self.windows_in_memory = window 120 | 121 | def update(self, X, y): 122 | """ 123 | Update the model with the batch given as argument 124 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 125 | :parameter y: labels of the batch of data, numpy.array, shape (n,1) 126 | """ 127 | # Add this new batch to the memory 128 | batch_size = len(y) 129 | self._add_new_batch_to_memory(X, y, batch_size) 130 | 131 | # Learn on the different windows 132 | self.classifiers = list() 133 | self.training_set_sizes = list() 134 | training_windows = range(1, (self.previous_best_window + 1) + 1) 135 | 136 | for training_window in training_windows: 137 | window_is_admissible = 0 < training_window <= self.windows_in_memory 138 | if window_is_admissible: 139 | clf, training_set_size = self._svc_fit_on_window(training_window) 140 | self.classifiers.append(clf) 141 | self.training_set_sizes.append(training_set_size) 142 | 143 | # Compute the xi alpha estimators 144 | self.xi_alpha_estimators = self._compute_xi_alpha_estimators(X, y, batch_size) 145 | 146 | # Keep 
the classifier which has the best xi-alpha estimator 147 | best_classifier_index = np.argmin(self.xi_alpha_estimators) 148 | 149 | # Update self.previous_best_window 150 | assert(0 < training_windows[best_classifier_index] <= self.windows_in_memory) 151 | best_window = training_windows[best_classifier_index] 152 | self.previous_best_window = best_window 153 | 154 | # Update memory 155 | self._update_memory_according_to_best_window(best_window, batch_size) 156 | 157 | # Update predicting classifier 158 | self.predicting_classifier = self.classifiers[best_classifier_index] 159 | 160 | def predict(self, X): 161 | """ 162 | Predict the labels associated with the data given as argument 163 | :parameter X: batch of unlabelled data, numpy.array, shape (n,m) 164 | """ 165 | # Predict with the predicting classifier 166 | if self.predicting_classifier is not None: 167 | return self.predicting_classifier.predict(X) 168 | else: 169 | return np.zeros(shape=(len(X), 1)) 170 | 171 | 172 | if __name__ == "__main__": 173 | from data_management import SEALoader, StreamGenerator 174 | 175 | # generate data 176 | sea_loader = SEALoader('data/sea.data') 177 | sea_generator = StreamGenerator(sea_loader) 178 | 179 | # model 180 | clf = AdaptiveSVC(memory_limit=5000, C=100) 181 | 182 | for i, (X, y) in enumerate(sea_generator.generate(batch_size=2000)): 183 | print("\nBatch #%d:" % i) 184 | print("Update model") 185 | clf.update(X, y) 186 | print("clf.previous_best_window:", clf.previous_best_window) 187 | print("clf.training_set_sizes:", clf.training_set_sizes) 188 | print("clf.xi_alpha_estimators:", clf.xi_alpha_estimators) 189 | print("clf.windows_in_memory:", clf.windows_in_memory) 190 | print("clf.memory_current_size:", clf.memory_current_size) 191 | print("clf.memory_limit:", clf.memory_limit) 192 | # predict 193 | print("Predict for current X") 194 | y_predict = clf.predict(X) 195 | print("Accuracy score: %0.2f" % accuracy_score(y, y_predict)) 196 | if i > 9: 197 | break 198 | -------------------------------------------------------------------------------- /training_windows_methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .AdaptiveSVC import AdaptiveSVC -------------------------------------------------------------------------------- /training_windows_methods/test_AdaptiveSVC.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from data_management.DataLoader import SEALoader 4 | from data_management.StreamGenerator import StreamGenerator 5 | from training_windows_methods.AdaptiveSVC import AdaptiveSVC 6 | 7 | 8 | class TestAdaptiveSVC(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.sea_loader = SEALoader('data/sea.data') 12 | self.sea_generator = StreamGenerator(self.sea_loader) 13 | 14 | def test_memory_manager(self): 15 | # model 16 | clf = AdaptiveSVC(memory_limit=500, C=100) 17 | 18 | for i, (X, y) in enumerate(self.sea_generator.generate(batch_size=200)): 19 | if i < 5: 20 | clf.update(X, y) 21 | if i == 0: 22 | self.assertEqual(clf.previous_best_window, 1) 23 | self.assertEqual(clf.memory_current_size, 200) 24 | self.assertEqual(len(clf.memory['y']), 1) 25 | if i == 1: 26 | self.assertEqual(clf.previous_best_window, 2) 27 | self.assertEqual(clf.memory_current_size, 400) 28 | self.assertEqual(len(clf.memory['y']), 2) 29 | if i == 2: 30 | self.assertEqual(clf.previous_best_window, 3) 31 | self.assertEqual(clf.memory_current_size, 500) 32 | self.assertEqual(len(clf.memory['y']), 3) 33 | if 
i == 3: 34 | self.assertEqual(clf.previous_best_window, 3) 35 | self.assertEqual(clf.memory_current_size, 500) 36 | self.assertEqual(len(clf.memory['y']), 3) 37 | if i == 4: 38 | self.assertEqual(clf.previous_best_window, 3) 39 | self.assertEqual(clf.memory_current_size, 500) 40 | self.assertEqual(len(clf.memory['y']), 3) 41 | else: 42 | break 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | --------------------------------------------------------------------------------
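A minimal sketch of the Poisson resampling idea behind OnlineBagging's lambda_diversity parameter: every base learner sees each incoming example k times, with k drawn from Poisson(lambda_diversity), so a small lambda_diversity gives each learner a sparse, dissimilar view of the stream (high ensemble diversity) while a large lambda_diversity makes the learners' training sets more alike (low diversity). The helper name and the toy data below are illustrative only, not part of the repository.

import numpy as np
from sklearn.linear_model import SGDClassifier

def poisson_resample(X, y, lam, rng):
    """Repeat each example k times, with k drawn independently from Poisson(lam)."""
    k = rng.poisson(lam, size=len(X))      # one count per example
    idx = np.repeat(np.arange(len(X)), k)  # duplicate index i exactly k[i] times
    return X[idx], y[idx]

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

clf = SGDClassifier()
X_rs, y_rs = poisson_resample(X, y, lam=0.1, rng=rng)
if len(y_rs) > 0:  # with lam=0.1 the resampled batch can be empty
    clf.partial_fit(X_rs, y_rs, classes=np.array([0, 1]))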
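AdaptiveSVC selects its training window with Joachims' xi-alpha estimator of the leave-one-out error: a training example is counted as a potential leave-one-out error when 2*alpha_i*R^2 + xi_i >= 1, where alpha_i is its dual coefficient, xi_i its slack, and R^2 bounds the differences of kernel values. The sketch below computes that quantity for a fitted linear SVC under the assumption that the labels are in {-1, +1}; it is an illustration of the estimator, not the project's implementation.

import numpy as np
from sklearn.svm import SVC

def xi_alpha_error_estimate(clf, X, y):
    """Joachims' xi-alpha bound on the leave-one-out error of a fitted linear SVC (y in {-1, +1})."""
    gram = X.dot(X.T)
    R2 = np.max(np.diag(gram)[:, None] - gram)                 # bound on K(x, x) - K(x, x')
    xi = np.maximum(0.0, 1.0 - y * clf.decision_function(X))   # slack variables
    alpha = np.zeros(len(X))
    alpha[clf.support_] = np.abs(clf.dual_coef_[0])            # dual variables of the support vectors
    return np.mean(2.0 * alpha * R2 + xi >= 1.0)

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 2))
y = np.where(X[:, 0] + 0.3 * rng.normal(size=100) > 0, 1, -1)
clf = SVC(kernel='linear', C=100).fit(X, y)
print("xi-alpha error estimate: %.3f" % xi_alpha_error_estimate(clf, X, y))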
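All of the methods above expose the same two-method interface, update(X, y) and predict(X), which is what lets them be compared on a common stream. A minimal test-then-train loop over that interface might look like the sketch below; synthetic_batches is a hypothetical stand-in for the project's SEALoader/StreamGenerator pair, and the SEA-like labelling rule is only there to make the example self-contained.

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from ensemble_methods.online_bagging import OnlineBagging

rng = np.random.default_rng(2)

def synthetic_batches(n_batches=10, batch_size=500):
    """Hypothetical stand-in for StreamGenerator.generate(): yields (X, y) batches."""
    for _ in range(n_batches):
        X = rng.uniform(0, 10, size=(batch_size, 3))
        y = (X[:, 0] + X[:, 1] > 8).astype(int)  # SEA-like concept
        yield X, y

model = OnlineBagging(lambda_diversity=1.0, n_estimators=10,
                      base_estimator=SGDClassifier, n_classes=np.array([0, 1]))

for i, (X, y) in enumerate(synthetic_batches()):
    if i > 0:  # test-then-train: evaluate on the new batch before updating on it
        print("Batch #%d accuracy: %.2f" % (i, accuracy_score(y, model.predict(X))))
    model.update(X, y)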
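The memory bookkeeping in AdaptiveSVC keeps only the most recent batches so that no more than memory_limit rows are ever stored, dropping the oldest rows first. A simplified sketch of that forgetting policy on plain Python lists of arrays is shown below; trim_to_limit is an illustrative helper, not a method of the class.

import numpy as np

def trim_to_limit(batches_X, batches_y, memory_limit):
    """Drop the oldest rows until the total number of stored rows fits within memory_limit."""
    total = sum(len(y) for y in batches_y)
    while total > memory_limit and batches_y:
        overflow = total - memory_limit
        oldest = len(batches_y[0])
        if oldest <= overflow:
            batches_X.pop(0)                        # forget the whole oldest batch
            batches_y.pop(0)
            total -= oldest
        else:
            batches_X[0] = batches_X[0][overflow:]  # forget only the front of the oldest batch
            batches_y[0] = batches_y[0][overflow:]
            total -= overflow
    return batches_X, batches_y

# e.g. three batches of 200 rows with a limit of 500 keeps only the most recent 500 rows
batches_X = [np.zeros((200, 3)) for _ in range(3)]
batches_y = [np.zeros(200) for _ in range(3)]
trim_to_limit(batches_X, batches_y, memory_limit=500)
print(sum(len(y) for y in batches_y))  # 500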