├── .gitignore
├── LICENSE
├── README.md
└── adaboost
    ├── __init__.py
    ├── algorithm.py
    ├── classifiers
    │   ├── __init__.py
    │   ├── binary.py
    │   ├── classifier.py
    │   └── linear.py
    ├── features
    │   ├── __init__.py
    │   ├── feature.py
    │   └── simple.py
    └── utilities.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 L. Nathan Perkins

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
adaboost
========

AdaBoost

An AdaBoost (adaptive boosting) implementation built around simple threshold
classifiers, based on a linear regression trainer and feature selection class
initially developed to help analyze and make predictions for the MIT Big Data
Challenge. The trainer evaluates a pool of weak classifiers (by default, one
linear threshold classifier per column of the data matrix) and iteratively
adds the weak classifier that best minimizes the weighted training error.
When complete, the instance holds the selected weak classifiers and their
weights (alphas), which together form the boosted classifier.

Dependencies: Python 2.7, numpy

Usage:

    import adaboost

    t = adaboost.AdaBoost()

    # print detailed debugging information regarding the classifier selection
    t.debug = 2

    # train classifier
    t.train(x, y)  # x is a matrix, y is a vector of actual classifications (-1 or 1)

    # classify a novel set of values; the sign of each returned value is the predicted binary class
    novel_y_prime = t.apply_to_matrix(novel_x)

Methods
-------

The following methods are available for instances of the AdaBoost class.

* `train(x, y)` Will begin training on matrix x and classification set y, where y contains
  binary classification data (either 0/1 or -1/1). The system will evaluate potential weak
  classifiers and iteratively add the weak classifier that best minimizes the weighted
  error.

* `apply_to_matrix(p_x)` Applies the selected weak classifiers to novel values and
  returns a vector with the predicted classifications. The sign of each returned value
  is the predicted class, and its magnitude reflects the confidence of the prediction.
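For example, to turn the raw scores returned by `apply_to_matrix` into hard -1/1
labels (a minimal sketch; `novel_x` is assumed to be a numpy matrix with the same
columns as the training data):

    import numpy as np

    scores = t.apply_to_matrix(novel_x)
    labels = np.sign(scores)  # -1 or 1 per row; exactly 0 means the weighted votes cancelled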
Attributes
----------

The following attributes are available for instances of the AdaBoost class.

* `debug` Allows printing of information about the training process. Can be 0 (no
  debugging), 1 (minimal debugging) or 2 (detailed debugging). Minimal debugging prints
  each weak classifier as it is added, while detailed debugging also prints every
  candidate classifier evaluated along with per-iteration statistics.

* `max_iterations` The maximum number of weak classifiers to use.

* `target_error` The target error for the training data set. Once the training error is
  less than this value, the algorithm will stop.

--------------------------------------------------------------------------------
/adaboost/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

__author__ = 'nathan'

"""
Dependency: numpy
"""

from algorithm import AdaBoost

--------------------------------------------------------------------------------
/adaboost/algorithm.py:
--------------------------------------------------------------------------------
import features
import copy
import numpy as np


class AdaBoost:
    def __init__(self):
        self.debug = 0
        self.features = []
        self.classifiers = []
        self.max_iterations = 5
        self.target_error = 0.01

        # data
        self.data = None
        self.actual = None

        # training weights
        self.weights = None

        # trained parameters
        self.selected_classifiers = None
        self.alphas = None

    def seed_features(self):
        # discover one feature per column if no features were configured
        if 0 == len(self.features):
            self.features = features.Simple.discover_features(self.data)

    def seed_classifiers(self):
        # build the pool of candidate weak classifiers from the features
        if 0 == len(self.classifiers):
            for f in self.features:
                self.classifiers.extend(f.get_classifiers())

    def _evaluate_classifier(self, classifier):
        # fit the classifier to the current weights and read back its quality
        classifier.ready_data(self.data, self.actual, self.weights)

        if 1 < self.debug:
            print 'Eval: ', classifier.describe(), ' (precision: ', classifier.precision, '; error: ', classifier.error, ')'

        return classifier.precision, classifier.error

    def _train_iteration(self):
        # classifier count
        classifier_count = len(self.classifiers)

        # track the best candidate (highest precision, i.e., error farthest from 0.5)
        best_precision = None
        best_error = None
        best_j = None
        for j in xrange(0, classifier_count):
            # get precision
            precision, error = self._evaluate_classifier(self.classifiers[j])

            # best?
            if best_precision is None or precision > best_precision:
                best_precision = precision
                best_error = error
                best_j = j

        # a classifier with error above 0.5 is used inverted (negative alpha)
        if 0.5 < best_error:
            invert = -1
            best_error = 1 - best_error
        else:
            invert = 1

        # guard against a perfect classifier (the log and the weight update
        # below would otherwise divide by zero)
        best_error = max(best_error, 1e-10)

        # new classifier
        classifier = copy.copy(self.classifiers[best_j])
        alpha = 0.5 * invert * np.log((1 - best_error) / best_error)

        # print classifier
        if 0 < self.debug:
            print 'Added: ', classifier.describe(), ' (alpha = ', alpha, ')'

        # add alpha
        self.alphas.append(alpha)
        self.selected_classifiers.append(classifier)

        # get mistakes
        mistakes = (classifier.classify_data(self.data) != self.actual)

        if 0 > invert:
            mistakes = ~mistakes

        sum_of_weights = np.sum(self.weights * mistakes)
        num_actual_0 = np.sum(self.actual[mistakes] == -1)
        num_actual_1 = np.sum(self.actual[mistakes] == 1)

        if 1 < self.debug:
            print 'Incorrect: ', np.sum(mistakes), '; Correct: ', np.sum(~mistakes)
            print 'False positives: ', num_actual_0, '; False negatives: ', num_actual_1
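        # Why this scaling works: with weighted error e = sum_of_weights, the
        # standard AdaBoost update multiplies each weight by exp(-alpha * y * h(x))
        # and renormalizes. For the optimal alpha = 0.5 * ln((1 - e) / e), that is
        # equivalent to scaling misclassified examples by 0.5 / e and correct ones
        # by 0.5 / (1 - e), so the mistakes end up carrying exactly half the total
        # weight. For example, with e = 0.2, mistakes are scaled by 0.5 / 0.2 = 2.5,
        # correct examples by 0.5 / 0.8 = 0.625, and the weights still sum to 1.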
        # reweight: mistakes get half the total weight, correct examples the other half
        self.weights[mistakes] *= 0.5 / sum_of_weights
        self.weights[~mistakes] *= 0.5 / (1 - sum_of_weights)

    def train(self, data, actual):
        # check that data is a matrix (columns = features; rows = entries)
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        # check that actual is a column vector
        if not isinstance(actual, np.ndarray):
            actual = np.array(actual)
        if 1 == len(actual.shape):
            actual.shape = [len(actual), 1]
        elif 1 == actual.shape[0]:
            actual = actual.T

        unique_actual = np.unique(actual)
        if 2 < len(unique_actual):
            raise Exception('Actual values must contain only binary classification.')
        if 1 not in unique_actual:
            raise Exception('Must have at least one positive binary classification.')
        if 0 in unique_actual:
            actual[actual == 0] = -1
        elif -1 not in unique_actual:
            raise Exception('Must have at least one negative binary classification.')

        # store
        self.data = data
        self.actual = actual

        # seed configuration
        self.seed_features()
        self.seed_classifiers()

        # fill initial weights (uniform, summing to one)
        self.weights = np.ones(actual.shape) / actual.shape[0]

        # initial values
        self.selected_classifiers = []
        self.alphas = []

        # iterations
        for i in xrange(0, self.max_iterations):
            if 1 < self.debug:
                print 'Iteration ', i + 1

            # run iteration
            self._train_iteration()

            # stop early once the ensemble reaches the target training error
            training_error = np.mean(np.sign(self.apply_to_matrix(data)) != actual)
            if training_error < self.target_error:
                break

    def apply_to_matrix(self, data):
        # check that data is a matrix (columns = features; rows = entries)
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        # build return array
        ret = np.zeros((data.shape[0], 1))

        # weighted vote of the selected weak classifiers
        for i, c in enumerate(self.selected_classifiers):
            ret += c.classify_data(data) * self.alphas[i]

        return ret

--------------------------------------------------------------------------------
/adaboost/classifiers/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from linear import Linear

--------------------------------------------------------------------------------
/adaboost/classifiers/binary.py:
--------------------------------------------------------------------------------
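"""
A Binary classifier is a Linear classifier specialized for features whose
values are already 0/1: instead of sweeping every possible threshold
position, it only proposes the boundaries that can matter for two-valued
data -- classify nothing as -1, split between the 0s and the 1s, or
classify everything as -1. For example, for sorted values [0, 0, 1, 1, 1]
(num_total = 5, num_true = 3) the candidate boundaries are 0, 2 and 5.
"""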
from linear import Linear
import numpy as np


class Binary(Linear):
    def get_potential_boundaries(self, processed_data):
        # for 0/1 data only three boundaries are meaningful: none, the 0/1 split, all
        num_total = processed_data.shape[0]
        num_true = np.sum(processed_data == 1)
        yield 0
        yield num_total - num_true
        if num_true < num_total:
            yield num_total

--------------------------------------------------------------------------------
/adaboost/classifiers/classifier.py:
--------------------------------------------------------------------------------
import abc


class Classifier:
    # required for @abc.abstractmethod to be enforced in Python 2
    __metaclass__ = abc.ABCMeta

    def __init__(self, feature):
        self.error = None
        self.precision = None
        self.feature = feature

    @abc.abstractmethod
    def describe(self):
        return ''

    @abc.abstractmethod
    def ready_data(self, data, actual, weights):
        """
        Takes a data matrix, a column vector of actual classifications (-1 or 1) and a column vector of weights and
        customizes the classifier accordingly (e.g., sets the threshold). If this function returns true, it must
        fill in the error and precision parameters based on the data.
        :param data: np.ndarray
        :param actual: np.ndarray
        :param weights: np.ndarray
        :return: bool
        """
        pass

    @abc.abstractmethod
    def classify_data(self, data):
        """
        Takes a data matrix and returns a set of predicted classifications (either -1 or 1).
        :param data: np.ndarray
        :return: np.ndarray
        """
        pass

--------------------------------------------------------------------------------
/adaboost/classifiers/linear.py:
--------------------------------------------------------------------------------
from classifier import Classifier
import numpy as np


class Linear(Classifier):
    # boundaries between values closer together than this are skipped
    min_boundary = 1e-5

    def __init__(self, feature):
        Classifier.__init__(self, feature)
        self.threshold = None

    def get_sort_func(self):
        # None means the default ascending comparison
        return None

    def process_data(self, val):
        # hook for subclasses to pre-process feature values
        return val

    def get_potential_boundaries(self, processed_data):
        # every split position, plus 0 (nothing below) and len (everything below)
        return range(0, len(processed_data) + 1)

    def to_precision(self, err):
        # distance from coin-flipping; an error near 0 or 1 is equally useful
        return abs(err - 0.5)

    def describe(self):
        return self.feature.describe() + ' > ' + str(self.threshold)

    def ready_data(self, data, actual, weights):
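        """
        Selects the threshold for this feature with a single sorted sweep.
        The rows are sorted by feature value, and each candidate boundary is
        the count of rows that would fall at or below the threshold (and so
        be classified -1). Starting from the boundary-0 error (the total
        weight of the negative examples, since everything is classified 1),
        the weighted error is updated incrementally as the boundary advances:
        a positive example crossing below the threshold adds its weight, a
        negative example removes its weight. The boundary whose error is
        farthest from 0.5 wins; errors above 0.5 are still useful because
        the caller can invert the classifier.
        """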
        # process data (allow for some sort of pre-processing)
        processed_data = self.process_data(self.feature.extract(data))

        # make paired lists of processed data, actual classification and weights
        lst = [(x, actual[i, 0], weights[i, 0]) for i, x in enumerate(processed_data)]
        lst.sort(cmp=self.get_sort_func(), key=lambda entry: entry[0])

        # figure out initial precision (boundary 0: everything classified 1)
        best_threshold = lst[0][0] - 1e-5
        err = np.sum(weights[actual == -1])
        best_err = err
        best_precision = self.to_precision(err)
        last_boundary = 0

        # consider all boundaries
        for boundary in self.get_potential_boundaries(processed_data):
            # 0 case taken care of by the initial best_threshold and best_precision
            if 0 == boundary:
                continue

            # same value (allows linear classifier to work with discrete data)
            if boundary < len(lst) and (lst[boundary][0] - lst[boundary - 1][0]) < self.min_boundary:
                continue

            # advance boundary, updating the weighted error incrementally
            for j in xrange(last_boundary, boundary):
                if 0 < lst[j][1]:
                    err += lst[j][2]
                else:
                    err -= lst[j][2]
            last_boundary = boundary

            # get precision
            precision = self.to_precision(err)

            # is improvement
            if precision > best_precision:
                best_err = err
                best_precision = precision
                if boundary == len(lst):
                    best_threshold = lst[-1][0] + 1e-5
                else:
                    best_threshold = (lst[boundary - 1][0] + lst[boundary][0]) / 2.

        self.threshold = best_threshold
        self.error = best_err
        self.precision = best_precision

        return True

    def classify_data(self, data):
        val = self.process_data(self.feature.extract(data))

        # 1 for true, -1 for false
        ret = np.ones((data.shape[0], 1))
        ret[val <= self.threshold] = -1
        return ret

--------------------------------------------------------------------------------
/adaboost/features/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'nathan'

from simple import Simple

--------------------------------------------------------------------------------
/adaboost/features/feature.py:
--------------------------------------------------------------------------------
import abc


class Feature:
    # required for @abc.abstractmethod to be enforced in Python 2
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        pass

    @staticmethod
    def discover_features(data):
        """
        Used to discover features automatically from a data matrix.
        :param data: np.ndarray
        :return: Feature[]
        """
        return []

    @abc.abstractmethod
    def extract(self, data):
        """
        Extracts the desired feature from the matrix data. Returns it as a vector.
        :param data: np.ndarray
        :return: np.ndarray
        """
        return 0.

    @abc.abstractmethod
    def get_classifiers(self):
        """
        Offers a list of potential classifiers for this feature (used for seeding an unconfigured algorithm).
        :return: Classifier[]
        """
        return []

    @abc.abstractmethod
    def describe(self):
        return ''

--------------------------------------------------------------------------------
/adaboost/features/simple.py:
--------------------------------------------------------------------------------
from feature import Feature
from adaboost.classifiers import Linear


class Simple(Feature):
    """
    A Simple feature extracts a single column from the data matrix.
    """
    def __init__(self, column):
        Feature.__init__(self)
        self.column = column

    @staticmethod
    def discover_features(data):
        # one feature per column of the data matrix
        return [Simple(i) for i in xrange(0, data.shape[1])]

    def extract(self, data):
        return data[:, self.column]

    def get_classifiers(self):
        return [Linear(self)]

    def describe(self):
        return 'Column ' + str(self.column)

--------------------------------------------------------------------------------
/adaboost/utilities.py:
--------------------------------------------------------------------------------
import numpy as np


def to_column_matrix(arr_or_mat):
    """
    Reshapes a one-dimensional array or a row vector into a column vector;
    anything already column-shaped is returned unchanged.
    :param arr_or_mat: np.ndarray
    :return: np.ndarray
    """
    if len(arr_or_mat.shape) == 1:
        arr_or_mat.shape = [len(arr_or_mat), 1]
    elif np.shape(arr_or_mat)[0] == 1:
        return arr_or_mat.T
    return arr_or_mat
--------------------------------------------------------------------------------