├── .gitignore
├── LICENSE
├── README.md
└── adaboost
    ├── __init__.py
    ├── algorithm.py
    ├── classifiers
    │   ├── __init__.py
    │   ├── binary.py
    │   ├── classifier.py
    │   └── linear.py
    ├── features
    │   ├── __init__.py
    │   ├── feature.py
    │   └── simple.py
    └── utilities.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 L. Nathan Perkins

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
adaboost
========

AdaBoost

An AdaBoost (adaptive boosting) implementation built around simple threshold
classifiers, based on a linear regression trainer and feature selection class
initially developed to help analyze and make predictions for the MIT Big Data
Challenge. The trainer evaluates a pool of weak classifiers (by default, one
linear threshold classifier per column of the data matrix) and iteratively
adds the weak classifier that best minimizes the weighted training error.
When complete, the instance holds the selected weak classifiers and their
weights (alphas), which together form the boosted classifier.

Dependencies: Python 2.7, numpy

Usage:

    import adaboost

    t = adaboost.AdaBoost()

    # print detailed debugging information regarding the classifier selection
    t.debug = 2

    # train classifier
    t.train(x, y)  # x is a matrix, y is a vector of actual classifications (-1 or 1)

    # classify a novel set of values; the sign of each returned value is the predicted binary class
    novel_y_prime = t.apply_to_matrix(novel_x)

Methods
-------

The following methods are available for instances of the AdaBoost class.

* `train(x, y)` Will begin training on matrix x and classification set y, where y contains
  binary classification data (either 0/1 or -1/1). The system will evaluate potential weak
  classifiers and iteratively add the weak classifier that best minimizes the weighted
  error.

* `apply_to_matrix(p_x)` Applies the selected weak classifiers to novel values and
  returns a vector with the predicted classifications. The sign of each returned value
  is the predicted class, and its magnitude reflects the confidence of the prediction.
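For example, to turn the raw scores returned by `apply_to_matrix` into hard -1/1
labels (a minimal sketch; `novel_x` is assumed to be a numpy matrix with the same
columns as the training data):

    import numpy as np

    scores = t.apply_to_matrix(novel_x)
    labels = np.sign(scores)  # -1 or 1 per row; exactly 0 means the weighted votes cancelled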
Attributes
----------

The following attributes are available for instances of the AdaBoost class.

* `debug` Allows printing of information about the training process. Can be 0 (no
  debugging), 1 (minimal debugging) or 2 (detailed debugging). Minimal debugging prints
  each weak classifier as it is added, while detailed debugging also prints every
  candidate classifier evaluated along with per-iteration statistics.

* `max_iterations` The maximum number of weak classifiers to use.

* `target_error` The target error for the training data set. Once the training error is
  less than this value, the algorithm will stop.

--------------------------------------------------------------------------------
/adaboost/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

__author__ = 'nathan'

"""
Dependency: numpy
"""

from algorithm import AdaBoost

--------------------------------------------------------------------------------
/adaboost/algorithm.py:
--------------------------------------------------------------------------------
import features
import copy
import numpy as np


class AdaBoost:
    def __init__(self):
        self.debug = 0
        self.features = []
        self.classifiers = []
        self.max_iterations = 5
        self.target_error = 0.01

        # data
        self.data = None
        self.actual = None

        # training weights
        self.weights = None

        # trained parameters
        self.selected_classifiers = None
        self.alphas = None

    def seed_features(self):
        # discover one feature per column if no features were configured
        if 0 == len(self.features):
            self.features = features.Simple.discover_features(self.data)

    def seed_classifiers(self):
        # build the pool of candidate weak classifiers from the features
        if 0 == len(self.classifiers):
            for f in self.features:
                self.classifiers.extend(f.get_classifiers())

    def _evaluate_classifier(self, classifier):
        # fit the classifier to the current weights and read back its quality
        classifier.ready_data(self.data, self.actual, self.weights)

        if 1 < self.debug:
            print 'Eval: ', classifier.describe(), ' (precision: ', classifier.precision, '; error: ', classifier.error, ')'

        return classifier.precision, classifier.error

    def _train_iteration(self):
        # classifier count
        classifier_count = len(self.classifiers)

        # track the best candidate (highest precision, i.e., error farthest from 0.5)
        best_precision = None
        best_error = None
        best_j = None
        for j in xrange(0, classifier_count):
            # get precision
            precision, error = self._evaluate_classifier(self.classifiers[j])

            # best?
            if best_precision is None or precision > best_precision:
                best_precision = precision
                best_error = error
                best_j = j

        # a classifier with error above 0.5 is used inverted (negative alpha)
        if 0.5 < best_error:
            invert = -1
            best_error = 1 - best_error
        else:
            invert = 1

        # guard against a perfect classifier (the log and the weight update
        # below would otherwise divide by zero)
        best_error = max(best_error, 1e-10)

        # new classifier
        classifier = copy.copy(self.classifiers[best_j])
        alpha = 0.5 * invert * np.log((1 - best_error) / best_error)

        # print classifier
        if 0 < self.debug:
            print 'Added: ', classifier.describe(), ' (alpha = ', alpha, ')'

        # add alpha
        self.alphas.append(alpha)
        self.selected_classifiers.append(classifier)

        # get mistakes
        mistakes = (classifier.classify_data(self.data) != self.actual)

        if 0 > invert:
            mistakes = ~mistakes

        sum_of_weights = np.sum(self.weights * mistakes)
        num_actual_0 = np.sum(self.actual[mistakes] == -1)
        num_actual_1 = np.sum(self.actual[mistakes] == 1)

        if 1 < self.debug:
            print 'Incorrect: ', np.sum(mistakes), '; Correct: ', np.sum(~mistakes)
            print 'False positives: ', num_actual_0, '; False negatives: ', num_actual_1
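        # Why this scaling works: with weighted error e = sum_of_weights, the
        # standard AdaBoost update multiplies each weight by exp(-alpha * y * h(x))
        # and renormalizes. For the optimal alpha = 0.5 * ln((1 - e) / e), that is
        # equivalent to scaling misclassified examples by 0.5 / e and correct ones
        # by 0.5 / (1 - e), so the mistakes end up carrying exactly half the total
        # weight. For example, with e = 0.2, mistakes are scaled by 0.5 / 0.2 = 2.5,
        # correct examples by 0.5 / 0.8 = 0.625, and the weights still sum to 1.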
        # reweight: mistakes get half the total weight, correct examples the other half
        self.weights[mistakes] *= 0.5 / sum_of_weights
        self.weights[~mistakes] *= 0.5 / (1 - sum_of_weights)

    def train(self, data, actual):
        # check that data is a matrix (columns = features; rows = entries)
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        # check that actual is a column vector
        if not isinstance(actual, np.ndarray):
            actual = np.array(actual)
        if 1 == len(actual.shape):
            actual.shape = [len(actual), 1]
        elif 1 == actual.shape[0]:
            actual = actual.T

        unique_actual = np.unique(actual)
        if 2 < len(unique_actual):
            raise Exception('Actual values must contain only binary classification.')
        if 1 not in unique_actual:
            raise Exception('Must have at least one positive binary classification.')
        if 0 in unique_actual:
            actual[actual == 0] = -1
        elif -1 not in unique_actual:
            raise Exception('Must have at least one negative binary classification.')

        # store
        self.data = data
        self.actual = actual

        # seed configuration
        self.seed_features()
        self.seed_classifiers()

        # fill initial weights (uniform, summing to one)
        self.weights = np.ones(actual.shape) / actual.shape[0]

        # initial values
        self.selected_classifiers = []
        self.alphas = []

        # iterations
        for i in xrange(0, self.max_iterations):
            if 1 < self.debug:
                print 'Iteration ', i + 1

            # run iteration
            self._train_iteration()

            # stop early once the ensemble reaches the target training error
            training_error = np.mean(np.sign(self.apply_to_matrix(data)) != actual)
            if training_error < self.target_error:
                break

    def apply_to_matrix(self, data):
        # check that data is a matrix (columns = features; rows = entries)
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        # build return array
        ret = np.zeros((data.shape[0], 1))

        # weighted vote of the selected weak classifiers
        for i, c in enumerate(self.selected_classifiers):
            ret += c.classify_data(data) * self.alphas[i]

        return ret

--------------------------------------------------------------------------------
/adaboost/classifiers/__init__.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from linear import Linear

--------------------------------------------------------------------------------
/adaboost/classifiers/binary.py:
--------------------------------------------------------------------------------
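"""
A Binary classifier is a Linear classifier specialized for features whose
values are already 0/1: instead of sweeping every possible threshold
position, it only proposes the boundaries that can matter for two-valued
data -- classify nothing as -1, split between the 0s and the 1s, or
classify everything as -1. For example, for sorted values [0, 0, 1, 1, 1]
(num_total = 5, num_true = 3) the candidate boundaries are 0, 2 and 5.
"""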
from linear import Linear
import numpy as np


class Binary(Linear):
    def get_potential_boundaries(self, processed_data):
        # for 0/1 data only three boundaries are meaningful: none, the 0/1 split, all
        num_total = processed_data.shape[0]
        num_true = np.sum(processed_data == 1)
        yield 0
        yield num_total - num_true
        if num_true < num_total:
            yield num_total

--------------------------------------------------------------------------------
/adaboost/classifiers/classifier.py:
--------------------------------------------------------------------------------
import abc


class Classifier:
    # required for @abc.abstractmethod to be enforced in Python 2
    __metaclass__ = abc.ABCMeta

    def __init__(self, feature):
        self.error = None
        self.precision = None
        self.feature = feature

    @abc.abstractmethod
    def describe(self):
        return ''

    @abc.abstractmethod
    def ready_data(self, data, actual, weights):
        """
        Takes a data matrix, a column vector of actual classifications (-1 or 1) and a column vector of weights and
        customizes the classifier accordingly (e.g., sets the threshold). If this function returns true, it must
        fill in the error and precision parameters based on the data.
        :param data: np.ndarray
        :param actual: np.ndarray
        :param weights: np.ndarray
        :return: bool
        """
        pass

    @abc.abstractmethod
    def classify_data(self, data):
        """
        Takes a data matrix and returns a set of predicted classifications (either -1 or 1).
        :param data: np.ndarray
        :return: np.ndarray
        """
        pass

--------------------------------------------------------------------------------
/adaboost/classifiers/linear.py:
--------------------------------------------------------------------------------
from classifier import Classifier
import numpy as np


class Linear(Classifier):
    # boundaries between values closer together than this are skipped
    min_boundary = 1e-5

    def __init__(self, feature):
        Classifier.__init__(self, feature)
        self.threshold = None

    def get_sort_func(self):
        # None means the default ascending comparison
        return None

    def process_data(self, val):
        # hook for subclasses to pre-process feature values
        return val

    def get_potential_boundaries(self, processed_data):
        # every split position, plus 0 (nothing below) and len (everything below)
        return range(0, len(processed_data) + 1)

    def to_precision(self, err):
        # distance from coin-flipping; an error near 0 or 1 is equally useful
        return abs(err - 0.5)

    def describe(self):
        return self.feature.describe() + ' > ' + str(self.threshold)

    def ready_data(self, data, actual, weights):
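        """
        Selects the threshold for this feature with a single sorted sweep.
        The rows are sorted by feature value, and each candidate boundary is
        the count of rows that would fall at or below the threshold (and so
        be classified -1). Starting from the boundary-0 error (the total
        weight of the negative examples, since everything is classified 1),
        the weighted error is updated incrementally as the boundary advances:
        a positive example crossing below the threshold adds its weight, a
        negative example removes its weight. The boundary whose error is
        farthest from 0.5 wins; errors above 0.5 are still useful because
        the caller can invert the classifier.
        """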
        # process data (allow for some sort of pre-processing)
        processed_data = self.process_data(self.feature.extract(data))

        # make paired lists of processed data, actual classification and weights
        lst = [(x, actual[i, 0], weights[i, 0]) for i, x in enumerate(processed_data)]
        lst.sort(cmp=self.get_sort_func(), key=lambda entry: entry[0])

        # figure out initial precision (boundary 0: everything classified 1)
        best_threshold = lst[0][0] - 1e-5
        err = np.sum(weights[actual == -1])
        best_err = err
        best_precision = self.to_precision(err)
        last_boundary = 0

        # consider all boundaries
        for boundary in self.get_potential_boundaries(processed_data):
            # 0 case taken care of by the initial best_threshold and best_precision
            if 0 == boundary:
                continue

            # same value (allows linear classifier to work with discrete data)
            if boundary < len(lst) and (lst[boundary][0] - lst[boundary - 1][0]) < self.min_boundary:
                continue

            # advance boundary, updating the weighted error incrementally
            for j in xrange(last_boundary, boundary):
                if 0 < lst[j][1]:
                    err += lst[j][2]
                else:
                    err -= lst[j][2]
            last_boundary = boundary

            # get precision
            precision = self.to_precision(err)

            # is improvement
            if precision > best_precision:
                best_err = err
                best_precision = precision
                if boundary == len(lst):
                    best_threshold = lst[-1][0] + 1e-5
                else:
                    best_threshold = (lst[boundary - 1][0] + lst[boundary][0]) / 2.

        self.threshold = best_threshold
        self.error = best_err
        self.precision = best_precision

        return True

    def classify_data(self, data):
        val = self.process_data(self.feature.extract(data))

        # 1 for true, -1 for false
        ret = np.ones((data.shape[0], 1))
        ret[val <= self.threshold] = -1
        return ret

--------------------------------------------------------------------------------
/adaboost/features/__init__.py:
--------------------------------------------------------------------------------
__author__ = 'nathan'

from simple import Simple

--------------------------------------------------------------------------------
/adaboost/features/feature.py:
--------------------------------------------------------------------------------
import abc


class Feature:
    # required for @abc.abstractmethod to be enforced in Python 2
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        pass

    @staticmethod
    def discover_features(data):
        """
        Used to discover features automatically from a data matrix.
        :param data: np.ndarray
        :return: Feature[]
        """
        return []

    @abc.abstractmethod
    def extract(self, data):
        """
        Extracts the desired feature from the matrix data. Returns it as a vector.
        :param data: np.ndarray
        :return: np.ndarray
        """
        return 0.

    @abc.abstractmethod
    def get_classifiers(self):
        """
        Offers a list of potential classifiers for this feature (used for seeding an unconfigured algorithm).
        :return: Classifier[]
        """
        return []

    @abc.abstractmethod
    def describe(self):
        return ''

--------------------------------------------------------------------------------
/adaboost/features/simple.py:
--------------------------------------------------------------------------------
from feature import Feature
from adaboost.classifiers import Linear


class Simple(Feature):
    """
    A Simple feature extracts a single column from the data matrix.
    """
    def __init__(self, column):
        Feature.__init__(self)
        self.column = column

    @staticmethod
    def discover_features(data):
        # one feature per column of the data matrix
        return [Simple(i) for i in xrange(0, data.shape[1])]

    def extract(self, data):
        return data[:, self.column]

    def get_classifiers(self):
        return [Linear(self)]

    def describe(self):
        return 'Column ' + str(self.column)

--------------------------------------------------------------------------------
/adaboost/utilities.py:
--------------------------------------------------------------------------------
import numpy as np


def to_column_matrix(arr_or_mat):
    """
    Reshapes a one-dimensional array or a row vector into a column vector;
    anything already column-shaped is returned unchanged.
    :param arr_or_mat: np.ndarray
    :return: np.ndarray
    """
    if len(arr_or_mat.shape) == 1:
        arr_or_mat.shape = [len(arr_or_mat), 1]
    elif np.shape(arr_or_mat)[0] == 1:
        return arr_or_mat.T
    return arr_or_mat
--------------------------------------------------------------------------------