├── .gitignore
├── LICENSE
├── README.md
├── forest_cluster
│   ├── __init__.py
│   ├── forest_embedding.py
│   ├── k_medoids.py
│   ├── similarity.pyx
│   └── tests
│       ├── __init__.py
│       ├── fixtures.py
│       └── test_forest_clustering.py
├── setup.py
└── test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Joshua Loyal

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Random Forest Clustering
Unsupervised clustering using random forests.

The pipeline: (1) train a random forest to discriminate real rows from
synthetic rows drawn from the data's empirical marginal distributions,
(2) use the forest's leaf assignments as a dissimilarity measure and embed
the points with a manifold learner (e.g. t-SNE or MDS), and (3) cluster the
embedding with k-means or k-medoids.

--------------------------------------------------------------------------------
/forest_cluster/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

from forest_cluster.forest_embedding import RandomForestEmbedding
from forest_cluster.k_medoids import KMedoids

--------------------------------------------------------------------------------
/forest_cluster/forest_embedding.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

import numpy as np
import scipy.sparse as sp
from sklearn.ensemble.forest import BaseForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state, check_array
from sklearn.preprocessing import OneHotEncoder


def bootstrap_sample_column(X, n_samples=None, random_state=1234):
    """bootstrap_sample_column

    Bootstrap sample (sample with replacement) a single column of a dataset.

    Parameters
    ----------
    X : np.ndarray (n_samples,)
        Column to bootstrap.

    n_samples : int
        Number of samples to generate. If `None`, generate a bootstrap
        sample with the same number of rows as `X`.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    np.ndarray (n_samples,):
        The bootstrapped column.
    """
    random_state = check_random_state(random_state)
    if n_samples is None:
        n_samples = X.shape[0]

    return random_state.choice(X, size=n_samples, replace=True)


def uniform_sample_column(X, n_samples=None, random_state=1234):
    """uniform_sample_column

    Sample a column uniformly between its minimum and maximum value.

    Parameters
    ----------
    X : np.ndarray (n_samples,)
        Column to sample.

    n_samples : int
        Number of samples to generate. If `None`, generate a sample with
        the same number of rows as `X`.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    np.ndarray (n_samples,):
        Uniformly sampled column.
    """
    random_state = check_random_state(random_state)
    if n_samples is None:
        n_samples = X.shape[0]

    min_X, max_X = np.min(X), np.max(X)
    return random_state.uniform(min_X, max_X, size=n_samples)
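

# Illustrative example (added, not part of the original module): both helpers
# draw a synthetic column from a real one. `bootstrap` resamples the observed
# values, while `uniform` draws from the observed range.
#
#   >>> col = np.array([1., 5., 5., 9.])
#   >>> bootstrap_sample_column(col, random_state=0)  # values from {1., 5., 9.}
#   >>> uniform_sample_column(col, random_state=0)    # floats in [1., 9.]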


def generate_synthetic_features(X, method='bootstrap', random_state=1234):
    """generate_synthetic_features

    Generate a synthetic dataset based on the empirical distribution
    of `X`.

    Parameters
    ----------
    X : np.ndarray (n_samples, n_features)
        Dataset whose empirical distribution is used to generate the
        synthetic dataset.

    method : str {'bootstrap', 'uniform'}
        Method used to generate the synthetic dataset. `bootstrap`
        samples each column with replacement. `uniform` generates
        a new column uniformly sampled between the minimum and
        maximum value of each column.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    synth_X : np.ndarray (n_samples, n_features)
        The synthetic dataset.
    """
    random_state = check_random_state(random_state)
    n_features = int(X.shape[1])
    synth_X = np.empty_like(X)
    for column in range(n_features):
        if method == 'bootstrap':
            synth_X[:, column] = bootstrap_sample_column(
                X[:, column], random_state=random_state)
        elif method == 'uniform':
            synth_X[:, column] = uniform_sample_column(
                X[:, column], random_state=random_state)
        else:
            raise ValueError('method must be either `bootstrap` or `uniform`.')

    return synth_X


def generate_discriminative_dataset(X, method='bootstrap', random_state=1234):
    """generate_discriminative_dataset.

    Generate a synthetic dataset based on the empirical distribution
    of `X`. A target column is returned that is 1 if the row comes from
    the real dataset and 0 if the row is synthetic. The number of
    synthetic rows generated equals the number of rows in the original
    dataset.

    Parameters
    ----------
    X : np.ndarray (n_samples, n_features)
        Dataset whose empirical distribution is used to generate the
        synthetic dataset.

    method : str {'bootstrap', 'uniform'}
        Method used to generate the synthetic dataset. `bootstrap`
        samples each column with replacement. `uniform` generates
        a new column uniformly sampled between the minimum and
        maximum value of each column.

    random_state : int or RandomState
        Seed or state for the random number generator.

    Returns
    -------
    X_ : np.ndarray (2 * n_samples, n_features)
        Feature array for the combined dataset. The rows are randomly
        shuffled, so actual and synthetic samples are intermixed.

    y_ : np.ndarray (2 * n_samples,)
        Target column indicating whether the row is from the actual
        dataset (1) or synthetic (0).
    """
    random_state = check_random_state(random_state)
    n_samples = int(X.shape[0])

    synth_X = generate_synthetic_features(
        X, method=method, random_state=random_state)
    X_ = np.vstack((X, synth_X))
    y_ = np.concatenate((np.ones(n_samples), np.zeros(n_samples)))

    permutation_indices = random_state.permutation(np.arange(X_.shape[0]))
    X_ = X_[permutation_indices, :]
    y_ = y_[permutation_indices]

    return X_, y_
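

# Illustrative example (added, not part of the original module): the
# discriminative dataset doubles the row count; real rows get label 1 and
# synthetic rows get label 0.
#
#   >>> X = np.random.RandomState(0).normal(size=(100, 3))
#   >>> X_, y_ = generate_discriminative_dataset(X)
#   >>> X_.shape
#   (200, 3)
#   >>> int(y_.sum())   # half the rows are real
#   100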


class RandomForestEmbedding(BaseForest):
    """Very similar to scikit-learn's RandomTreesEmbedding; however, the
    forest is trained as a discriminator between the real dataset and a
    synthetic dataset drawn from its empirical marginal distributions.
    """
    def __init__(self,
                 n_estimators=10,
                 criterion='gini',
                 max_depth=5,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features='auto',
                 max_leaf_nodes=None,
                 bootstrap=True,
                 sparse_output=True,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False):
        super(RandomForestEmbedding, self).__init__(
            base_estimator=DecisionTreeClassifier(),
            n_estimators=n_estimators,
            estimator_params=("criterion", "max_depth", "min_samples_split",
                              "min_samples_leaf", "min_weight_fraction_leaf",
                              "max_features", "max_leaf_nodes",
                              "random_state"),
            bootstrap=bootstrap,
            oob_score=False,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start)

        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.sparse_output = sparse_output

    def _set_oob_score(self, X, y):
        raise NotImplementedError("OOB score not supported in tree embedding")

    def fit(self, X, y=None, sample_weight=None):
        self.fit_transform(X, y, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y=None, sample_weight=None):
        X = check_array(X, accept_sparse=['csc'], ensure_2d=False)

        if sp.issparse(X):
            # Pre-sort indices to avoid that each individual tree of the
            # ensemble sorts the indices.
            X.sort_indices()

        # Forward the estimator's random_state so the synthetic dataset
        # generation respects it rather than a fixed default seed.
        X_, y_ = generate_discriminative_dataset(
            X, random_state=self.random_state)

        super(RandomForestEmbedding, self).fit(X_, y_,
                                               sample_weight=sample_weight)

        self.one_hot_encoder_ = OneHotEncoder(sparse=True)
        if self.sparse_output:
            return self.one_hot_encoder_.fit_transform(self.apply(X))
        return self.apply(X)

    def transform(self, X):
        if self.sparse_output:
            # Reuse the encoder fitted in fit_transform instead of
            # refitting it on new data.
            return self.one_hot_encoder_.transform(self.apply(X))
        return self.apply(X)
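

# Illustrative usage sketch (added, not part of the original module):
#
#   >>> from forest_cluster import RandomForestEmbedding
#   >>> embedder = RandomForestEmbedding(n_estimators=100, random_state=0)
#   >>> leaves = embedder.fit_transform(X)   # sparse one-hot leaf indicators
#
# Two rows are similar when they land in the same leaf in many trees, so
# hamming-style distances on the leaf assignments act as the forest
# dissimilarity used downstream for embedding and clustering.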

--------------------------------------------------------------------------------
/forest_cluster/k_medoids.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""K-medoids clustering"""

# Authors: Timo Erkkilä
#          Antti Lehmussola
# License: BSD 3 clause

import numpy as np
import warnings

from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils import check_array, check_random_state
from sklearn.utils.validation import check_is_fitted


class KMedoids(BaseEstimator, ClusterMixin, TransformerMixin):
    """
    k-medoids class.

    Parameters
    ----------
    n_clusters : int, optional, default: 8
        Number of medoids. Must be positive.

    distance_metric : string or callable, optional, default: 'euclidean'
        Distance metric to use.

    clustering_method : {'pam'}, optional, default: 'pam'
        Clustering mode to use.

    init : {'random', 'heuristic'}, optional, default: 'heuristic'
        Medoid initialization method.

    max_iter : int, optional, default: 300
        Maximum number of iterations when fitting.

    random_state : int, optional, default: None
        Random state for the random number generator.
    """

    # Supported clustering methods
    CLUSTERING_METHODS = ['pam']

    # Supported initialization methods
    INIT_METHODS = ['random', 'heuristic']

    def __init__(self, n_clusters=8, distance_metric='euclidean',
                 clustering_method='pam', init='heuristic',
                 max_iter=300, random_state=None):

        self.n_clusters = n_clusters
        self.distance_metric = distance_metric
        self.init = init
        self.max_iter = max_iter
        self.clustering_method = clustering_method
        self.random_state = random_state

    def _check_init_args(self):

        # Check n_clusters
        if self.n_clusters is None or self.n_clusters <= 0 or \
                not isinstance(self.n_clusters, int):
            raise ValueError("n_clusters has to be a positive integer")

        # Check distance_metric
        if callable(self.distance_metric):
            self.distance_func = self.distance_metric
        elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
            self.distance_func = \
                PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
        else:
            raise ValueError("distance_metric needs to be callable or one "
                             "of the following strings: "
                             "{}. Instead, '{}' was given.".format(
                                 PAIRWISE_DISTANCE_FUNCTIONS.keys(),
                                 self.distance_metric))

        # Check clustering_method
        if self.clustering_method not in self.CLUSTERING_METHODS:
            raise ValueError("clustering_method must be one of the "
                             "following: {}".format(self.CLUSTERING_METHODS))

        # Check init
        if self.init not in self.INIT_METHODS:
            raise ValueError("init needs to be one of the following: "
                             "{}".format(self.INIT_METHODS))

        # Check random state
        self.random_state_ = check_random_state(self.random_state)
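
    # Illustrative note (added, not part of the original class):
    # `distance_metric` accepts either a metric name known to scikit-learn
    # or a callable, e.g.
    #
    #   KMedoids(n_clusters=3, distance_metric='manhattan')
    #   KMedoids(n_clusters=3, distance_metric=my_pairwise_func)
    #
    # where `my_pairwise_func(X, Y=None)` is a hypothetical callable
    # returning a pairwise distance matrix.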

    def fit(self, X, y=None):
        """Fit K-Medoids to the provided data.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)

        Returns
        -------
        self
        """

        self._check_init_args()

        # Check that the array is good and attempt to convert it to
        # Numpy array if possible
        X = self._check_array(X)

        # Apply distance metric to get the distance matrix
        if self.distance_func:
            D = self.distance_func(X)
        else:
            D = X

        medoid_ics = self._get_initial_medoid_indices(D, self.n_clusters)

        # Old medoids will be stored here for reference
        old_medoid_ics = np.zeros((self.n_clusters,))

        # Continue the algorithm as long as the medoids keep changing and
        # the maximum number of iterations is not exceeded
        self.n_iter_ = 0
        while not np.all(old_medoid_ics == medoid_ics) and \
                self.n_iter_ < self.max_iter:

            self.n_iter_ += 1

            # Keep a copy of the old medoid assignments
            old_medoid_ics = np.copy(medoid_ics)

            # Get cluster indices
            cluster_ics = self._get_cluster_ics(D, medoid_ics)

            # Update medoids with the new cluster indices
            self._update_medoid_ics_in_place(D, cluster_ics, medoid_ics)

        # Expose labels_, the assignments of the training data to clusters
        self.labels_ = cluster_ics

        # Expose cluster centers, i.e. medoids
        self.cluster_centers_ = X.take(medoid_ics, axis=0)

        # Return self to enable method chaining
        return self

    def _check_array(self, X):

        X = check_array(X)

        # Check that the number of clusters is less than or equal to
        # the number of samples
        if self.n_clusters > X.shape[0]:
            raise ValueError("The number of medoids ({}) cannot be larger "
                             "than the number of samples ({})".format(
                                 self.n_clusters, X.shape[0]))

        return X

    def _get_cluster_ics(self, D, medoid_ics):
        """Returns cluster indices for D and current medoid indices"""

        # Assign each data point to the cluster whose medoid is closest
        cluster_ics = np.argmin(D[medoid_ics, :], axis=0)

        return cluster_ics
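
    # Illustrative example (added, not part of the original class): with a
    # 3x3 distance matrix and medoid indices [0, 2], the argmin above assigns
    # each column (data point) to its nearest medoid:
    #
    #   D = np.array([[0., 4., 9.],
    #                 [4., 0., 1.],
    #                 [9., 1., 0.]])
    #   np.argmin(D[[0, 2], :], axis=0)   # -> array([0, 1, 1])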

    def _update_medoid_ics_in_place(self, D, cluster_ics, medoid_ics):
        """In-place update of the medoid indices"""

        # Update the medoid for each cluster
        for cluster_idx in range(self.n_clusters):

            if sum(cluster_ics == cluster_idx) == 0:
                warnings.warn("Cluster {} is empty!".format(cluster_idx))
                continue

            # Find the current cost associated with cluster_idx: the sum of
            # the distances from the cluster members to the medoid.
            curr_cost = np.sum(D[medoid_ics[cluster_idx],
                                 cluster_ics == cluster_idx])

            # Extract the distance matrix between the data points
            # inside cluster_idx
            D_in = D[cluster_ics == cluster_idx, :]
            D_in = D_in[:, cluster_ics == cluster_idx]

            # Calculate the cost of making each in-cluster point the medoid
            all_costs = np.sum(D_in, axis=1)

            # Find the index of the smallest cost in cluster_idx
            min_cost_idx = np.argmin(all_costs)

            # Find the value of the minimum cost in cluster_idx
            min_cost = all_costs[min_cost_idx]

            # If the minimum cost is smaller than that of the currently
            # used medoid, switch to the new medoid in cluster_idx
            if min_cost < curr_cost:

                # Find the data points that belong to cluster_idx, and
                # assign the newly found medoid as the medoid for the cluster
                medoid_ics[cluster_idx] = \
                    np.where(cluster_ics == cluster_idx)[0][min_cost_idx]

    def transform(self, X):
        """Transforms X to cluster-distance space.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            Data to transform.

        Returns
        -------
        X_new : array, shape=(n_samples, n_clusters)
            X transformed in the new space.
        """

        check_is_fitted(self, "cluster_centers_")

        # Apply the distance metric with respect to the cluster centers
        # (medoids), and return these distances
        return self.distance_func(X, Y=self.cluster_centers_)

    def predict(self, X):

        check_is_fitted(self, "cluster_centers_")

        # Check that the array is good and attempt to convert it to
        # Numpy array if possible
        X = check_array(X)

        # Apply the distance metric with respect to the cluster centers
        # (medoids)
        D = self.distance_func(X, Y=self.cluster_centers_)

        # Assign each data point to the cluster whose medoid is closest
        labels = np.argmin(D, axis=1)

        return labels

    def inertia(self, X):

        # Map the original X to the distance space
        Xt = self.transform(X)

        # Define inertia as the sum of the sample distances to their
        # closest cluster centers
        inertia = np.sum(np.min(Xt, axis=1))

        return inertia
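
    # Illustrative usage sketch (added, not part of the original class;
    # `X` and `X_new` are placeholder arrays):
    #
    #   km = KMedoids(n_clusters=2, random_state=0).fit(X)
    #   km.labels_            # cluster index for each training row
    #   km.cluster_centers_   # actual data rows chosen as medoids
    #   km.predict(X_new)     # nearest-medoid assignment for new rows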

    def _get_initial_medoid_indices(self, D, n_clusters):

        if self.init == 'random':  # Random initialization

            # Pick k random medoids as the initial ones
            medoids = self.random_state_.permutation(D.shape[0])[:n_clusters]

        elif self.init == 'heuristic':  # Initialization by heuristic

            # Pick the k data points with the smallest sum of distances to
            # every other point. These are the initial medoids.
            medoids = list(np.argsort(np.sum(D, axis=1))[:n_clusters])

        else:

            raise ValueError("Initialization not implemented for method: "
                             "'{}'".format(self.init))

        return medoids

--------------------------------------------------------------------------------
/forest_cluster/similarity.pyx:
--------------------------------------------------------------------------------
#cython: boundscheck=False
#cython: cdivision=True
#cython: wraparound=False

import numpy as np
cimport numpy as np
np.import_array()

ctypedef np.int_t DTYPE_t
ctypedef np.intp_t ITYPE_t


cdef inline double dist(DTYPE_t* x1, DTYPE_t* x2,
                        ITYPE_t size) nogil except -1:
    # Matching (hamming) distance: the fraction of coordinates on which
    # the two vectors disagree.
    cdef int n_eq = 0
    cdef ITYPE_t j
    for j in range(size):
        n_eq += (x1[j] == x2[j])
    return 1. - n_eq * 1. / size


def matching_dist(np.ndarray[DTYPE_t, ndim=1] X1, np.ndarray[DTYPE_t, ndim=1] X2):
    cdef ITYPE_t size = X1.shape[0]

    return dist(<DTYPE_t*> X1.data, <DTYPE_t*> X2.data, size)
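

# Illustrative example (added, not part of the original module; requires
# building the extension first, with arrays of dtype np.int_ to match
# DTYPE_t):
#
#   >>> import numpy as np
#   >>> from forest_cluster.similarity import matching_dist
#   >>> a = np.array([1, 2, 3], dtype=np.int_)
#   >>> b = np.array([1, 3, 1], dtype=np.int_)
#   >>> matching_dist(a, b)   # rows agree on 1 of 3 coordinates
#   0.666...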

--------------------------------------------------------------------------------
/forest_cluster/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/joshloyal/RandomForestClustering/f883eb5391befbd5e79080ebae979cc7b7e6f1b5/forest_cluster/tests/__init__.py

--------------------------------------------------------------------------------
/forest_cluster/tests/fixtures.py:
--------------------------------------------------------------------------------
import pytest
import numpy as np


def generate_clustered_data(seed=0, n_clusters=3, n_features=2,
                            n_samples_per_cluster=20, std=.4):
    prng = np.random.RandomState(seed)

    # The data is deliberately shifted away from zero to check the
    # clustering algorithm's robustness with regard to non-centered data.
    means = np.array([[1, 1, 1, 0],
                      [-1, -1, 0, 1],
                      [1, -1, 1, 1],
                      [-1, 1, 1, 0],
                      ]) + 10

    X = np.empty((0, n_features))
    for i in range(n_clusters):
        X = np.r_[X, means[i][:n_features]
                  + std * prng.randn(n_samples_per_cluster, n_features)]
    return X


@pytest.fixture
def simple_cluster():
    return generate_clustered_data

--------------------------------------------------------------------------------
/forest_cluster/tests/test_forest_clustering.py:
--------------------------------------------------------------------------------
import forest_cluster as rfc
from forest_cluster.tests.fixtures import simple_cluster


def test_forest_clusterer(simple_cluster):
    X = simple_cluster()
    # The package exports RandomForestEmbedding (there is no
    # RandomForestClusterer class), so exercise the embedding step.
    embedder = rfc.RandomForestEmbedding(n_estimators=10, random_state=1234)
    leaves = embedder.fit_transform(X)
    assert leaves.shape[0] == X.shape[0]

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
import subprocess
import sys
import contextlib

from setuptools import Extension, setup
import numpy


PACKAGES = [
    'forest_cluster',
    'forest_cluster.tests',
]

CYTHON_MODS = [
    'forest_cluster.similarity'
]


@contextlib.contextmanager
def chdir(new_dir):
    old_dir = os.getcwd()
    try:
        # Actually change into the new directory; the original version
        # only touched sys.path and never called os.chdir.
        os.chdir(new_dir)
        sys.path.insert(0, new_dir)
        yield
    finally:
        del sys.path[0]
        os.chdir(old_dir)


def clean(path):
    for name in CYTHON_MODS:
        name = name.replace('.', '/')
        for ext in ['.c', '.cpp', '.so']:
            file_path = os.path.join(path, name + ext)
            if os.path.exists(file_path):
                os.unlink(file_path)


def get_python_package(root):
    return os.path.join(root, 'forest_cluster')


def generate_sources(root):
    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pyx'):
                yield os.path.join(base, filename)


def generate_cython(root, cython_cov=False):
    print("Cythonizing sources")
    for source in generate_sources(get_python_package(root)):
        cythonize_source(source, cython_cov)


def cythonize_source(source, cython_cov=False):
    print("Processing %s" % source)

    flags = ['--fast-fail']
    if cython_cov:
        flags.extend(['--directive', 'linetrace=True'])
        flags.extend(['--directive', 'binding=True'])

    try:
        p = subprocess.call(['cython'] + flags + [source])
        if p != 0:
            raise Exception('Cython failed')
    except OSError:
        raise OSError('Cython needs to be installed')


def generate_extensions(root, macros=[]):
    ext_modules = []
    for mod_name in CYTHON_MODS:
        mod_path = mod_name.replace('.', '/') + '.c'
        ext_modules.append(
            Extension(mod_name,
                      sources=[mod_path],
                      include_dirs=[numpy.get_include()],
                      extra_compile_args=['-O3', '-fPIC'],
                      define_macros=macros))

    return ext_modules


def setup_package():
    root = os.path.abspath(os.path.dirname(__file__))

    if len(sys.argv) > 1 and sys.argv[1] == 'clean':
        return clean(root)

    cython_cov = 'CYTHON_COV' in os.environ

    macros = []
    if cython_cov:
        print("Adding coverage information to cythonized files.")
        macros = [('CYTHON_TRACE', 1)]

    with chdir(root):
        generate_cython(root, cython_cov)
        ext_modules = generate_extensions(root, macros=macros)
        setup(
            name="Random Forest Clustering",
            version='0.1.0',
            description='Unsupervised Clustering using Random Forests',
            author='Joshua D. Loyal',
            url='https://github.com/joshloyal/RandomForestClustering',
            license='MIT',
            install_requires=['numpy', 'scipy', 'scikit-learn', 'joblib'],
            packages=PACKAGES,
            ext_modules=ext_modules,
        )


if __name__ == '__main__':
    setup_package()
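

# Illustrative build workflow (assumed typical usage, not taken from the
# repository's docs):
#
#   $ pip install numpy cython
#   $ python setup.py build_ext --inplace   # cythonize + compile similarity.pyx
#   $ python setup.py clean                 # remove generated .c/.cpp/.so files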

--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from __future__ import division

from sklearn import manifold
from sklearn import ensemble
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
import forest_cluster as rfc
from forest_cluster import KMedoids
import numpy as np
from mysuper.datasets import fetch_cars, fetch_10kdiabetes
from scipy.spatial.distance import hamming
import seaborn as sns
import pandas as pd
import time
import scipy.sparse as sp


"""
Random Forest clustering works as follows:
1. Construct a dissimilarity measure using a random forest.
2. Use an embedding algorithm (MDS, t-SNE) to embed the points into a 2D
   space that preserves that dissimilarity measure.
3. Cluster the embedding using k-means or k-medoids.
"""


# Hamming distance between rows of leaf indices, e.g. [1, 2, 3] and
# [1, 3, 1] agree only in the first coordinate, giving a distance of 2/3.
def fast_hamming_binary_dense(X):
    # Note: operates on a dense 0/1 (one-hot) matrix.
    n_features = X.shape[1]
    D = np.dot(1 - X, X.T)
    return (D + D.T) / n_features


def fast_hamming_binary_sparse(X, n_matches=None):
    if n_matches:
        n_features = n_matches
    else:
        n_features = X.shape[1]
    H = (X * X.T).toarray()
    return 1 - H / n_features


def fast_hamming_dense(X):
    unique_values = np.unique(X)
    U = sp.csr_matrix((X == unique_values[0]).astype(np.int32))
    H = (U * U.transpose()).toarray()
    for unique_value in unique_values[1:]:
        U = sp.csr_matrix((X == unique_value).astype(np.int32))
        H += (U * U.transpose()).toarray()
    return 1 - H.astype(np.float64) / X.shape[1]


X = fetch_cars().values
#X = fetch_10kdiabetes(one_hot=False).values

n_trees = 5000
print('tree embedding')
t0 = time.time()
rf = rfc.RandomForestEmbedding(n_estimators=n_trees, random_state=10,
                               n_jobs=-1, sparse_output=False)
leaves = rf.fit_transform(X)
print('time: %r s' % (time.time() - t0))


print('embedding data')
t0 = time.time()

#if leaves.shape[1] > 50:
#    projection = TruncatedSVD(n_components=50, random_state=123).fit_transform(leaves)
#else:
#    projection = leaves.toarray()
#dissimilarity = fast_hamming_binary_sparse(leaves, n_matches=n_trees)
#projector = manifold.TSNE(random_state=1234, metric='precomputed')
projector = manifold.TSNE(random_state=1234, metric='hamming')
embedding = projector.fit_transform(leaves)


#projector = manifold.MDS(random_state=1234, dissimilarity='precomputed')
#embedding = projector.fit_transform(dissimilarity)
print('time: %r s' % (time.time() - t0))


print('clustering')
t0 = time.time()
clusterer = KMeans(n_clusters=4, random_state=1234, n_init=20, n_jobs=-1)
clusterer.fit(embedding)

#clusterer = KMedoids(n_clusters=3, random_state=1234, distance_metric='precomputed')
#clusterer.fit(np.load('hamming.npy'))
print('time: %r s' % (time.time() - t0))


df = pd.DataFrame({'x': embedding[:, 0], 'y': embedding[:, 1],
                   'z': clusterer.labels_})
sns.lmplot('x', 'y', hue='z', data=df, fit_reg=False)
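

# Illustrative sanity check (added example, not in the original script): for
# one-hot indicator matrices built from leaf indices, the sparse identity
# 1 - (U * U.T) / n_trees reproduces the hamming distance on the raw rows.
def _check_hamming_identity():
    raw = np.array([[1, 2, 3], [1, 3, 1], [2, 2, 3]])  # leaf index per tree
    # One indicator matrix per unique value, mirroring fast_hamming_dense.
    H = np.zeros((raw.shape[0], raw.shape[0]))
    for value in np.unique(raw):
        U = sp.csr_matrix((raw == value).astype(np.int32))
        H += (U * U.T).toarray()
    D = 1 - H / raw.shape[1]
    assert np.isclose(D[0, 1], hamming(raw[0], raw[1]))  # both give 2/3

--------------------------------------------------------------------------------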