├── LICENSE
├── README
├── plot_sparse_filtering.py
├── setup.py
└── sparse_filtering.py

/LICENSE:
--------------------------------------------------------------------------------
New BSD License

Copyright (c) 2014 Jan Hendrik Metzen
All rights reserved.


Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

a. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.
b. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
c. Neither the name of the software's developer nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written
   permission.


THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
sparse-filtering
================

Unsupervised feature learning based on sparse filtering.

This implements the method described in `Jiquan Ngiam, Pang Wei Koh,
Zhenghao Chen, Sonia Bhaskar, Andrew Y. Ng:
Sparse Filtering. NIPS 2011: 1125-1133`
and is based on the Matlab code provided in the paper's supplementary
material.
--------------------------------------------------------------------------------
/plot_sparse_filtering.py:
--------------------------------------------------------------------------------
"""
===========================================
Sparse filtering on Olivetti faces
===========================================

Unsupervised learning of features for images from the Olivetti faces dataset
using the sparse filtering algorithm. Linear features for sub-patches of the
Olivetti faces are learned with sparse filtering. The algorithm does not try
to model the data's distribution but rather learns features which are sparsely
activated: for each image, only a small subset of features is activated
(population sparsity); each feature is activated on only a small subset of the
examples (lifetime sparsity); and all features are activated roughly equally
often (high dispersal). This sparsity is encoded as an objective function, and
L-BFGS is used to minimize this function.

Plotted are the weight matrices of the features (corresponding roughly to
Gabor filters) and feature activation histograms.
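
Concretely (see sparse_filtering.py), the features are passed through a
soft-absolute activation, normalized across examples so that every feature is
equally active overall, and then normalized per example onto the unit l2-ball;
the minimized objective is the sum of l1-norms of the normalized per-example
feature vectors, sum_i ||fhat(i)||_1.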
18 | """ 19 | print(__doc__) 20 | 21 | import numpy as np 22 | import pylab as pl 23 | 24 | from sparse_filtering import SparseFiltering 25 | 26 | from sklearn.feature_extraction.image import extract_patches_2d 27 | 28 | from sklearn.datasets import fetch_olivetti_faces 29 | 30 | patch_width = 16 # Learn features for patches of size patch_width*patch_width 31 | n_patches = 25 # Determines number of random patches extracted from each image 32 | n_features = 64 # How many features are learned 33 | maxfun = 200 # The maximal number of evaluations of the objective function 34 | iprint = 10 # after how many function evaluations is information printed 35 | # by L-BFGS. -1 for no information 36 | 37 | ############################################################################### 38 | # Load faces data, normalize faces, and convert 2d structures 39 | dataset = fetch_olivetti_faces(shuffle=True) 40 | faces = dataset.data 41 | 42 | n_samples, _ = faces.shape 43 | 44 | faces_centered = faces - faces.mean(axis=0) # global centering 45 | 46 | faces_centered -= \ 47 | faces_centered.mean(axis=1).reshape(n_samples, -1) # local centering 48 | 49 | faces_centered = \ 50 | faces_centered.reshape(n_samples, 64, 64) # Reshaping to 64*64 pixel 51 | 52 | print("Dataset consists of %d faces" % n_samples) 53 | 54 | ############################################################################### 55 | # Extract n_patches patches randomly from each image 56 | patches = [extract_patches_2d(faces_centered[i], (patch_width, patch_width), 57 | max_patches=n_patches, random_state=i) 58 | for i in range(n_samples)] 59 | patches = np.array(patches).reshape(-1, patch_width * patch_width) 60 | 61 | ############################################################################### 62 | estimator = \ 63 | SparseFiltering(n_features=n_features, maxfun=maxfun, iprint=iprint) 64 | features = estimator.fit_transform(patches) 65 | 66 | # ############################################################################# 67 | # Plot weights of features 68 | pl.figure(0, figsize=(12, 10)) 69 | pl.subplots_adjust(left=0.01, bottom=0.01, right=0.99, top=0.95, 70 | wspace=0.1, hspace=0.4) 71 | for i in range(estimator.w_.shape[0]): 72 | pl.subplot(int(np.sqrt(n_features)), int(np.sqrt(n_features)), i + 1) 73 | pl.pcolor(estimator.w_[i].reshape(patch_width, patch_width), 74 | cmap=pl.cm.gray) 75 | pl.xticks(()) 76 | pl.yticks(()) 77 | pl.title("Feature %4d" % i) 78 | 79 | # Plot feature histogram 80 | pl.figure(1) 81 | pl.hist(features) 82 | pl.title("Feature activation histogram") 83 | 84 | # Plot Lifetime Sparsity histogram 85 | # Lifetime Sparsity: Each feature should only be active for a few examples 86 | pl.figure(2) 87 | activated_features = (features > 0.1).mean(0) 88 | pl.hist(activated_features) 89 | pl.xlabel("Feature activation ratio over all examples") 90 | pl.title("Lifetime Sparsity Histogram") 91 | 92 | # Plot Population Sparsity histogram 93 | # Population Sparsity: Each example should be represented by only a few active 94 | # features 95 | pl.figure(3) 96 | activated_features = (features > 0.1).mean(1) 97 | pl.hist(activated_features) 98 | pl.xlabel("Ratio of active features in example") 99 | pl.title("Population Sparsity Histogram") 100 | 101 | pl.show() 102 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | 
setup(name='sparse_filtering',
      version='1.1',
      description='Unsupervised feature learning based on sparse filtering',
      author='Jan Hendrik Metzen',
      author_email='jhm@informatik.uni-bremen.de',
      url='https://github.com/jmetzen/sparse-filtering',
      py_modules=['sparse_filtering'])
--------------------------------------------------------------------------------
/sparse_filtering.py:
--------------------------------------------------------------------------------
"""Feature learning based on sparse filtering"""
# Author: Jan Hendrik Metzen
# License: BSD 3 clause

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

from sklearn.base import BaseEstimator


class SparseFiltering(BaseEstimator):
    r"""Sparse filtering

    Unsupervised learning of features using the sparse filtering algorithm.
    Features are linear in the inputs, i.e., f_j(x) = \sum_i w_{ij} x_i.
    This algorithm does not try to model the data's distribution but rather
    to learn features which are sparsely activated in the sense of

    * Population Sparsity: for each example, only a small subset of features
      is activated.
    * Lifetime Sparsity: each feature is activated on only a small subset of
      the examples.
    * High Dispersal: uniform activity distribution across the features.

    This is encoded as an objective function which maps the weight vector w
    to a scalar value that becomes smaller the sparser the features are.
    L-BFGS is used to minimize this objective function.

    Parameters
    ----------
    n_features : int
        Number of features to be learned.

    maxfun : int
        Maximum number of evaluations of the objective function in L-BFGS-B.
        Defaults to 500.

    iprint : int
        Verbosity of L-BFGS-B. Information regarding the objective function
        is printed every iprint function evaluations; no information is
        printed if set to -1. Defaults to -1.

    Attributes
    ----------
    `w_` : array, [n_features, n_inputs]
        Sparse components extracted from the data.

    Notes
    -----
    This implements the method described in `Jiquan Ngiam, Pang Wei Koh,
    Zhenghao Chen, Sonia Bhaskar, Andrew Y. Ng:
    Sparse Filtering. NIPS 2011: 1125-1133`
    and is based on the Matlab code provided in the paper's supplementary
    material.
    """

    def __init__(self, n_features, maxfun=500, iprint=-1):
        self.n_features = n_features
        self.iprint = iprint
        self.maxfun = maxfun

    def fit(self, X, y=None, **params):
        """Fit the model with X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self.w_ = self._fit(X, **params)
        return self

    def transform(self, X):
        """Apply the learned features to X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Data to be transformed.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_features), where n_features
            is the number of learned features.
        """
        return self._transform(X)

    def fit_transform(self, X, y=None, **params):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
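
        y : Ignored
            Not used; present only for API consistency by convention.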

        Returns
        -------
        X_new : array-like, shape (n_samples, n_features), where n_features
            is the number of learned features.

        """
        self.w_ = self._fit(X, **params)
        return self._transform(X)

    def _fit(self, X):
        # Transpose the data in order to be consistent with the Matlab code
        X = np.array(X.T)
        # Subtract the mean from each image patch
        X -= X.mean(0)

        def l2grad(X, Y, N, D):
            # Backpropagate through the l2 normalization
            return D / N[:, None] - Y \
                * (D * X).sum(1)[:, None] / (N ** 2)[:, None]

        def objective_fct(w):
            # View the 1d weight vector as a 2d matrix
            W = w.reshape(self.n_features, X.shape[0])

            # Determine features resulting from the weight vector
            F, Fs, L2Fs, NFs, L2Fn, Fhat = self._determine_features(X, W)

            # Compute the sparsity of each feature over all examples, i.e.,
            # its l1-norm; the objective function is the sum over these
            # sparsities
            obj = np.apply_along_axis(np.linalg.norm, 1, Fhat, 1).sum()
            # Backpropagate through each feedforward step
            deltaW = l2grad(NFs.T, Fhat, L2Fn, np.ones_like(Fhat))
            deltaW = l2grad(Fs, NFs, L2Fs, deltaW.T)
            deltaW = (deltaW * (F / Fs)).dot(X.T)

            # Return the objective value and the gradient
            return obj, deltaW.flatten()

        # Choose initial weights randomly from [-1, 1)
        w0 = np.random.random(X.shape[0] * self.n_features) * 2 - 1
        # Use L-BFGS to find weights which correspond to a (local) minimum of
        # the objective function
        w, _, _ = fmin_l_bfgs_b(objective_fct, w0, iprint=self.iprint,
                                maxfun=self.maxfun)

        return w.reshape(self.n_features, X.shape[0])

    def _transform(self, X):
        # Transpose the data in order to be consistent with the Matlab code
        X = np.array(X.T)
        # Subtract the mean from each image patch
        X -= X.mean(0)

        W = self.w_.reshape(self.n_features, X.shape[0])

        # Determine features resulting from the weight vector
        # (ignore the internals required only for the gradient)
        _, _, _, _, _, Fhat = self._determine_features(X, W)

        return Fhat

    def _determine_features(self, X, W):
        # Compute unnormalized features by multiplying the weight matrix
        # with the data
        F = W.dot(X)  # linear activation
        Fs = np.sqrt(F ** 2 + 1e-8)  # soft-absolute activation
        # Normalize each feature to be equally active by dividing each
        # feature by its l2-norm across all examples
        L2Fs = np.apply_along_axis(np.linalg.norm, 1, Fs)
        NFs = Fs / L2Fs[:, None]
        # Normalize the features of each example so that they lie on the
        # unit l2-ball
        L2Fn = np.apply_along_axis(np.linalg.norm, 1, NFs.T)
        Fhat = NFs.T / L2Fn[:, None]

        return F, Fs, L2Fs, NFs, L2Fn, Fhat
--------------------------------------------------------------------------------
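
Usage sketch (illustrative, not part of the repository; the synthetic data,
sizes, and parameter values below are placeholder assumptions, since any
(n_samples, n_inputs) array can serve as input):

    import numpy as np

    from sparse_filtering import SparseFiltering

    # Synthetic stand-in for image patches: 1000 examples with 256 input
    # dimensions (e.g., flattened 16x16 patches)
    rng = np.random.RandomState(0)
    X = rng.randn(1000, 256)

    # Learn 32 sparsely activated linear features, capping the number of
    # objective function evaluations at 100
    estimator = SparseFiltering(n_features=32, maxfun=100)
    features = estimator.fit_transform(X)  # shape (1000, 32)

    # Each row of w_ holds the weight vector of one learned feature
    print(features.shape, estimator.w_.shape)  # (1000, 32) (32, 256)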