├── setup.py ├── test ├── create_digits_data.py └── test.py ├── README.markdown ├── LICENSE.txt └── feast.py /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='feast', 4 | version='1.0', 5 | py_modules=['feast'], 6 | ) 7 | -------------------------------------------------------------------------------- /test/create_digits_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from sklearn import datasets 3 | 4 | digits = datasets.load_digits() # load the data from scikits 5 | data = digits.images.reshape((digits.images.shape[0], -1)) 6 | labels = digits.target # extract the labels 7 | 8 | fw = open('digit.txt', 'w') 9 | 10 | for n in range(len(data)): 11 | mstr = '' 12 | for x in data[n]: 13 | mstr += str(x) + '\t' 14 | fw.write(mstr + str(labels[n]) + '\n') 15 | 16 | fw.close() -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # PyFeast 2 | Python bindings to the FEAST Feature Selection Toolbox. 3 | 4 | ## Download 5 | 6 | [Download Version 1.1](https://github.com/mutantturkey/PyFeast/releases/tag/v1.1) 7 | ## About PyFeast 8 | PyFeast is an interface for the FEAST feature selection toolbox, which was 9 | originally written in C with an interface to Matlab. 10 | 11 | Because Python is also commonly used in computational science, writing bindings 12 | to enable researchers to utilize these feature selection algorithms in Python 13 | was only natural. 14 | 15 | At Drexel University's [EESI Lab](http://www.ece.drexel.edu/gailr/EESI/), we are using PyFeast to create a feature 16 | selection tool for the Department of Energy's upcoming KBase platform. 
We are also integrating a tool that utilizes 17 | PyFeast as a script for Qiime users: [Qiime Fizzy Branch](https://github.com/EESI/FizzyQIIME) 18 | 19 | ## Requirements 20 | In order to use the feast module, you will need the following dependencies 21 | 22 | * Python 2.7 23 | * Numpy 24 | * Linux or OS X 25 | * [MIToolbox](https://github.com/Craigacp/MIToolbox) 26 | * [FEAST](https://github.com/Craigacp/FEAST) v1.1.1 or higher 27 | 28 | ## Installation 29 | 30 | python ./setup.py build 31 | sudo python ./setup.py install 32 | 33 | ## Demonstration 34 | See test/test.py for an example with uniform data and an image 35 | data set. The image data set was collected from the digits example in 36 | the Scikits-Learn toolbox. Make sure that if you are loading the data from a file and converting the data to a `numpy` array that you set `order="F"`. This is *very* important. 37 | 38 | ## Documentation 39 | We have documentation for each of the functions available [here](http://mutantturkey.github.com/PyFeast/feast-module.html) 40 | 41 | ## References 42 | * [FEAST](http://www.cs.man.ac.uk/~gbrown/fstoolbox/) - The Feature Selection Toolbox 43 | * [Fizzy](http://www.kbase.us/developer-zone/api-documentation/fizzy-feature-selection-service/) - A KBase Service for Feature Selection 44 | * [Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection](http://jmlr.csail.mit.edu/papers/v13/brown12a.html) 45 | -------------------------------------------------------------------------------- /test/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from feast import * 3 | import numpy as np 4 | import csv 5 | 6 | 7 | def check_result(selected_features, n_relevant): 8 | selected_features = sorted(selected_features) 9 | success = True 10 | for k in range(n_relevant): 11 | if k != selected_features[k]: 12 | success = False 13 | return success 14 | 15 | def 
read_digits(fname='digit.txt'): 16 | ''' 17 | read_digits(fname='digit.txt') 18 | 19 | read a data file that contains the features and class labels. 20 | each row of the file is a feature vector with the class 21 | label appended. 22 | ''' 23 | 24 | fw = csv.reader(open(fname,'rb'), delimiter='\t') 25 | data = [] 26 | for line in fw: 27 | data.append( [float(x) for x in line] ) 28 | data = np.array(data, order="F") 29 | labels = data[:,len(data.transpose())-1] 30 | data = data[:,:len(data.transpose())-1] 31 | return data, labels 32 | 33 | def uniform_data(n_observations = 1000, n_features = 50, n_relevant = 5): 34 | import numpy as np 35 | xmax = 10 36 | xmin = 0 37 | data = 1.0*np.random.randint(xmax + 1, size = (n_features, n_observations)) 38 | labels = np.zeros(n_observations) 39 | delta = n_relevant * (xmax - xmin) / 2.0 40 | 41 | for m in range(n_observations): 42 | zz = 0.0 43 | for k in range(n_relevant): 44 | zz += data[k, m] 45 | if zz > delta: 46 | labels[m] = 1 47 | else: 48 | labels[m] = 2 49 | data = data.transpose() 50 | return data, labels 51 | 52 | 53 | 54 | 55 | 56 | n_relevant = 5 57 | data_source = 'uniform' # set the data set we want to test 58 | 59 | 60 | if data_source == 'uniform': 61 | data, labels = uniform_data(n_relevant = n_relevant) 62 | elif data_source == 'digits': 63 | data, labels = read_digits('digit.txt') 64 | 65 | n_observations = len(data) # number of samples in the data set 66 | n_features = len(data.transpose()) # number of features in the data set 67 | n_select = 15 # how many features to select 68 | method = 'MIM' # feature selection algorithm 69 | 70 | 71 | print '---> Information' 72 | print ' :n_observations - ' + str(n_observations) 73 | print ' :n_features - ' + str(n_features) 74 | print ' :n_select - ' + str(n_select) 75 | print ' :algorithm - ' + str(method) 76 | print ' ' 77 | print '---> Running unit tests on FEAST 4 Python... 
' 78 | 79 | 80 | ################################################################# 81 | ################################################################# 82 | print ' Running BetaGamma... ' 83 | sf = BetaGamma(data, labels, n_select, beta=0.5, gamma=0.5) 84 | if check_result(sf, n_relevant) == True: 85 | print ' BetaGamma passed!' 86 | else: 87 | print ' BetaGamma failed!' 88 | 89 | ################################################################# 90 | ################################################################# 91 | print ' Running CMIM... ' 92 | sf = CMIM(data, labels, n_select) 93 | if check_result(sf, n_relevant) == True: 94 | print ' CMIM passed!' 95 | else: 96 | print ' CMIM failed!' 97 | 98 | 99 | ################################################################# 100 | ################################################################# 101 | print ' Running CondMI... ' 102 | sf = CondMI(data, labels, n_select) 103 | if check_result(sf, n_relevant) == True: 104 | print ' CondMI passed!' 105 | else: 106 | print ' CondMI failed!' 107 | 108 | 109 | ################################################################# 110 | ################################################################# 111 | print ' Running DISR... ' 112 | sf = DISR(data, labels, n_select) 113 | if check_result(sf, n_relevant) == True: 114 | print ' DISR passed!' 115 | else: 116 | print ' DISR failed!' 117 | 118 | 119 | ################################################################# 120 | ################################################################# 121 | print ' Running ICAP... ' 122 | sf = ICAP(data, labels, n_select) 123 | if check_result(sf, n_relevant) == True: 124 | print ' ICAP passed!' 125 | else: 126 | print ' ICAP failed!' 127 | 128 | 129 | ################################################################# 130 | ################################################################# 131 | print ' Running JMI... 
' 132 | sf = JMI(data, labels, n_select) 133 | if check_result(sf, n_relevant) == True: 134 | print ' JMI passed!' 135 | else: 136 | print ' JMI failed!' 137 | 138 | 139 | ################################################################# 140 | ################################################################# 141 | print ' Running mRMR... ' 142 | sf = mRMR(data, labels, n_select) 143 | if check_result(sf, n_relevant) == True: 144 | print ' mRMR passed!' 145 | else: 146 | print ' mRMR failed!' 147 | 148 | ################################################################# 149 | ################################################################# 150 | print ' Running MIM...' 151 | sf = MIM(data, labels, n_select) 152 | if check_result(sf, n_relevant) == True: 153 | print ' MIM passed!' 154 | else: 155 | print ' MIM failed!' 156 | 157 | print '---> Done unit tests!' 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Pyfeast - python bindings for the Feature Selection Toolbox 4 | 5 | Copyright (C) Calvin Morrison, Gregory Ditzler 6 | 7 | This program is free software: you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation, either version 3 of the License, or 10 | (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | license below for more details.. 16 | 17 | 18 | GNU LESSER GENERAL PUBLIC LICENSE 19 | Version 3, 29 June 2007 20 | 21 | Copyright (C) 2007 Free Software Foundation, Inc. 22 | Everyone is permitted to copy and distribute verbatim copies 23 | of this license document, but changing it is not allowed. 
24 | 25 | 26 | This version of the GNU Lesser General Public License incorporates 27 | the terms and conditions of version 3 of the GNU General Public 28 | License, supplemented by the additional permissions listed below. 29 | 30 | 0. Additional Definitions. 31 | 32 | As used herein, "this License" refers to version 3 of the GNU Lesser 33 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 34 | General Public License. 35 | 36 | "The Library" refers to a covered work governed by this License, 37 | other than an Application or a Combined Work as defined below. 38 | 39 | An "Application" is any work that makes use of an interface provided 40 | by the Library, but which is not otherwise based on the Library. 41 | Defining a subclass of a class defined by the Library is deemed a mode 42 | of using an interface provided by the Library. 43 | 44 | A "Combined Work" is a work produced by combining or linking an 45 | Application with the Library. The particular version of the Library 46 | with which the Combined Work was made is also called the "Linked 47 | Version". 48 | 49 | The "Minimal Corresponding Source" for a Combined Work means the 50 | Corresponding Source for the Combined Work, excluding any source code 51 | for portions of the Combined Work that, considered in isolation, are 52 | based on the Application, and not on the Linked Version. 53 | 54 | The "Corresponding Application Code" for a Combined Work means the 55 | object code and/or source code for the Application, including any data 56 | and utility programs needed for reproducing the Combined Work from the 57 | Application, but excluding the System Libraries of the Combined Work. 58 | 59 | 1. Exception to Section 3 of the GNU GPL. 60 | 61 | You may convey a covered work under sections 3 and 4 of this License 62 | without being bound by section 3 of the GNU GPL. 63 | 64 | 2. Conveying Modified Versions. 
65 | 66 | If you modify a copy of the Library, and, in your modifications, a 67 | facility refers to a function or data to be supplied by an Application 68 | that uses the facility (other than as an argument passed when the 69 | facility is invoked), then you may convey a copy of the modified 70 | version: 71 | 72 | a) under this License, provided that you make a good faith effort to 73 | ensure that, in the event an Application does not supply the 74 | function or data, the facility still operates, and performs 75 | whatever part of its purpose remains meaningful, or 76 | 77 | b) under the GNU GPL, with none of the additional permissions of 78 | this License applicable to that copy. 79 | 80 | 3. Object Code Incorporating Material from Library Header Files. 81 | 82 | The object code form of an Application may incorporate material from 83 | a header file that is part of the Library. You may convey such object 84 | code under terms of your choice, provided that, if the incorporated 85 | material is not limited to numerical parameters, data structure 86 | layouts and accessors, or small macros, inline functions and templates 87 | (ten or fewer lines in length), you do both of the following: 88 | 89 | a) Give prominent notice with each copy of the object code that the 90 | Library is used in it and that the Library and its use are 91 | covered by this License. 92 | 93 | b) Accompany the object code with a copy of the GNU GPL and this license 94 | document. 95 | 96 | 4. Combined Works. 97 | 98 | You may convey a Combined Work under terms of your choice that, 99 | taken together, effectively do not restrict modification of the 100 | portions of the Library contained in the Combined Work and reverse 101 | engineering for debugging such modifications, if you also do each of 102 | the following: 103 | 104 | a) Give prominent notice with each copy of the Combined Work that 105 | the Library is used in it and that the Library and its use are 106 | covered by this License. 
107 | 108 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 109 | document. 110 | 111 | c) For a Combined Work that displays copyright notices during 112 | execution, include the copyright notice for the Library among 113 | these notices, as well as a reference directing the user to the 114 | copies of the GNU GPL and this license document. 115 | 116 | d) Do one of the following: 117 | 118 | 0) Convey the Minimal Corresponding Source under the terms of this 119 | License, and the Corresponding Application Code in a form 120 | suitable for, and under terms that permit, the user to 121 | recombine or relink the Application with a modified version of 122 | the Linked Version to produce a modified Combined Work, in the 123 | manner specified by section 6 of the GNU GPL for conveying 124 | Corresponding Source. 125 | 126 | 1) Use a suitable shared library mechanism for linking with the 127 | Library. A suitable mechanism is one that (a) uses at run time 128 | a copy of the Library already present on the user's computer 129 | system, and (b) will operate properly with a modified version 130 | of the Library that is interface-compatible with the Linked 131 | Version. 132 | 133 | e) Provide Installation Information, but only if you would otherwise 134 | be required to provide such information under section 6 of the 135 | GNU GPL, and only to the extent that such information is 136 | necessary to install and execute a modified version of the 137 | Combined Work produced by recombining or relinking the 138 | Application with a modified version of the Linked Version. (If 139 | you use option 4d0, the Installation Information must accompany 140 | the Minimal Corresponding Source and Corresponding Application 141 | Code. If you use option 4d1, you must provide the Installation 142 | Information in the manner specified by section 6 of the GNU GPL 143 | for conveying Corresponding Source.) 144 | 145 | 5. Combined Libraries. 
146 | 147 | You may place library facilities that are a work based on the 148 | Library side by side in a single library together with other library 149 | facilities that are not Applications and are not covered by this 150 | License, and convey such a combined library under terms of your 151 | choice, if you do both of the following: 152 | 153 | a) Accompany the combined library with a copy of the same work based 154 | on the Library, uncombined with any other library facilities, 155 | conveyed under the terms of this License. 156 | 157 | b) Give prominent notice with the combined library that part of it 158 | is a work based on the Library, and explaining where to find the 159 | accompanying uncombined form of the same work. 160 | 161 | 6. Revised Versions of the GNU Lesser General Public License. 162 | 163 | The Free Software Foundation may publish revised and/or new versions 164 | of the GNU Lesser General Public License from time to time. Such new 165 | versions will be similar in spirit to the present version, but may 166 | differ in detail to address new problems or concerns. 167 | 168 | Each version is given a distinguishing version number. If the 169 | Library as you received it specifies that a certain numbered version 170 | of the GNU Lesser General Public License "or any later version" 171 | applies to it, you have the option of following the terms and 172 | conditions either of that published version or of any later version 173 | published by the Free Software Foundation. If the Library as you 174 | received it does not specify a version number of the GNU Lesser 175 | General Public License, you may choose any version of the GNU Lesser 176 | General Public License ever published by the Free Software Foundation. 
177 | 178 | If the Library as you received it specifies that a proxy can decide 179 | whether future versions of the GNU Lesser General Public License shall 180 | apply, that proxy's public statement of acceptance of any version is 181 | permanent authorization for you to choose that version for the 182 | Library. 183 | -------------------------------------------------------------------------------- /feast.py: -------------------------------------------------------------------------------- 1 | """ 2 | The FEAST module provides an interface between the C-library 3 | for feature selection to Python. 4 | 5 | References: 6 | 1) G. Brown, A. Pocock, M.-J. Zhao, and M. Lujan, "Conditional 7 | likelihood maximization: A unifying framework for information 8 | theoretic feature selection," Journal of Machine Learning 9 | Research, vol. 13, pp. 27-66, 2012. 10 | 11 | """ 12 | __author__ = "Calvin Morrison" 13 | __copyright__ = "Copyright 2013, EESI Laboratory" 14 | __credits__ = ["Calvin Morrison", "Gregory Ditzler"] 15 | __license__ = "GPL" 16 | __version__ = "0.2.0" 17 | __maintainer__ = "Calvin Morrison" 18 | __email__ = "mutantturkey@gmail.com" 19 | __status__ = "Release" 20 | 21 | import numpy as np 22 | import ctypes as c 23 | 24 | libFSToolbox = c.CDLL("libFSToolbox.so"); 25 | 26 | def BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0): 27 | """ 28 | This algorithm implements conditional mutual information 29 | feature select, such that beta and gamma control the 30 | weight attached to the redundant mutual and conditional 31 | mutual information, respectively. 32 | 33 | @param data: data in a Numpy array such that len(data) = 34 | n_observations, and len(data.transpose()) = n_features 35 | (REQUIRED) 36 | @type data: ndarray 37 | @param labels: labels represented in a numpy list with 38 | n_observations as the number of elements. That is 39 | len(labels) = len(data) = n_observations. 
40 | (REQUIRED) 41 | @type labels: ndarray 42 | @param n_select: number of features to select. (REQUIRED) 43 | @type n_select: integer 44 | @param beta: penalty attacted to I(X_j;X_k) 45 | @type beta: float between 0 and 1.0 46 | @param gamma: positive weight attached to the conditional 47 | redundancy term I(X_k;X_j|Y) 48 | @type gamma: float between 0 and 1.0 49 | @return: features in the order they were selected. 50 | @rtype: list 51 | """ 52 | data, labels = check_data(data, labels) 53 | 54 | # python values 55 | n_observations, n_features = data.shape 56 | output = np.zeros(n_select) 57 | 58 | # cast as C types 59 | c_n_observations = c.c_int(n_observations) 60 | c_n_select = c.c_int(n_select) 61 | c_n_features = c.c_int(n_features) 62 | c_beta = c.c_double(beta) 63 | c_gamma = c.c_double(gamma) 64 | 65 | libFSToolbox.BetaGamma.restype = c.POINTER(c.c_double * n_select) 66 | features = libFSToolbox.BetaGamma(c_n_select, 67 | c_n_observations, 68 | c_n_features, 69 | data.ctypes.data_as(c.POINTER(c.c_double)), 70 | labels.ctypes.data_as(c.POINTER(c.c_double)), 71 | output.ctypes.data_as(c.POINTER(c.c_double)), 72 | c_beta, 73 | c_gamma 74 | ) 75 | 76 | selected_features = [] 77 | for i in features.contents: 78 | selected_features.append(i) 79 | return selected_features 80 | 81 | 82 | def CIFE(data, labels, n_select): 83 | """ 84 | This function implements the Condred feature selection algorithm. 85 | beta = 1; gamma = 1; 86 | 87 | @param data: A Numpy array such that len(data) = 88 | n_observations, and len(data.transpose()) = n_features 89 | @type data: ndarray 90 | @param labels: labels represented in a numpy list with 91 | n_observations as the number of elements. That is 92 | len(labels) = len(data) = n_observations. 93 | @type labels: ndarray 94 | @param n_select: number of features to select. 95 | @type n_select: integer 96 | @return selected_features: features in the order they were selected. 
97 | @rtype: list 98 | """ 99 | return BetaGamma(data, labels, n_select, beta=1.0, gamma=1.0) 100 | 101 | def CMIM(data, labels, n_select): 102 | """ 103 | This function implements the conditional mutual information 104 | maximization feature selection algorithm. Note that this 105 | implementation does not allow for the weighting of the 106 | redundancy terms that BetaGamma will allow you to do. 107 | 108 | @param data: A Numpy array such that len(data) = 109 | n_observations, and len(data.transpose()) = n_features 110 | @type data: ndarray 111 | @param labels: labels represented in a numpy array with 112 | n_observations as the number of elements. That is 113 | len(labels) = len(data) = n_observations. 114 | @type labels: ndarray 115 | @param n_select: number of features to select. 116 | @type n_select: integer 117 | @return: features in the order that they were selected. 118 | @rtype: list 119 | """ 120 | data, labels = check_data(data, labels) 121 | 122 | # python values 123 | n_observations, n_features = data.shape 124 | output = np.zeros(n_select) 125 | 126 | # cast as C types 127 | c_n_observations = c.c_int(n_observations) 128 | c_n_select = c.c_int(n_select) 129 | c_n_features = c.c_int(n_features) 130 | 131 | libFSToolbox.CMIM.restype = c.POINTER(c.c_double * n_select) 132 | features = libFSToolbox.CMIM(c_n_select, 133 | c_n_observations, 134 | c_n_features, 135 | data.ctypes.data_as(c.POINTER(c.c_double)), 136 | labels.ctypes.data_as(c.POINTER(c.c_double)), 137 | output.ctypes.data_as(c.POINTER(c.c_double)) 138 | ) 139 | 140 | selected_features = [] 141 | for i in features.contents: 142 | selected_features.append(i) 143 | 144 | return selected_features 145 | 146 | 147 | 148 | def CondMI(data, labels, n_select): 149 | """ 150 | This function implements the conditional mutual information 151 | maximization feature selection algorithm. 
152 | 153 | @param data: data in a Numpy array such that len(data) = n_observations, 154 | and len(data.transpose()) = n_features 155 | @type data: ndarray 156 | @param labels: represented in a numpy list with 157 | n_observations as the number of elements. That is 158 | len(labels) = len(data) = n_observations. 159 | @type labels: ndarray 160 | @param n_select: number of features to select. 161 | @type n_select: integer 162 | @return: features in the order they were selected. 163 | @rtype list 164 | """ 165 | data, labels = check_data(data, labels) 166 | 167 | # python values 168 | n_observations, n_features = data.shape 169 | output = np.zeros(n_select) 170 | 171 | # cast as C types 172 | c_n_observations = c.c_int(n_observations) 173 | c_n_select = c.c_int(n_select) 174 | c_n_features = c.c_int(n_features) 175 | 176 | libFSToolbox.CondMI.restype = c.POINTER(c.c_double * n_select) 177 | features = libFSToolbox.CondMI(c_n_select, 178 | c_n_observations, 179 | c_n_features, 180 | data.ctypes.data_as(c.POINTER(c.c_double)), 181 | labels.ctypes.data_as(c.POINTER(c.c_double)), 182 | output.ctypes.data_as(c.POINTER(c.c_double)) 183 | ) 184 | 185 | selected_features = [] 186 | for i in features.contents: 187 | selected_features.append(i) 188 | 189 | return selected_features 190 | 191 | 192 | def Condred(data, labels, n_select): 193 | """ 194 | This function implements the Condred feature selection algorithm. 195 | beta = 0; gamma = 1; 196 | 197 | @param data: data in a Numpy array such that len(data) = 198 | n_observations, and len(data.transpose()) = n_features 199 | @type data: ndarray 200 | @param labels: labels represented in a numpy list with 201 | n_observations as the number of elements. That is 202 | len(labels) = len(data) = n_observations. 203 | @type labels: ndarray 204 | @param n_select: number of features to select. 205 | @type n_select: integer 206 | @return: the features in the order they were selected. 
207 | @rtype: list 208 | """ 209 | data, labels = check_data(data, labels) 210 | return BetaGamma(data, labels, n_select, beta=0.0, gamma=1.0) 211 | 212 | 213 | 214 | def DISR(data, labels, n_select): 215 | """ 216 | This function implements the double input symmetrical relevance 217 | feature selection algorithm. 218 | 219 | @param data: data in a Numpy array such that len(data) = 220 | n_observations, and len(data.transpose()) = n_features 221 | @type data: ndarray 222 | @param labels: labels represented in a numpy list with 223 | n_observations as the number of elements. That is 224 | len(labels) = len(data) = n_observations. 225 | @type labels: ndarray 226 | @param n_select: number of features to select. (REQUIRED) 227 | @type n_select: integer 228 | @return: the features in the order they were selected. 229 | @rtype: list 230 | """ 231 | data, labels = check_data(data, labels) 232 | 233 | # python values 234 | n_observations, n_features = data.shape 235 | output = np.zeros(n_select) 236 | 237 | # cast as C types 238 | c_n_observations = c.c_int(n_observations) 239 | c_n_select = c.c_int(n_select) 240 | c_n_features = c.c_int(n_features) 241 | 242 | libFSToolbox.DISR.restype = c.POINTER(c.c_double * n_select) 243 | features = libFSToolbox.DISR(c_n_select, 244 | c_n_observations, 245 | c_n_features, 246 | data.ctypes.data_as(c.POINTER(c.c_double)), 247 | labels.ctypes.data_as(c.POINTER(c.c_double)), 248 | output.ctypes.data_as(c.POINTER(c.c_double)) 249 | ) 250 | 251 | selected_features = [] 252 | for i in features.contents: 253 | selected_features.append(i) 254 | 255 | return selected_features 256 | 257 | def ICAP(data, labels, n_select): 258 | """ 259 | This function implements the interaction capping feature 260 | selection algorithm. 
261 | 262 | @param data: data in a Numpy array such that len(data) = 263 | n_observations, and len(data.transpose()) = n_features 264 | @type data: ndarray 265 | @param labels: labels represented in a numpy list with 266 | n_observations as the number of elements. That is 267 | len(labels) = len(data) = n_observations. 268 | @type labels: ndarray 269 | @param n_select: number of features to select. (REQUIRED) 270 | @type n_select: integer 271 | @return: the features in the order they were selected. 272 | @rtype: list 273 | """ 274 | data, labels = check_data(data, labels) 275 | 276 | # python values 277 | n_observations, n_features = data.shape 278 | output = np.zeros(n_select) 279 | 280 | # cast as C types 281 | c_n_observations = c.c_int(n_observations) 282 | c_n_select = c.c_int(n_select) 283 | c_n_features = c.c_int(n_features) 284 | 285 | libFSToolbox.ICAP.restype = c.POINTER(c.c_double * n_select) 286 | features = libFSToolbox.ICAP(c_n_select, 287 | c_n_observations, 288 | c_n_features, 289 | data.ctypes.data_as(c.POINTER(c.c_double)), 290 | labels.ctypes.data_as(c.POINTER(c.c_double)), 291 | output.ctypes.data_as(c.POINTER(c.c_double)) 292 | ) 293 | 294 | selected_features = [] 295 | for i in features.contents: 296 | selected_features.append(i) 297 | 298 | return selected_features 299 | 300 | def JMI(data, labels, n_select): 301 | """ 302 | This function implements the joint mutual information feature 303 | selection algorithm. 304 | 305 | @param data: data in a Numpy array such that len(data) = 306 | n_observations, and len(data.transpose()) = n_features 307 | @type data: ndarray 308 | @param labels: labels represented in a numpy list with 309 | n_observations as the number of elements. That is 310 | len(labels) = len(data) = n_observations. 311 | @type labels: ndarray 312 | @param n_select: number of features to select. (REQUIRED) 313 | @type n_select: integer 314 | @return: the features in the order they were selected. 
315 | @rtype: list 316 | """ 317 | data, labels = check_data(data, labels) 318 | 319 | # python values 320 | n_observations, n_features = data.shape 321 | output = np.zeros(n_select) 322 | 323 | # cast as C types 324 | c_n_observations = c.c_int(n_observations) 325 | c_n_select = c.c_int(n_select) 326 | c_n_features = c.c_int(n_features) 327 | 328 | libFSToolbox.JMI.restype = c.POINTER(c.c_double * n_select) 329 | features = libFSToolbox.JMI(c_n_select, 330 | c_n_observations, 331 | c_n_features, 332 | data.ctypes.data_as(c.POINTER(c.c_double)), 333 | labels.ctypes.data_as(c.POINTER(c.c_double)), 334 | output.ctypes.data_as(c.POINTER(c.c_double)) 335 | ) 336 | 337 | selected_features = [] 338 | for i in features.contents: 339 | selected_features.append(i) 340 | return selected_features 341 | 342 | 343 | 344 | def MIFS(data, labels, n_select): 345 | """ 346 | This function implements the MIFS algorithm. 347 | beta = 1; gamma = 0; 348 | 349 | @param data: data in a Numpy array such that len(data) = 350 | n_observations, and len(data.transpose()) = n_features 351 | @type data: ndarray 352 | @param labels: labels represented in a numpy list with 353 | n_observations as the number of elements. That is 354 | len(labels) = len(data) = n_observations. 355 | @type labels: ndarray 356 | @param n_select: number of features to select. (REQUIRED) 357 | @type n_select: integer 358 | @return: the features in the order they were selected. 359 | @rtype: list 360 | """ 361 | return BetaGamma(data, labels, n_select, beta=0.0, gamma=0.0) 362 | 363 | 364 | def MIM(data, labels, n_select): 365 | """ 366 | This function implements the MIM algorithm. 367 | beta = 0; gamma = 0; 368 | 369 | @param data: data in a Numpy array such that len(data) = 370 | n_observations, and len(data.transpose()) = n_features 371 | @type data: ndarray 372 | @param labels: labels represented in a numpy list with 373 | n_observations as the number of elements. 
That is 374 | len(labels) = len(data) = n_observations. 375 | @type labels: ndarray 376 | @param n_select: number of features to select. (REQUIRED) 377 | @type n_select: integer 378 | @return: the features in the order they were selected. 379 | @rtype: list 380 | """ 381 | data, labels = check_data(data, labels) 382 | 383 | # python values 384 | n_observations, n_features = data.shape 385 | output = np.zeros(n_select) 386 | 387 | # cast as C types 388 | c_n_observations = c.c_int(n_observations) 389 | c_n_select = c.c_int(n_select) 390 | c_n_features = c.c_int(n_features) 391 | 392 | libFSToolbox.MIM.restype = c.POINTER(c.c_double * n_select) 393 | features = libFSToolbox.MIM(c_n_select, 394 | c_n_observations, 395 | c_n_features, 396 | data.ctypes.data_as(c.POINTER(c.c_double)), 397 | labels.ctypes.data_as(c.POINTER(c.c_double)), 398 | output.ctypes.data_as(c.POINTER(c.c_double)) 399 | ) 400 | 401 | selected_features = [] 402 | for i in features.contents: 403 | selected_features.append(i) 404 | return selected_features 405 | 406 | 407 | def mRMR(data, labels, n_select): 408 | """ 409 | This funciton implements the max-relevance min-redundancy feature 410 | selection algorithm. 411 | 412 | @param data: data in a Numpy array such that len(data) = 413 | n_observations, and len(data.transpose()) = n_features 414 | @type data: ndarray 415 | @param labels: labels represented in a numpy list with 416 | n_observations as the number of elements. That is 417 | len(labels) = len(data) = n_observations. 418 | @type labels: ndarray 419 | @param n_select: number of features to select. (REQUIRED) 420 | @type n_select: integer 421 | @return: the features in the order they were selected. 
422 | @rtype: list 423 | """ 424 | data, labels = check_data(data, labels) 425 | 426 | # python values 427 | n_observations, n_features = data.shape 428 | output = np.zeros(n_select) 429 | 430 | # cast as C types 431 | c_n_observations = c.c_int(n_observations) 432 | c_n_select = c.c_int(n_select) 433 | c_n_features = c.c_int(n_features) 434 | 435 | libFSToolbox.mRMR_D.restype = c.POINTER(c.c_double * n_select) 436 | features = libFSToolbox.mRMR_D(c_n_select, 437 | c_n_observations, 438 | c_n_features, 439 | data.ctypes.data_as(c.POINTER(c.c_double)), 440 | labels.ctypes.data_as(c.POINTER(c.c_double)), 441 | output.ctypes.data_as(c.POINTER(c.c_double)) 442 | ) 443 | 444 | selected_features = [] 445 | for i in features.contents: 446 | selected_features.append(i) 447 | return selected_features 448 | 449 | def check_data(data, labels): 450 | """ 451 | Check dimensions of the data and the labels. Raise and exception 452 | if there is a problem. 453 | 454 | Data and Labels are automatically cast as doubles before calling the 455 | feature selection functions 456 | 457 | @param data: the data 458 | @param labels: the labels 459 | @return (data, labels): ndarray of floats 460 | @rtype: tuple 461 | """ 462 | 463 | if isinstance(data, np.ndarray) is False: 464 | raise Exception("data must be an numpy ndarray.") 465 | if isinstance(labels, np.ndarray) is False: 466 | raise Exception("labels must be an numpy ndarray.") 467 | 468 | if len(data) != len(labels): 469 | raise Exception("data and labels must be the same length") 470 | 471 | return 1.0*np.array(data, order="F"), 1.0*np.array(labels, order="F") 472 | --------------------------------------------------------------------------------