├── .gitignore
├── LICENSE
├── PKG-INFO
├── README.md
├── setup.py
└── skfeature
    ├── __init__.py
    ├── data
    │   ├── ALLAML.mat
    │   ├── BASEHOCK.mat
    │   ├── CLL-SUB-111.mat
    │   ├── COIL20.mat
    │   ├── Carcinom.mat
    │   ├── GLI-85.mat
    │   ├── GLIOMA.mat
    │   ├── Isolet.mat
    │   ├── ORL.mat
    │   ├── PCMAC.mat
    │   ├── Prostate-GE.mat
    │   ├── RELATHE.mat
    │   ├── SMK-CAN-187.mat
    │   ├── TOX-171.mat
    │   ├── USPS.mat
    │   ├── Yale.mat
    │   ├── arcene.mat
    │   ├── colon.mat
    │   ├── gisette.mat
    │   ├── leukemia.mat
    │   ├── lung.mat
    │   ├── lung_small.mat
    │   ├── lymphoma.mat
    │   ├── madelon.mat
    │   ├── nci9.mat
    │   ├── orlraws10P.mat
    │   ├── pixraw10P.mat
    │   ├── warpAR10P.mat
    │   └── warpPIE10P.mat
    ├── example
    │   ├── test_CFS.py
    │   ├── test_CIFE.py
    │   ├── test_CMIM.py
    │   ├── test_DISR.py
    │   ├── test_FCBF.py
    │   ├── test_ICAP.py
    │   ├── test_JMI.py
    │   ├── test_MCFS.py
    │   ├── test_MIFS.py
    │   ├── test_MIM.py
    │   ├── test_MRMR.py
    │   ├── test_NDFS.py
    │   ├── test_RFS.py
    │   ├── test_SPEC.py
    │   ├── test_UDFS.py
    │   ├── test_alpha_investing.py
    │   ├── test_chi_square.py
    │   ├── test_decision_tree_backward.py
    │   ├── test_decision_tree_forward.py
    │   ├── test_f_score.py
    │   ├── test_fisher_score.py
    │   ├── test_gini_index.py
    │   ├── test_group_fs.py
    │   ├── test_lap_score.py
    │   ├── test_ll_l21.py
    │   ├── test_low_variance.py
    │   ├── test_ls_l21.py
    │   ├── test_reliefF.py
    │   ├── test_svm_backward.py
    │   ├── test_svm_forward.py
    │   ├── test_t_score.py
    │   ├── test_trace_ratio.py
    │   └── test_tree_fs.py
    ├── function
    │   ├── __init__.py
    │   ├── information_theoretical_based
    │   │   ├── CIFE.py
    │   │   ├── CMIM.py
    │   │   ├── DISR.py
    │   │   ├── FCBF.py
    │   │   ├── ICAP.py
    │   │   ├── JMI.py
    │   │   ├── LCSI.py
    │   │   ├── MIFS.py
    │   │   ├── MIM.py
    │   │   ├── MRMR.py
    │   │   └── __init__.py
    │   ├── similarity_based
    │   │   ├── SPEC.py
    │   │   ├── __init__.py
    │   │   ├── fisher_score.py
    │   │   ├── lap_score.py
    │   │   ├── reliefF.py
    │   │   └── trace_ratio.py
    │   ├── sparse_learning_based
    │   │   ├── MCFS.py
    │   │   ├── NDFS.py
    │   │   ├── RFS.py
    │   │   ├── UDFS.py
    │   │   ├── __init__.py
    │   │   ├── ll_l21.py
    │   │   └── ls_l21.py
    │   ├── statistical_based
    │   │   ├── CFS.py
    │   │   ├── __init__.py
    │   │   ├── chi_square.py
    │   │   ├── f_score.py
    │   │   ├── gini_index.py
    │   │   ├── low_variance.py
    │   │   └── t_score.py
    │   ├── streaming
    │   │   ├── __init__.py
    │   │   └── alpha_investing.py
    │   ├── structure
    │   │   ├── __init__.py
    │   │   ├── graph_fs.py
    │   │   ├── group_fs.py
    │   │   └── tree_fs.py
    │   └── wrapper
    │       ├── __init__.py
    │       ├── decision_tree_backward.py
    │       ├── decision_tree_forward.py
    │       ├── svm_backward.py
    │       └── svm_forward.py
    └── utility
        ├── __init__.py
        ├── construct_W.py
        ├── data_discretization.py
        ├── entropy_estimators.py
        ├── mutual_information.py
        ├── sparse_learning.py
        └── unsupervised_evaluation.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: skfeature
3 | Version: 1.0.0
4 | Summary: Feature Selection Repository in Python (DMML Lab@ASU)
5 | Home-page: https://github.com/jundongl/scikit-feature
6 | Author: Jundong Li, Kewei Cheng, Suhang Wang
7 | Author-email: jundong.li@asu.edu, kcheng18@asu.edu, suhang.wang@asu.edu
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Keywords: Feature Selection Repository
11 | Platform: UNKNOWN
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | scikit-feature
2 | ===============================
3 | Feature selection repository scikit-feature in Python.
4 |
5 | scikit-feature is an open-source feature selection repository in Python developed by the Data Mining and Machine Learning Lab at Arizona State University. It is built upon the widely used machine learning package scikit-learn and two scientific computing packages, NumPy and SciPy. scikit-feature contains around 40 popular feature selection algorithms, including traditional feature selection algorithms as well as structural and streaming feature selection algorithms.
6 |
7 | It serves as a platform for facilitating feature selection application, research, and comparative study. It is designed to share widely used algorithms from the feature selection literature and to make it convenient for researchers and practitioners to perform empirical evaluations when developing new feature selection algorithms.
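A minimal sketch of the typical supervised workflow (adapted from the bundled `skfeature/example/test_fisher_score.py` shown later in this listing): load one of the packaged `.mat` datasets, rank features on the training split, and evaluate a classifier on the top-ranked features. The sketch assumes Python 3 and scikit-learn >= 0.20, where `KFold` lives in `sklearn.model_selection`; the shipped example scripts instead use the older `sklearn.cross_validation` module and Python 2 `print` statements. The dataset path assumes the script is run from the repository root.

```python
# Minimal sketch (assumptions: Python 3, scikit-learn >= 0.20, run from the repo root).
# Adapted from skfeature/example/test_fisher_score.py.
import scipy.io
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from skfeature.function.similarity_based import fisher_score

mat = scipy.io.loadmat('skfeature/data/COIL20.mat')
X = mat['X'].astype(float)   # data matrix, shape (n_samples, n_features)
y = mat['Y'][:, 0]           # class labels
num_fea = 100                # number of features to keep

correct = 0.0
for train, test in KFold(n_splits=10, shuffle=True).split(X):
    # score and rank features using the training split only
    score = fisher_score.fisher_score(X[train], y[train])
    idx = fisher_score.feature_ranking(score)
    selected = X[:, idx[:num_fea]]

    # evaluate a linear SVM on the selected features
    clf = LinearSVC()
    clf.fit(selected[train], y[train])
    correct += accuracy_score(y[test], clf.predict(selected[test]))

print('Average accuracy over 10 folds:', correct / 10)
```

The unsupervised examples in the repository (`test_lap_score.py`, `test_MCFS.py`, `test_NDFS.py`, and so on) follow the same pattern but rank features without labels and evaluate the selection with k-means clustering instead of a classifier.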
8 |
9 | ## Installing scikit-feature
10 | ### Prerequisites:
11 | Python 2.7 *and Python 3*
12 |
13 | NumPy
14 |
15 | SciPy
16 |
17 | Scikit-learn
18 |
19 | ### Steps:
20 | For Linux users, you can install the repository with the following command:
21 |
22 |     python setup.py install
23 |
24 | For Windows users, you can install the repository with the following command:
25 |
26 |     setup.py install
27 |
28 | ## Project website
29 | Instructions for using this repository can be found on our project webpage at http://featureselection.asu.edu/
30 |
31 | ## Citation
32 |
33 | If you find the scikit-feature feature selection repository useful in your research, please consider citing the following paper:
34 |
35 |     @article{li2018feature,
36 |       title={Feature selection: A data perspective},
37 |       author={Li, Jundong and Cheng, Kewei and Wang, Suhang and Morstatter, Fred and Trevino, Robert P and Tang, Jiliang and Liu, Huan},
38 |       journal={ACM Computing Surveys (CSUR)},
39 |       volume={50},
40 |       number={6},
41 |       pages={94},
42 |       year={2018},
43 |       publisher={ACM}
44 |     }
45 |
46 | ## Contact
47 | Jundong Li
48 | E-mail: jundong@virginia.edu
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 |
3 | NAME = "skfeature"
4 |
5 | DESCRIPTION = "Feature Selection Repository in Python (DMML Lab@ASU)"
6 |
7 | KEYWORDS = "Feature Selection Repository"
8 |
9 | AUTHOR = "Jundong Li, Kewei Cheng, Suhang Wang"
10 |
11 | AUTHOR_EMAIL = "jundong.li@asu.edu, kcheng18@asu.edu, suhang.wang@asu.edu"
12 |
13 | URL = "https://github.com/jundongl/scikit-feature"
14 |
15 | VERSION = "1.0.0"
16 |
17 |
18 | setup(
19 |     name = NAME,
20 |     version = VERSION,
21 |     description = DESCRIPTION,
22 |     keywords = KEYWORDS,
23 |     author = AUTHOR,
24 |     author_email = AUTHOR_EMAIL,
25 |     url = URL,
26 |     packages =['skfeature', 'skfeature.utility','skfeature.function','skfeature.function.information_theoretical_based','skfeature.function.similarity_based','skfeature.function.sparse_learning_based','skfeature.function.statistical_based','skfeature.function.streaming','skfeature.function.structure','skfeature.function.wrapper',] ,
27 | )
--------------------------------------------------------------------------------
/skfeature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/__init__.py
--------------------------------------------------------------------------------
/skfeature/data/ALLAML.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/ALLAML.mat
--------------------------------------------------------------------------------
/skfeature/data/BASEHOCK.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/BASEHOCK.mat
--------------------------------------------------------------------------------
/skfeature/data/CLL-SUB-111.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/CLL-SUB-111.mat
-------------------------------------------------------------------------------- /skfeature/data/COIL20.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/COIL20.mat -------------------------------------------------------------------------------- /skfeature/data/Carcinom.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Carcinom.mat -------------------------------------------------------------------------------- /skfeature/data/GLI-85.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/GLI-85.mat -------------------------------------------------------------------------------- /skfeature/data/GLIOMA.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/GLIOMA.mat -------------------------------------------------------------------------------- /skfeature/data/Isolet.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Isolet.mat -------------------------------------------------------------------------------- /skfeature/data/ORL.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/ORL.mat -------------------------------------------------------------------------------- /skfeature/data/PCMAC.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/PCMAC.mat -------------------------------------------------------------------------------- /skfeature/data/Prostate-GE.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Prostate-GE.mat -------------------------------------------------------------------------------- /skfeature/data/RELATHE.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/RELATHE.mat -------------------------------------------------------------------------------- /skfeature/data/SMK-CAN-187.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/SMK-CAN-187.mat -------------------------------------------------------------------------------- /skfeature/data/TOX-171.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/TOX-171.mat 
-------------------------------------------------------------------------------- /skfeature/data/USPS.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/USPS.mat -------------------------------------------------------------------------------- /skfeature/data/Yale.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Yale.mat -------------------------------------------------------------------------------- /skfeature/data/arcene.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/arcene.mat -------------------------------------------------------------------------------- /skfeature/data/colon.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/colon.mat -------------------------------------------------------------------------------- /skfeature/data/gisette.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/gisette.mat -------------------------------------------------------------------------------- /skfeature/data/leukemia.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/leukemia.mat -------------------------------------------------------------------------------- /skfeature/data/lung.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lung.mat -------------------------------------------------------------------------------- /skfeature/data/lung_small.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lung_small.mat -------------------------------------------------------------------------------- /skfeature/data/lymphoma.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lymphoma.mat -------------------------------------------------------------------------------- /skfeature/data/madelon.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/madelon.mat -------------------------------------------------------------------------------- /skfeature/data/nci9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/nci9.mat -------------------------------------------------------------------------------- 
/skfeature/data/orlraws10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/orlraws10P.mat -------------------------------------------------------------------------------- /skfeature/data/pixraw10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/pixraw10P.mat -------------------------------------------------------------------------------- /skfeature/data/warpAR10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/warpAR10P.mat -------------------------------------------------------------------------------- /skfeature/data/warpPIE10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/warpPIE10P.mat -------------------------------------------------------------------------------- /skfeature/example/test_CFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.statistical_based import CFS 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features on training set 27 | idx = CFS.cfs(X[train], y[train]) 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() -------------------------------------------------------------------------------- /skfeature/example/test_CIFE.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import CIFE 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and 
number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = CIFE.cife(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_CMIM.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import CMIM 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = CMIM.cmim(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_DISR.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import DISR 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, 
n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = DISR.disr(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_FCBF.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import FCBF 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx = FCBF.fcbf(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_ICAP.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import ICAP 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of 
selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = ICAP.icap(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_JMI.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import JMI 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = JMI.jmi(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_MCFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import MCFS 3 | from skfeature.utility import construct_W 4 | from skfeature.utility import unsupervised_evaluation 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # construct affinity matrix 16 | kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1} 17 | W = construct_W.construct_W(X, **kwargs) 18 | 19 | num_fea = 100 # specify the number of selected features 20 | num_cluster = 20 # specify the number of clusters, it is usually set as the number of classes in the ground truth 21 | 22 | # obtain the 
feature weight matrix 23 | Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=20) 24 | 25 | # sort the feature scores in an ascending order according to the feature scores 26 | idx = MCFS.feature_ranking(Weight) 27 | 28 | # obtain the dataset on the selected features 29 | selected_features = X[:, idx[0:num_fea]] 30 | 31 | # perform kmeans clustering based on the selected features and repeats 20 times 32 | nmi_total = 0 33 | acc_total = 0 34 | for i in range(0, 20): 35 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 36 | nmi_total += nmi 37 | acc_total += acc 38 | 39 | # output the average NMI and average ACC 40 | print 'NMI:', float(nmi_total)/20 41 | print 'ACC:', float(acc_total)/20 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /skfeature/example/test_MIFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MIFS 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx = MIFS.mifs(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | print acc 41 | correct = correct + acc 42 | 43 | # output the average classification accuracy over all 10 folds 44 | print 'Accuracy:', float(correct)/10 45 | 46 | if __name__ == '__main__': 47 | main() -------------------------------------------------------------------------------- /skfeature/example/test_MIM.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MIM 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on 
the training set 27 | idx,_,_ = MIM.mim(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_MRMR.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MRMR 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_NDFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import NDFS 3 | from skfeature.utility import construct_W 4 | from skfeature.utility.sparse_learning import feature_ranking 5 | from skfeature.utility import unsupervised_evaluation 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | 16 | # construct affinity matrix 17 | kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1} 18 | W = construct_W.construct_W(X, **kwargs) 19 | 20 | # obtain the feature weight matrix 21 | Weight = NDFS.ndfs(X, W=W, n_clusters=20) 22 | 23 | # sort the feature scores in an ascending order according to the feature scores 24 | idx = feature_ranking(Weight) 25 | 26 | # perform evaluation on clustering task 27 | num_fea = 100 # number 
of selected features 28 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 29 | 30 | # obtain the dataset on the selected features 31 | selected_features = X[:, idx[0:num_fea]] 32 | 33 | # perform kmeans clustering based on the selected features and repeats 20 times 34 | nmi_total = 0 35 | acc_total = 0 36 | for i in range(0, 20): 37 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 38 | nmi_total += nmi 39 | acc_total += acc 40 | 41 | # output the average NMI and average ACC 42 | print 'NMI:', float(nmi_total)/20 43 | print 'ACC:', float(acc_total)/20 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_RFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.sparse_learning_based import RFS 6 | from skfeature.utility.sparse_learning import construct_label_matrix, feature_ranking 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix(y) 17 | n_samples, n_features = X.shape 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight = RFS.rfs(X[train, :], Y[train, :], gamma=0.1) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | print acc 46 | correct = correct + acc 47 | 48 | # output the average classification accuracy over all 10 folds 49 | print 'Accuracy:', float(correct)/10 50 | 51 | if __name__ == '__main__': 52 | main() -------------------------------------------------------------------------------- /skfeature/example/test_SPEC.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.similarity_based import SPEC 3 | from skfeature.utility import unsupervised_evaluation 4 | 5 | 6 | def main(): 7 | # load data 8 | mat = scipy.io.loadmat('../data/COIL20.mat') 9 | X = mat['X'] # data 10 | X = X.astype(float) 11 | y = mat['Y'] # label 12 | y = y[:, 0] 13 | 14 | # specify the second ranking function which uses all except the 1st eigenvalue 15 | kwargs = {'style': 0} 16 | 17 | # obtain the scores of features 18 | score = SPEC.spec(X, **kwargs) 19 | 20 | # sort the feature scores in an descending order according to the feature scores 21 | idx = SPEC.feature_ranking(score, **kwargs) 22 | 23 | # perform evaluation on clustering task 24 | 
num_fea = 100 # number of selected features 25 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 26 | 27 | # obtain the dataset on the selected features 28 | selected_features = X[:, idx[0:num_fea]] 29 | 30 | # perform kmeans clustering based on the selected features and repeats 20 times 31 | nmi_total = 0 32 | acc_total = 0 33 | for i in range(0, 20): 34 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 35 | nmi_total += nmi 36 | acc_total += acc 37 | 38 | # output the average NMI and average ACC 39 | print 'NMI:', float(nmi_total)/20 40 | print 'ACC:', float(acc_total)/20 41 | 42 | if __name__ == '__main__': 43 | main() -------------------------------------------------------------------------------- /skfeature/example/test_UDFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import UDFS 3 | from skfeature.utility import unsupervised_evaluation 4 | from skfeature.utility.sparse_learning import feature_ranking 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # perform evaluation on clustering task 16 | num_fea = 100 # number of selected features 17 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 18 | 19 | # obtain the feature weight matrix 20 | Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster) 21 | 22 | # sort the feature scores in an ascending order according to the feature scores 23 | idx = feature_ranking(Weight) 24 | 25 | # obtain the dataset on the selected features 26 | selected_features = X[:, idx[0:num_fea]] 27 | 28 | # perform kmeans clustering based on the selected features and repeats 20 times 29 | nmi_total = 0 30 | acc_total = 0 31 | for i in range(0, 20): 32 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 33 | nmi_total += nmi 34 | acc_total += acc 35 | 36 | # output the average NMI and average ACC 37 | print 'NMI:', float(nmi_total)/20 38 | print 'ACC:', float(acc_total)/20 39 | 40 | if __name__ == '__main__': 41 | main() -------------------------------------------------------------------------------- /skfeature/example/test_alpha_investing.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn.metrics import accuracy_score 4 | from skfeature.function.streaming import alpha_investing 5 | from sklearn import svm 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | y = y.astype(float) 16 | n_samples, n_features = X.shape # number of samples and number of features 17 | 18 | # split data into 10 folds 19 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 20 | 21 | # perform evaluation on classification task 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features 27 | idx = alpha_investing.alpha_investing(X[train], y[train], 0.05, 0.05) 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx] 31 | 32 | # train a classification model with the 
selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() -------------------------------------------------------------------------------- /skfeature/example/test_chi_square.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import chi_square 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the chi-square score of each feature 27 | score = chi_square.chi_square(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = chi_square.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_decision_tree_backward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import decision_tree_backward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = decision_tree_backward.decision_tree_backward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the 
training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_decision_tree_forward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import decision_tree_forward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = decision_tree_forward.decision_tree_forward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /skfeature/example/test_f_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import f_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the f-score of each feature 27 | score = f_score.f_score(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = f_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict 
the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_fisher_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import fisher_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the score of each feature on the training set 27 | score = fisher_score.fisher_score(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = fisher_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() 50 | 51 | -------------------------------------------------------------------------------- /skfeature/example/test_gini_index.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn.metrics import accuracy_score 4 | from skfeature.function.statistical_based import gini_index 5 | from sklearn import cross_validation 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the gini_index score of each feature 27 | score = gini_index.gini_index(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = gini_index.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a 
classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() 50 | 51 | -------------------------------------------------------------------------------- /skfeature/example/test_group_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import rand 3 | from skfeature.function.structure import group_fs 4 | 5 | 6 | def main(): 7 | n_samples = 50 # specify the number of samples in the simulated data 8 | n_features = 100 # specify the number of features in the simulated data 9 | 10 | # simulate the dataset 11 | X = np.random.rand(n_samples, n_features) 12 | 13 | # simulate the feature weight 14 | w_orin = rand(n_features, 1, 1).toarray() 15 | w_orin[0:50] = 0 16 | 17 | # obtain the ground truth of the simulated dataset 18 | noise = np.random.rand(n_samples, 1) 19 | y = np.dot(X, w_orin) + 0.01 * noise 20 | y = y[:, 0] 21 | 22 | z1 = 0.1 # specify the regularization parameter of L1 norm 23 | z2 = 0.1 # specify the regularization parameter of L2 norm for the non-overlapping group 24 | 25 | # specify the group structure among features 26 | idx = np.array([[1, 20, np.sqrt(20)], [21, 40, np.sqrt(20)], [41, 50, np.sqrt(10)], 27 | [51, 70, np.sqrt(20)], [71, 100, np.sqrt(30)]]).T 28 | idx = idx.astype(int) 29 | 30 | # perform feature selection and obtain the feature weight of all the features 31 | w, obj, value_gamma = group_fs.group_fs(X, y, z1, z2, idx, verbose=True) 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /skfeature/example/test_lap_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.similarity_based import lap_score 3 | from skfeature.utility import construct_W 4 | from skfeature.utility import unsupervised_evaluation 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # construct affinity matrix 16 | kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1} 17 | W = construct_W.construct_W(X, **kwargs_W) 18 | 19 | # obtain the scores of features 20 | score = lap_score.lap_score(X, W=W) 21 | 22 | # sort the feature scores in an ascending order according to the feature scores 23 | idx = lap_score.feature_ranking(score) 24 | 25 | # perform evaluation on clustering task 26 | num_fea = 100 # number of selected features 27 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # perform kmeans clustering based on the selected features and repeats 20 times 33 | nmi_total = 0 34 | acc_total = 0 35 | for i in range(0, 20): 36 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 37 | nmi_total += 
nmi 38 | acc_total += acc 39 | 40 | # output the average NMI and average ACC 41 | print 'NMI:', float(nmi_total)/20 42 | print 'ACC:', float(acc_total)/20 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_ll_l21.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.utility.sparse_learning import * 6 | from skfeature.function.sparse_learning_based import ll_l21 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix_pan(y) 17 | n_samples, n_features = X.shape # number of samples and number of features 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight, obj, value_gamma = ll_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | correct = correct + acc 46 | 47 | # output the average classification accuracy over all 10 folds 48 | print 'Accuracy:', float(correct)/10 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /skfeature/example/test_low_variance.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.statistical_based import low_variance 3 | from skfeature.utility import unsupervised_evaluation 4 | 5 | 6 | def main(): 7 | # load data 8 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 9 | X = mat['X'] # data 10 | X = X.astype(float) 11 | y = mat['Y'] # label 12 | y = y[:, 0] 13 | 14 | p = 0.1 # specify the threshold p to be 0.1 15 | num_cluster = 2 # specify the number of clusters to be 2 16 | 17 | # perform feature selection and obtain the dataset on the selected features 18 | selected_features = low_variance.low_variance_feature_selection(X, p*(1-p)) 19 | 20 | # perform kmeans clustering based on the selected features and repeats 20 times 21 | nmi_total = 0 22 | acc_total = 0 23 | for i in range(0, 20): 24 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 25 | nmi_total += nmi 26 | acc_total += acc 27 | 28 | # output the average NMI and average ACC 29 | print 'NMI:', float(nmi_total)/20 30 | print 'ACC:', float(acc_total)/20 31 | 32 | if __name__ == '__main__': 33 | main() 
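As an aside, the variance filter used in test_low_variance.py above has a close analogue in scikit-learn itself; the sketch below is illustrative only (it is not part of the repository and uses randomly generated placeholder 0/1 data instead of BASEHOCK.mat) and shows how the same p*(1-p) threshold could be cross-checked with sklearn.feature_selection.VarianceThreshold.

# Illustrative sketch, not part of the repository: the same variance threshold via
# scikit-learn's VarianceThreshold. X here is placeholder Boolean-style data.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = (np.random.rand(200, 50) > 0.5).astype(float)   # placeholder 0/1 feature matrix
p = 0.1                                              # same threshold parameter as in the example above
selector = VarianceThreshold(threshold=p * (1 - p))
X_selected = selector.fit_transform(X)               # analogous to low_variance_feature_selection(X, p*(1-p))
kept = selector.get_support(indices=True)            # indices of the features that survive the filter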
-------------------------------------------------------------------------------- /skfeature/example/test_ls_l21.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.utility.sparse_learning import * 6 | from skfeature.function.sparse_learning_based import ls_l21 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix_pan(y) 17 | n_samples, n_features = X.shape # number of samples and number of features 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight, obj, value_gamma = ls_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | correct = correct + acc 46 | 47 | # output the average classification accuracy over all 10 folds 48 | print 'Accuracy:', float(correct)/10 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /skfeature/example/test_reliefF.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import reliefF 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the score of each feature on the training set 27 | score = reliefF.reliefF(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = reliefF.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test 
data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_svm_backward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import svm_backward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = svm_backward.svm_backward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_svm_forward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import svm_forward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = svm_forward.svm_forward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() 46 | 
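The classification examples above all follow the same pattern: rank features on the training folds, keep the top num_fea, train a linear SVM, and average accuracy over the 10 folds. They target the old sklearn.cross_validation API (removed in scikit-learn 0.20) and Python 2 print statements. The sketch below is illustrative only: it reruns the fisher score example with the current sklearn.model_selection API, and assumes X and y have already been loaded from one of the .mat files as in the scripts above.

# Sketch of the shared 10-fold evaluation loop on a modern scikit-learn stack.
# X, y are assumed to be the data/label arrays loaded as in the example scripts.
from sklearn.model_selection import KFold    # replaces sklearn.cross_validation.KFold
from sklearn import svm
from sklearn.metrics import accuracy_score
from skfeature.function.similarity_based import fisher_score

num_fea = 100                                # number of selected features
clf = svm.LinearSVC()                        # linear SVM
kf = KFold(n_splits=10, shuffle=True)        # n_folds is now called n_splits
correct = 0
for train, test in kf.split(X):              # split() yields the train/test index arrays
    score = fisher_score.fisher_score(X[train], y[train])   # score features on the training folds
    idx = fisher_score.feature_ranking(score)               # rank them in descending order
    selected_features = X[:, idx[0:num_fea]]
    clf.fit(selected_features[train], y[train])
    correct += accuracy_score(y[test], clf.predict(selected_features[test]))
print('Accuracy:', correct / 10)             # print is a function in Python 3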
-------------------------------------------------------------------------------- /skfeature/example/test_t_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import t_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the t-score of each feature 27 | score = t_score.t_score(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = t_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_trace_ratio.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import trace_ratio 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features 27 | idx, feature_score, subset_score = trace_ratio.trace_ratio(X[train], y[train], num_fea, style='fisher') 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | 
main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_tree_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import rand 3 | from skfeature.function.structure import tree_fs 4 | 5 | 6 | def main(): 7 | n_samples = 50 # specify the number of samples in the simulated data 8 | n_features = 100 # specify the number of features in the simulated data 9 | 10 | # simulate the dataset 11 | X = np.random.rand(n_samples, n_features) 12 | 13 | # simulate the feature weight 14 | w_orin = rand(n_features, 1, 1).toarray() 15 | w_orin[0:50] = 0 16 | 17 | # obtain the ground truth of the simulated dataset 18 | noise = np.random.rand(n_samples, 1) 19 | y = np.dot(X, w_orin) + 0.01 * noise 20 | y = y[:, 0] 21 | 22 | 23 | z = 0.01 # specify the regularization parameter of regularization parameter of L2 norm for the non-overlapping group 24 | 25 | # specify the tree structure among features 26 | idx = np.array([[-1, -1, 1], [1, 20, np.sqrt(20)], [21, 40, np.sqrt(20)], [41, 50, np.sqrt(10)], 27 | [51, 70, np.sqrt(20)], [71, 100, np.sqrt(30)], [1, 50, np.sqrt(50)], [51, 100, np.sqrt(50)]]).T 28 | idx = idx.astype(int) 29 | 30 | # perform feature selection and obtain the feature weight of all the features 31 | w, obj, value_gamma = tree_fs.tree_fs(X, y, z, idx, verbose=True) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /skfeature/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/__init__.py -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/CIFE.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def cife(X, y, **kwargs): 5 | """ 6 | This function implements the CIFE feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
30 | """ 31 | 32 | if 'n_selected_features' in kwargs.keys(): 33 | n_selected_features = kwargs['n_selected_features'] 34 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=1, gamma=1, n_selected_features=n_selected_features) 35 | else: 36 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=1, gamma=1) 37 | return F, J_CMI, MIfy 38 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/CMIM.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def cmim(X, y, **kwargs): 5 | """ 6 | This function implements the CMIM feature selection. 7 | The scoring criteria is calculated based on the formula j_cmim=I(f;y)-max_j(I(fj;f)-I(fj;f|y)) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | Input data, guaranteed to be a discrete numpy array 13 | y: {numpy array}, shape (n_samples,) 14 | guaranteed to be a numpy array 15 | kwargs: {dictionary} 16 | n_selected_features: {int} 17 | number of features to select 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | J_CMIM: {numpy array}, shape: (n_features,) 24 | corresponding objective function value of selected features 25 | MIfy: {numpy array}, shape: (n_features,) 26 | corresponding mutual information between selected features and response 27 | 28 | Reference 29 | --------- 30 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 31 | """ 32 | 33 | n_samples, n_features = X.shape 34 | # index of selected features, initialized to be empty 35 | F = [] 36 | # Objective function value for selected features 37 | J_CMIM = [] 38 | # Mutual information between feature and response 39 | MIfy = [] 40 | # indicate whether the user specifies the number of features 41 | is_n_selected_features_specified = False 42 | 43 | if 'n_selected_features' in kwargs.keys(): 44 | n_selected_features = kwargs['n_selected_features'] 45 | is_n_selected_features_specified = True 46 | 47 | # t1 stores I(f;y) for each feature f 48 | t1 = np.zeros(n_features) 49 | 50 | # max stores max(I(fj;f)-I(fj;f|y)) for each feature f 51 | # we assign an extreme small value to max[i] ito make it is smaller than possible value of max(I(fj;f)-I(fj;f|y)) 52 | max = -10000000*np.ones(n_features) 53 | for i in range(n_features): 54 | f = X[:, i] 55 | t1[i] = midd(f, y) 56 | 57 | # make sure that j_cmi is positive at the very beginning 58 | j_cmim = 1 59 | 60 | while True: 61 | if len(F) == 0: 62 | # select the feature whose mutual information is the largest 63 | idx = np.argmax(t1) 64 | F.append(idx) 65 | J_CMIM.append(t1[idx]) 66 | MIfy.append(t1[idx]) 67 | f_select = X[:, idx] 68 | 69 | if is_n_selected_features_specified: 70 | if len(F) == n_selected_features: 71 | break 72 | else: 73 | if j_cmim <= 0: 74 | break 75 | 76 | # we assign an extreme small value to j_cmim to ensure it is smaller than all possible values of j_cmim 77 | j_cmim = -1000000000000 78 | for i in range(n_features): 79 | if i not in F: 80 | f = X[:, i] 81 | t2 = midd(f_select, f) 82 | t3 = cmidd(f_select, f, y) 83 | if t2-t3 > max[i]: 84 | max[i] = t2-t3 85 | # calculate j_cmim for feature i (not in F) 86 | t = t1[i] - max[i] 87 | # record the largest j_cmim and the corresponding feature index 88 | if t > j_cmim: 89 | j_cmim = t 90 | idx = i 91 | F.append(idx) 92 | 
J_CMIM.append(j_cmim) 93 | MIfy.append(t1[idx]) 94 | f_select = X[:, idx] 95 | 96 | return np.array(F), np.array(J_CMIM), np.array(MIfy) -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/DISR.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | from skfeature.utility.mutual_information import conditional_entropy 3 | 4 | 5 | def disr(X, y, **kwargs): 6 | """ 7 | This function implement the DISR feature selection. 8 | The scoring criteria is calculated based on the formula j_disr=sum_j(I(f,fj;y)/H(f,fj,y)) 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data, guaranteed to be a discrete data matrix 14 | y: {numpy array}, shape (n_samples,) 15 | input class labels 16 | 17 | kwargs: {dictionary} 18 | n_selected_features: {int} 19 | number of features to select 20 | 21 | Output 22 | ------ 23 | F: {numpy array}, shape (n_features, ) 24 | index of selected features, F[0] is the most important feature 25 | J_DISR: {numpy array}, shape: (n_features,) 26 | corresponding objective function value of selected features 27 | MIfy: {numpy array}, shape: (n_features,) 28 | corresponding mutual information between selected features and response 29 | 30 | Reference 31 | --------- 32 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 33 | """ 34 | 35 | n_samples, n_features = X.shape 36 | # index of selected features, initialized to be empty 37 | F = [] 38 | # Objective function value for selected features 39 | J_DISR = [] 40 | # Mutual information between feature and response 41 | MIfy = [] 42 | # indicate whether the user specifies the number of features 43 | is_n_selected_features_specified = False 44 | 45 | if 'n_selected_features' in kwargs.keys(): 46 | n_selected_features = kwargs['n_selected_features'] 47 | is_n_selected_features_specified = True 48 | 49 | # sum stores sum_j(I(f,fj;y)/H(f,fj,y)) for each feature f 50 | sum = np.zeros(n_features) 51 | 52 | # make sure that j_cmi is positive at the very beginning 53 | j_disr = 1 54 | 55 | while True: 56 | if len(F) == 0: 57 | # t1 stores I(f;y) for each feature f 58 | t1 = np.zeros(n_features) 59 | for i in range(n_features): 60 | f = X[:, i] 61 | t1[i] = midd(f, y) 62 | # select the feature whose mutual information is the largest 63 | idx = np.argmax(t1) 64 | F.append(idx) 65 | J_DISR.append(t1[idx]) 66 | MIfy.append(t1[idx]) 67 | f_select = X[:, idx] 68 | 69 | if is_n_selected_features_specified is True: 70 | if len(F) == n_selected_features: 71 | break 72 | if is_n_selected_features_specified is not True: 73 | if j_disr <= 0: 74 | break 75 | 76 | # we assign an extreme small value to j_disr to ensure that it is smaller than all possible value of j_disr 77 | j_disr = -1E30 78 | for i in range(n_features): 79 | if i not in F: 80 | f = X[:, i] 81 | t2 = midd(f_select, y) + cmidd(f, y, f_select) 82 | t3 = entropyd(f) + conditional_entropy(f_select, f) + (conditional_entropy(y, f_select) - cmidd(y, f, f_select)) 83 | sum[i] += np.true_divide(t2, t3) 84 | # record the largest j_disr and the corresponding feature index 85 | if sum[i] > j_disr: 86 | j_disr = sum[i] 87 | idx = i 88 | F.append(idx) 89 | J_DISR.append(j_disr) 90 | MIfy.append(t1[idx]) 91 | f_select = X[:, idx] 92 | 93 | return np.array(F), np.array(J_DISR), np.array(MIfy) 94 | 95 | 
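Like the other information-theoretic criteria in this package, disr expects an already-discrete feature matrix and can be called directly. The minimal usage sketch below is illustrative only: X_discrete and y are random placeholders standing in for a discretized dataset (e.g. the output of skfeature.utility.data_discretization or one of the bundled .mat files).

# Usage sketch for disr(); X_discrete and y are placeholder arrays, not real data.
import numpy as np
from skfeature.function.information_theoretical_based import DISR

X_discrete = np.random.randint(0, 3, size=(80, 20))   # 80 samples, 20 discrete features
y = np.random.randint(0, 2, size=80)                   # binary class labels
F, J_DISR, MIfy = DISR.disr(X_discrete, y, n_selected_features=5)
print(F)       # indices of the 5 selected features, F[0] is the most informative one
print(J_DISR)  # objective value j_disr = sum_j(I(f,fj;y)/H(f,fj,y)) recorded at each selection step
print(MIfy)    # mutual information I(f;y) for each selected feature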
-------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/FCBF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.mutual_information import su_calculation 3 | 4 | 5 | def fcbf(X, y, **kwargs): 6 | """ 7 | This function implements Fast Correlation Based Filter algorithm 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be discrete 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | delta: {float} 17 | delta is a threshold parameter, the default value of delta is 0 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | SU: {numpy array}, shape (n_features,) 24 | symmetrical uncertainty of selected features 25 | 26 | Reference 27 | --------- 28 | Yu, Lei and Liu, Huan. "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution." ICML 2003. 29 | """ 30 | 31 | n_samples, n_features = X.shape 32 | if 'delta' in kwargs.keys(): 33 | delta = kwargs['delta'] 34 | else: 35 | # the default value of delta is 0 36 | delta = 0 37 | 38 | # t1[:,0] stores index of features, t1[:,1] stores symmetrical uncertainty of features 39 | t1 = np.zeros((n_features, 2), dtype='object') 40 | for i in range(n_features): 41 | f = X[:, i] 42 | t1[i, 0] = i 43 | t1[i, 1] = su_calculation(f, y) 44 | s_list = t1[t1[:, 1] > delta, :] 45 | # index of selected features, initialized to be empty 46 | F = [] 47 | # Symmetrical uncertainty of selected features 48 | SU = [] 49 | while len(s_list) != 0: 50 | # select the largest su inside s_list 51 | idx = np.argmax(s_list[:, 1]) 52 | # record the index of the feature with the largest su 53 | fp = X[:, s_list[idx, 0]] 54 | np.delete(s_list, idx, 0) 55 | F.append(s_list[idx, 0]) 56 | SU.append(s_list[idx, 1]) 57 | for i in s_list[:, 0]: 58 | fi = X[:, i] 59 | if su_calculation(fp, fi) >= t1[i, 1]: 60 | # construct the mask for feature whose su is larger than su(fp,y) 61 | idx = s_list[:, 0] != i 62 | idx = np.array([idx, idx]) 63 | idx = np.transpose(idx) 64 | # delete the feature by using the mask 65 | s_list = s_list[idx] 66 | length = len(s_list)//2 67 | s_list = s_list.reshape((length, 2)) 68 | return np.array(F, dtype=int), np.array(SU) 69 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/ICAP.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def icap(X, y, **kwargs): 5 | """ 6 | This function implements the ICAP feature selection.
7 | The scoring criteria is calculated based on the formula j_icap = I(f;y) - max_j(0,(I(fj;f)-I(fj;f|y))) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be a discrete data matrix 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | n_selected_features: {int} 17 | number of features to select 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | J_ICAP: {numpy array}, shape: (n_features,) 24 | corresponding objective function value of selected features 25 | MIfy: {numpy array}, shape: (n_features,) 26 | corresponding mutual information between selected features and response 27 | """ 28 | n_samples, n_features = X.shape 29 | # index of selected features, initialized to be empty 30 | F = [] 31 | # Objective function value for selected features 32 | J_ICAP = [] 33 | # Mutual information between feature and response 34 | MIfy = [] 35 | # indicate whether the user specifies the number of features 36 | is_n_selected_features_specified = False 37 | if 'n_selected_features' in kwargs.keys(): 38 | n_selected_features = kwargs['n_selected_features'] 39 | is_n_selected_features_specified = True 40 | 41 | # t1 contains I(f;y) for each feature f 42 | t1 = np.zeros(n_features) 43 | # max contains max_j(0,(I(fj;f)-I(fj;f|y))) for each feature f 44 | max = np.zeros(n_features) 45 | for i in range(n_features): 46 | f = X[:, i] 47 | t1[i] = midd(f, y) 48 | 49 | # make sure that j_cmi is positive at the very beginning 50 | j_icap = 1 51 | 52 | while True: 53 | if len(F) == 0: 54 | # select the feature whose mutual information is the largest 55 | idx = np.argmax(t1) 56 | F.append(idx) 57 | J_ICAP.append(t1[idx]) 58 | MIfy.append(t1[idx]) 59 | f_select = X[:, idx] 60 | 61 | if is_n_selected_features_specified is True: 62 | if len(F) == n_selected_features: 63 | break 64 | if is_n_selected_features_specified is not True: 65 | if j_icap <= 0: 66 | break 67 | 68 | # we assign an extreme small value to j_icap to ensure it is smaller than all possible values of j_icap 69 | j_icap = -1000000000000 70 | for i in range(n_features): 71 | if i not in F: 72 | f = X[:, i] 73 | t2 = midd(f_select, f) 74 | t3 = cmidd(f_select, f, y) 75 | if t2-t3 > max[i]: 76 | max[i] = t2-t3 77 | # calculate j_icap for feature i (not in F) 78 | t = t1[i] - max[i] 79 | # record the largest j_icap and the corresponding feature index 80 | if t > j_icap: 81 | j_icap = t 82 | idx = i 83 | F.append(idx) 84 | J_ICAP.append(j_icap) 85 | MIfy.append(t1[idx]) 86 | f_select = X[:, idx] 87 | 88 | return np.array(F), np.array(J_ICAP), np.array(MIfy) 89 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/JMI.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def jmi(X, y, **kwargs): 5 | """ 6 | This function implements the JMI feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: 
{numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | if 'n_selected_features' in kwargs.keys(): 32 | n_selected_features = kwargs['n_selected_features'] 33 | F, J_CMI, MIfy = LCSI.lcsi(X, y, function_name='JMI', n_selected_features=n_selected_features) 34 | else: 35 | F, J_CMI, MIfy = LCSI.lcsi(X, y, function_name='JMI') 36 | return F, J_CMI, MIfy -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/LCSI.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def lcsi(X, y, **kwargs): 5 | """ 6 | This function implements the basic scoring criteria for linear combination of shannon information term. 7 | The scoring criteria is calculated based on the formula j_cmi=I(f;y)-beta*sum_j(I(fj;f))+gamma*sum(I(fj;f|y)) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be a discrete data matrix 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | Parameters for different feature selection algorithms. 17 | beta: {float} 18 | beta is the parameter in j_cmi=I(f;y)-beta*sum(I(fj;f))+gamma*sum(I(fj;f|y)) 19 | gamma: {float} 20 | gamma is the parameter in j_cmi=I(f;y)-beta*sum(I(fj;f))+gamma*sum(I(fj;f|y)) 21 | function_name: {string} 22 | name of the feature selection function 23 | n_selected_features: {int} 24 | number of features to select 25 | 26 | Output 27 | ------ 28 | F: {numpy array}, shape: (n_features,) 29 | index of selected features, F[0] is the most important feature 30 | J_CMI: {numpy array}, shape: (n_features,) 31 | corresponding objective function value of selected features 32 | MIfy: {numpy array}, shape: (n_features,) 33 | corresponding mutual information between selected features and response 34 | 35 | Reference 36 | --------- 37 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
38 | """ 39 | 40 | n_samples, n_features = X.shape 41 | # index of selected features, initialized to be empty 42 | F = [] 43 | # Objective function value for selected features 44 | J_CMI = [] 45 | # Mutual information between feature and response 46 | MIfy = [] 47 | # indicate whether the user specifies the number of features 48 | is_n_selected_features_specified = False 49 | # initialize the parameters 50 | if 'beta' in kwargs.keys(): 51 | beta = kwargs['beta'] 52 | if 'gamma' in kwargs.keys(): 53 | gamma = kwargs['gamma'] 54 | if 'n_selected_features' in kwargs.keys(): 55 | n_selected_features = kwargs['n_selected_features'] 56 | is_n_selected_features_specified = True 57 | 58 | # select the feature whose j_cmi is the largest 59 | # t1 stores I(f;y) for each feature f 60 | t1 = np.zeros(n_features) 61 | # t2 stores sum_j(I(fj;f)) for each feature f 62 | t2 = np.zeros(n_features) 63 | # t3 stores sum_j(I(fj;f|y)) for each feature f 64 | t3 = np.zeros(n_features) 65 | for i in range(n_features): 66 | f = X[:, i] 67 | t1[i] = midd(f, y) 68 | 69 | # make sure that j_cmi is positive at the very beginning 70 | j_cmi = 1 71 | 72 | while True: 73 | if len(F) == 0: 74 | # select the feature whose mutual information is the largest 75 | idx = np.argmax(t1) 76 | F.append(idx) 77 | J_CMI.append(t1[idx]) 78 | MIfy.append(t1[idx]) 79 | f_select = X[:, idx] 80 | 81 | if is_n_selected_features_specified: 82 | if len(F) == n_selected_features: 83 | break 84 | else: 85 | if j_cmi < 0: 86 | break 87 | 88 | # we assign an extreme small value to j_cmi to ensure it is smaller than all possible values of j_cmi 89 | j_cmi = -1E30 90 | if 'function_name' in kwargs.keys(): 91 | if kwargs['function_name'] == 'MRMR': 92 | beta = 1.0 / len(F) 93 | elif kwargs['function_name'] == 'JMI': 94 | beta = 1.0 / len(F) 95 | gamma = 1.0 / len(F) 96 | for i in range(n_features): 97 | if i not in F: 98 | f = X[:, i] 99 | t2[i] += midd(f_select, f) 100 | t3[i] += cmidd(f_select, f, y) 101 | # calculate j_cmi for feature i (not in F) 102 | t = t1[i] - beta*t2[i] + gamma*t3[i] 103 | # record the largest j_cmi and the corresponding feature index 104 | if t > j_cmi: 105 | j_cmi = t 106 | idx = i 107 | F.append(idx) 108 | J_CMI.append(j_cmi) 109 | MIfy.append(t1[idx]) 110 | f_select = X[:, idx] 111 | 112 | return np.array(F), np.array(J_CMI), np.array(MIfy) 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MIFS.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mifs(X, y, **kwargs): 5 | """ 6 | This function implements the MIFS feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. 
"Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | 32 | if 'beta' not in kwargs.keys(): 33 | beta = 0.5 34 | else: 35 | beta = kwargs['beta'] 36 | if 'n_selected_features' in kwargs.keys(): 37 | n_selected_features = kwargs['n_selected_features'] 38 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0, n_selected_features=n_selected_features) 39 | else: 40 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0) 41 | return F, J_CMI, MIfy 42 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MIM.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mim(X, y, **kwargs): 5 | """ 6 | This function implements the MIM feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features, ) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | 32 | if 'n_selected_features' in kwargs.keys(): 33 | n_selected_features = kwargs['n_selected_features'] 34 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=0, gamma=0, n_selected_features=n_selected_features) 35 | else: 36 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=0, gamma=0) 37 | return F, J_CMI, MIfy 38 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MRMR.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mrmr(X, y, **kwargs): 5 | """ 6 | This function implements the MRMR feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
30 | """ 31 | if 'n_selected_features' in kwargs.keys(): 32 | n_selected_features = kwargs['n_selected_features'] 33 | F, J_CMI, MIfy = LCSI.lcsi(X, y, gamma=0, function_name='MRMR', n_selected_features=n_selected_features) 34 | else: 35 | F, J_CMI, MIfy = LCSI.lcsi(X, y, gamma=0, function_name='MRMR') 36 | return F, J_CMI, MIfy -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/information_theoretical_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/similarity_based/SPEC.py: -------------------------------------------------------------------------------- 1 | import numpy.matlib 2 | import numpy as np 3 | from scipy.sparse import * 4 | from sklearn.metrics.pairwise import rbf_kernel 5 | from numpy import linalg as LA 6 | 7 | 8 | def spec(X, **kwargs): 9 | """ 10 | This function implements the SPEC feature selection 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | kwargs: {dictionary} 17 | style: {int} 18 | style == -1, the first feature ranking function, use all eigenvalues 19 | style == 0, the second feature ranking function, use all except the 1st eigenvalue 20 | style >= 2, the third feature ranking function, use the first k except 1st eigenvalue 21 | W: {sparse matrix}, shape (n_samples, n_samples} 22 | input affinity matrix 23 | 24 | Output 25 | ------ 26 | w_fea: {numpy array}, shape (n_features,) 27 | SPEC feature score for each feature 28 | 29 | Reference 30 | --------- 31 | Zhao, Zheng and Liu, Huan. "Spectral Feature Selection for Supervised and Unsupervised Learning." ICML 2007. 
32 | """ 33 | 34 | if 'style' not in kwargs: 35 | kwargs['style'] = 0 36 | if 'W' not in kwargs: 37 | kwargs['W'] = rbf_kernel(X, gamma=1) 38 | 39 | style = kwargs['style'] 40 | W = kwargs['W'] 41 | if type(W) is numpy.ndarray: 42 | W = csc_matrix(W) 43 | 44 | n_samples, n_features = X.shape 45 | 46 | # build the degree matrix 47 | X_sum = np.array(W.sum(axis=1)) 48 | D = np.zeros((n_samples, n_samples)) 49 | for i in range(n_samples): 50 | D[i, i] = X_sum[i] 51 | 52 | # build the laplacian matrix 53 | L = D - W 54 | d1 = np.power(np.array(W.sum(axis=1)), -0.5) 55 | d1[np.isinf(d1)] = 0 56 | d2 = np.power(np.array(W.sum(axis=1)), 0.5) 57 | v = np.dot(np.diag(d2[:, 0]), np.ones(n_samples)) 58 | v = v/LA.norm(v) 59 | 60 | # build the normalized laplacian matrix 61 | L_hat = (np.matlib.repmat(d1, 1, n_samples)) * np.array(L) * np.matlib.repmat(np.transpose(d1), n_samples, 1) 62 | 63 | # calculate and construct spectral information 64 | s, U = np.linalg.eigh(L_hat) 65 | s = np.flipud(s) 66 | U = np.fliplr(U) 67 | 68 | # begin to select features 69 | w_fea = np.ones(n_features)*1000 70 | 71 | for i in range(n_features): 72 | f = X[:, i] 73 | F_hat = np.dot(np.diag(d2[:, 0]), f) 74 | l = LA.norm(F_hat) 75 | if l < 100*np.spacing(1): 76 | w_fea[i] = 1000 77 | continue 78 | else: 79 | F_hat = F_hat/l 80 | a = np.array(np.dot(np.transpose(F_hat), U)) 81 | a = np.multiply(a, a) 82 | a = np.transpose(a) 83 | 84 | # use f'Lf formulation 85 | if style == -1: 86 | w_fea[i] = np.sum(a * s) 87 | # using all eigenvalues except the 1st 88 | elif style == 0: 89 | a1 = a[0:n_samples-1] 90 | w_fea[i] = np.sum(a1 * s[0:n_samples-1])/(1-np.power(np.dot(np.transpose(F_hat), v), 2)) 91 | # use first k except the 1st 92 | else: 93 | a1 = a[n_samples-style:n_samples-1] 94 | w_fea[i] = np.sum(a1 * (2-s[n_samples-style: n_samples-1])) 95 | 96 | if style != -1 and style != 0: 97 | w_fea[w_fea == 1000] = -1000 98 | 99 | return w_fea 100 | 101 | 102 | def feature_ranking(score, **kwargs): 103 | if 'style' not in kwargs: 104 | kwargs['style'] = 0 105 | style = kwargs['style'] 106 | 107 | # if style = -1 or 0, ranking features in descending order, the higher the score, the more important the feature is 108 | if style == -1 or style == 0: 109 | idx = np.argsort(score, 0) 110 | return idx[::-1] 111 | # if style != -1 and 0, ranking features in ascending order, the lower the score, the more important the feature is 112 | elif style != -1 and style != 0: 113 | idx = np.argsort(score, 0) 114 | return idx -------------------------------------------------------------------------------- /skfeature/function/similarity_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/similarity_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/similarity_based/fisher_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from skfeature.utility.construct_W import construct_W 4 | 5 | 6 | def fisher_score(X, y): 7 | """ 8 | This function implements the fisher score feature selection, steps are as follows: 9 | 1. Construct the affinity matrix W in fisher score way 10 | 2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W 11 | 3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones) 12 | 4. 
Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | y: {numpy array}, shape (n_samples,) 19 | input class labels 20 | 21 | Output 22 | ------ 23 | score: {numpy array}, shape (n_features,) 24 | fisher score for each feature 25 | 26 | Reference 27 | --------- 28 | He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005. 29 | Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012. 30 | """ 31 | 32 | # Construct weight matrix W in a fisherScore way 33 | kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y} 34 | W = construct_W(X, **kwargs) 35 | 36 | # build the diagonal D matrix from affinity matrix W 37 | D = np.array(W.sum(axis=1)) 38 | L = W 39 | tmp = np.dot(np.transpose(D), X) 40 | D = diags(np.transpose(D), [0]) 41 | Xt = np.transpose(X) 42 | t1 = np.transpose(np.dot(Xt, D.todense())) 43 | t2 = np.transpose(np.dot(Xt, L.todense())) 44 | # compute the numerator of Lr 45 | D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum() 46 | # compute the denominator of Lr 47 | L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum() 48 | # avoid the denominator of Lr to be 0 49 | D_prime[D_prime < 1e-12] = 10000 50 | lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :] 51 | 52 | # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1 53 | score = 1.0/lap_score - 1 54 | return np.transpose(score) 55 | 56 | 57 | def feature_ranking(score): 58 | """ 59 | Rank features in descending order according to fisher score, the larger the fisher score, the more important the 60 | feature is 61 | """ 62 | idx = np.argsort(score, 0) 63 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/similarity_based/lap_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from skfeature.utility.construct_W import construct_W 4 | 5 | 6 | def lap_score(X, **kwargs): 7 | """ 8 | This function implements the laplacian score feature selection, steps are as follows: 9 | 1. Construct the affinity matrix W if it is not specified 10 | 2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W 11 | 3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones) 12 | 4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat) 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | kwargs: {dictionary} 19 | W: {sparse matrix}, shape (n_samples, n_samples) 20 | input affinity matrix 21 | 22 | Output 23 | ------ 24 | score: {numpy array}, shape (n_features,) 25 | laplacian score for each feature 26 | 27 | Reference 28 | --------- 29 | He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005. 
30 | """ 31 | 32 | # if 'W' is not specified, use the default W 33 | if 'W' not in kwargs.keys(): 34 | W = construct_W(X) 35 | # construct the affinity matrix W 36 | W = kwargs['W'] 37 | # build the diagonal D matrix from affinity matrix W 38 | D = np.array(W.sum(axis=1)) 39 | L = W 40 | tmp = np.dot(np.transpose(D), X) 41 | D = diags(np.transpose(D), [0]) 42 | Xt = np.transpose(X) 43 | t1 = np.transpose(np.dot(Xt, D.todense())) 44 | t2 = np.transpose(np.dot(Xt, L.todense())) 45 | # compute the numerator of Lr 46 | D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum() 47 | # compute the denominator of Lr 48 | L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum() 49 | # avoid the denominator of Lr to be 0 50 | D_prime[D_prime < 1e-12] = 10000 51 | 52 | # compute laplacian score for all features 53 | score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :] 54 | return np.transpose(score) 55 | 56 | 57 | def feature_ranking(score): 58 | """ 59 | Rank features in ascending order according to their laplacian scores, the smaller the laplacian score is, the more 60 | important the feature is 61 | """ 62 | idx = np.argsort(score, 0) 63 | return idx 64 | -------------------------------------------------------------------------------- /skfeature/function/similarity_based/reliefF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import pairwise_distances 3 | 4 | 5 | def reliefF(X, y, **kwargs): 6 | """ 7 | This function implements the reliefF feature selection 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | parameters of reliefF: 17 | k: {int} 18 | choices for the number of neighbors (default k = 5) 19 | 20 | Output 21 | ------ 22 | score: {numpy array}, shape (n_features,) 23 | reliefF score for each feature 24 | 25 | Reference 26 | --------- 27 | Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003. 28 | Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013. 
29 | """ 30 | 31 | if "k" not in kwargs.keys(): 32 | k = 5 33 | else: 34 | k = kwargs["k"] 35 | n_samples, n_features = X.shape 36 | 37 | # calculate pairwise distances between instances 38 | distance = pairwise_distances(X, metric='manhattan') 39 | 40 | score = np.zeros(n_features) 41 | 42 | # the number of sampled instances is equal to the number of total instances 43 | for idx in range(n_samples): 44 | near_hit = [] 45 | near_miss = dict() 46 | 47 | self_fea = X[idx, :] 48 | c = np.unique(y).tolist() 49 | 50 | stop_dict = dict() 51 | for label in c: 52 | stop_dict[label] = 0 53 | del c[c.index(y[idx])] 54 | 55 | p_dict = dict() 56 | p_label_idx = float(len(y[y == y[idx]]))/float(n_samples) 57 | 58 | for label in c: 59 | p_label_c = float(len(y[y == label]))/float(n_samples) 60 | p_dict[label] = p_label_c/(1-p_label_idx) 61 | near_miss[label] = [] 62 | 63 | distance_sort = [] 64 | distance[idx, idx] = np.max(distance[idx, :]) 65 | 66 | for i in range(n_samples): 67 | distance_sort.append([distance[idx, i], int(i), y[i]]) 68 | distance_sort.sort(key=lambda x: x[0]) 69 | 70 | for i in range(n_samples): 71 | # find k nearest hit points 72 | if distance_sort[i][2] == y[idx]: 73 | if len(near_hit) < k: 74 | near_hit.append(distance_sort[i][1]) 75 | elif len(near_hit) == k: 76 | stop_dict[y[idx]] = 1 77 | else: 78 | # find k nearest miss points for each label 79 | if len(near_miss[distance_sort[i][2]]) < k: 80 | near_miss[distance_sort[i][2]].append(distance_sort[i][1]) 81 | else: 82 | if len(near_miss[distance_sort[i][2]]) == k: 83 | stop_dict[distance_sort[i][2]] = 1 84 | stop = True 85 | for (key, value) in stop_dict.items(): 86 | if value != 1: 87 | stop = False 88 | if stop: 89 | break 90 | 91 | # update reliefF score 92 | near_hit_term = np.zeros(n_features) 93 | for ele in near_hit: 94 | near_hit_term = np.array(abs(self_fea-X[ele, :]))+np.array(near_hit_term) 95 | 96 | near_miss_term = dict() 97 | for (label, miss_list) in near_miss.items(): 98 | near_miss_term[label] = np.zeros(n_features) 99 | for ele in miss_list: 100 | near_miss_term[label] = np.array(abs(self_fea-X[ele, :]))+np.array(near_miss_term[label]) 101 | score += near_miss_term[label]/(k*p_dict[label]) 102 | score -= near_hit_term/k 103 | return score 104 | 105 | 106 | def feature_ranking(score): 107 | """ 108 | Rank features in descending order according to reliefF score, the higher the reliefF score, the more important the 109 | feature is 110 | """ 111 | idx = np.argsort(score, 0) 112 | return idx[::-1] 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /skfeature/function/similarity_based/trace_ratio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.construct_W import construct_W 3 | 4 | 5 | def trace_ratio(X, y, n_selected_features, **kwargs): 6 | """ 7 | This function implements the trace ratio criterion for feature selection 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | n_selected_features: {int} 16 | number of features to select 17 | kwargs: {dictionary} 18 | style: {string} 19 | style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way 20 | style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way 21 | verbose: {boolean} 22 | True if user want to print 
out the objective function value in each iteration, False if not 23 | 24 | Output 25 | ------ 26 | feature_idx: {numpy array}, shape (n_features,) 27 | the ranked (descending order) feature index based on subset-level score 28 | feature_score: {numpy array}, shape (n_features,) 29 | the feature-level score 30 | subset_score: {float} 31 | the subset-level score 32 | 33 | Reference 34 | --------- 35 | Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008. 36 | """ 37 | 38 | # if 'style' is not specified, use the fisher score way to build the two affinity matrices 39 | if 'style' not in kwargs.keys(): 40 | kwargs['style'] = 'fisher' 41 | # get the way to build affinity matrix, 'fisher' or 'laplacian' 42 | style = kwargs['style'] 43 | n_samples, n_features = X.shape 44 | 45 | # if 'verbose' is not specified, do not output the value of objective function 46 | if 'verbose' not in kwargs: 47 | kwargs['verbose'] = False 48 | verbose = kwargs['verbose'] 49 | 50 | if style == 'fisher': 51 | kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y} 52 | # build within class and between class laplacian matrix L_w and L_b 53 | W_within = construct_W(X, **kwargs_within) 54 | L_within = np.eye(n_samples) - W_within 55 | L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples 56 | L_between = L_within - L_tmp 57 | 58 | if style == 'laplacian': 59 | kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1} 60 | # build within class and between class laplacian matrix L_w and L_b 61 | W_within = construct_W(X, **kwargs_within) 62 | D_within = np.diag(np.array(W_within.sum(1))[:, 0]) 63 | L_within = D_within - W_within 64 | W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within) 65 | D_between = np.diag(np.array(W_between.sum(1))) 66 | L_between = D_between - W_between 67 | 68 | # build X'*L_within*X and X'*L_between*X 69 | L_within = (np.transpose(L_within) + L_within)/2 70 | L_between = (np.transpose(L_between) + L_between)/2 71 | S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X)) 72 | S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X)) 73 | 74 | # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X' 75 | S_within = (np.transpose(S_within) + S_within)/2 76 | # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X' 77 | S_between = (np.transpose(S_between) + S_between)/2 78 | 79 | # take the absolute values of diagonal 80 | s_within = np.absolute(S_within.diagonal()) 81 | s_between = np.absolute(S_between.diagonal()) 82 | s_between[s_between == 0] = 1e-14 # this number is from the authors' code 83 | 84 | # preprocessing 85 | fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1] 86 | k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features]) 87 | s_within = s_within[fs_idx[0:n_selected_features]] 88 | s_between = s_between[fs_idx[0:n_selected_features]] 89 | 90 | # iterate until convergence 91 | count = 0 92 | while True: 93 | score = np.sort(s_between-k*s_within)[::-1] 94 | I = np.argsort(s_between-k*s_within)[::-1] 95 | idx = I[0:n_selected_features] 96 | old_k = k 97 | k = np.sum(s_between[idx])/np.sum(s_within[idx]) 98 | if verbose: 99 | print('obj at iter {0}: {1}'.format(count+1, k)) 100 | count += 1 101 | if abs(k - old_k) < 1e-3: 102 | break 103 | 104 | # get feature index, feature-level score and subset-level score 105 | feature_idx = 
fs_idx[I] 106 | feature_score = score 107 | subset_score = k 108 | 109 | return feature_idx, feature_score, subset_score 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/MCFS.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import numpy as np 3 | from sklearn import linear_model 4 | from skfeature.utility.construct_W import construct_W 5 | 6 | 7 | def mcfs(X, n_selected_features, **kwargs): 8 | """ 9 | This function implements unsupervised feature selection for multi-cluster data. 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | n_selected_features: {int} 16 | number of features to select 17 | kwargs: {dictionary} 18 | W: {sparse matrix}, shape (n_samples, n_samples) 19 | affinity matrix 20 | n_clusters: {int} 21 | number of clusters (default is 5) 22 | 23 | Output 24 | ------ 25 | W: {numpy array}, shape(n_features, n_clusters) 26 | feature weight matrix 27 | 28 | Reference 29 | --------- 30 | Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010. 31 | """ 32 | 33 | # use the default affinity matrix 34 | if 'W' not in kwargs: 35 | W = construct_W(X) 36 | else: 37 | W = kwargs['W'] 38 | # default number of clusters is 5 39 | if 'n_clusters' not in kwargs: 40 | n_clusters = 5 41 | else: 42 | n_clusters = kwargs['n_clusters'] 43 | 44 | # solve the generalized eigen-decomposition problem and get the top K 45 | # eigen-vectors with respect to the smallest eigenvalues 46 | W = W.toarray() 47 | W = (W + W.T) / 2 48 | W_norm = np.diag(np.sqrt(1 / W.sum(1))) 49 | W = np.dot(W_norm, np.dot(W, W_norm)) 50 | WT = W.T 51 | W[W < WT] = WT[W < WT] 52 | eigen_value, ul = scipy.linalg.eigh(a=W) 53 | Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1]) 54 | 55 | # solve K L1-regularized regression problem using LARs algorithm with cardinality constraint being d 56 | n_sample, n_feature = X.shape 57 | W = np.zeros((n_feature, n_clusters)) 58 | for i in range(n_clusters): 59 | clf = linear_model.Lars(n_nonzero_coefs=n_selected_features) 60 | clf.fit(X, Y[:, i]) 61 | W[:, i] = clf.coef_ 62 | return W 63 | 64 | 65 | def feature_ranking(W): 66 | """ 67 | This function computes MCFS score and ranking features according to feature weights matrix W 68 | """ 69 | mcfs_score = W.max(1) 70 | idx = np.argsort(mcfs_score, 0) 71 | idx = idx[::-1] 72 | return idx -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/NDFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import math 4 | import sklearn.cluster 5 | from skfeature.utility.construct_W import construct_W 6 | 7 | 8 | def ndfs(X, **kwargs): 9 | """ 10 | This function implement unsupervised feature selection using nonnegative spectral analysis, i.e., 11 | min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2 12 | s.t. 
F >= 0 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | kwargs: {dictionary} 19 | W: {sparse matrix}, shape {n_samples, n_samples} 20 | affinity matrix 21 | alpha: {float} 22 | Parameter alpha in objective function 23 | beta: {float} 24 | Parameter beta in objective function 25 | gamma: {float} 26 | a very large number used to force F^T F = I 27 | F0: {numpy array}, shape (n_samples, n_clusters) 28 | initialization of the pseudo label matirx F, if not provided 29 | n_clusters: {int} 30 | number of clusters 31 | verbose: {boolean} 32 | True if user want to print out the objective function value in each iteration, false if not 33 | 34 | Output 35 | ------ 36 | W: {numpy array}, shape(n_features, n_clusters) 37 | feature weight matrix 38 | 39 | Reference: 40 | Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012. 41 | """ 42 | 43 | # default gamma is 10e8 44 | if 'gamma' not in kwargs: 45 | gamma = 10e8 46 | else: 47 | gamma = kwargs['gamma'] 48 | # use the default affinity matrix 49 | if 'W' not in kwargs: 50 | W = construct_W(X) 51 | else: 52 | W = kwargs['W'] 53 | if 'alpha' not in kwargs: 54 | alpha = 1 55 | else: 56 | alpha = kwargs['alpha'] 57 | if 'beta' not in kwargs: 58 | beta = 1 59 | else: 60 | beta = kwargs['beta'] 61 | if 'F0' not in kwargs: 62 | if 'n_clusters' not in kwargs: 63 | print >>sys.stderr, "either F0 or n_clusters should be provided" 64 | else: 65 | # initialize F 66 | n_clusters = kwargs['n_clusters'] 67 | F = kmeans_initialization(X, n_clusters) 68 | else: 69 | F = kwargs['F0'] 70 | if 'verbose' not in kwargs: 71 | verbose = False 72 | else: 73 | verbose = kwargs['verbose'] 74 | 75 | n_samples, n_features = X.shape 76 | 77 | # initialize D as identity matrix 78 | D = np.identity(n_features) 79 | I = np.identity(n_samples) 80 | 81 | # build laplacian matrix 82 | L = np.array(W.sum(1))[:, 0] - W 83 | 84 | max_iter = 1000 85 | obj = np.zeros(max_iter) 86 | for iter_step in range(max_iter): 87 | # update W 88 | T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D + 1e-6*np.eye(n_features)) 89 | W = np.dot(np.dot(T, X.transpose()), F) 90 | # update D 91 | temp = np.sqrt((W*W).sum(1)) 92 | temp[temp < 1e-16] = 1e-16 93 | temp = 0.5 / temp 94 | D = np.diag(temp) 95 | # update M 96 | M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose())) 97 | M = (M + M.transpose())/2 98 | # update F 99 | denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F) 100 | temp = np.divide(gamma*F, denominator) 101 | F = F*np.array(temp) 102 | temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16)))) 103 | F = np.dot(F, temp) 104 | 105 | # calculate objective function 106 | obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) + gamma/4*np.linalg.norm(np.dot(F.transpose(), F)-np.identity(n_clusters), 'fro') 107 | if verbose: 108 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 109 | 110 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 111 | break 112 | return W 113 | 114 | 115 | def kmeans_initialization(X, n_clusters): 116 | """ 117 | This function uses kmeans to initialize the pseudo label 118 | 119 | Input 120 | ----- 121 | X: {numpy array}, shape (n_samples, n_features) 122 | input data 123 | n_clusters: {int} 124 | number of clusters 125 | 126 | Output 127 | ------ 128 | Y: {numpy array}, shape (n_samples, n_clusters) 129 | pseudo label matrix 130 | """ 131 | 132 | n_samples, n_features = X.shape 133 | 
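# Note: the KMeans call below passes precompute_distances and n_jobs, both of which were
# removed from sklearn.cluster.KMeans in scikit-learn 1.0; on a recent scikit-learn install
# (an assumption about the environment) those two arguments would likely need to be dropped.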
kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, 134 | tol=0.0001, precompute_distances=True, verbose=0, 135 | random_state=None, copy_x=True, n_jobs=1) 136 | kmeans.fit(X) 137 | labels = kmeans.labels_ 138 | Y = np.zeros((n_samples, n_clusters)) 139 | for row in range(0, n_samples): 140 | Y[row, labels[row]] = 1 141 | T = np.dot(Y.transpose(), Y) 142 | F = np.dot(Y, np.sqrt(np.linalg.inv(T))) 143 | F = F + 0.02*np.ones((n_samples, n_clusters)) 144 | return F 145 | 146 | 147 | def calculate_obj(X, W, F, L, alpha, beta): 148 | """ 149 | This function calculates the objective function of NDFS 150 | """ 151 | # Tr(F^T L F) 152 | T1 = np.trace(np.dot(np.dot(F.transpose(), L), F)) 153 | T2 = np.linalg.norm(np.dot(X, W) - F, 'fro') 154 | T3 = (np.sqrt((W*W).sum(1))).sum() 155 | obj = T1 + alpha*(T2 + beta*T3) 156 | return obj -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/RFS.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import generate_diagonal_matrix 5 | from skfeature.utility.sparse_learning import calculate_l21_norm 6 | 7 | 8 | def rfs(X, Y, **kwargs): 9 | """ 10 | This function implementS efficient and robust feature selection via joint l21-norms minimization 11 | min_W||X^T W - Y||_2,1 + gamma||W||_2,1 12 | 13 | Input 14 | ----- 15 | X: {numpy array}, shape (n_samples, n_features) 16 | input data 17 | Y: {numpy array}, shape (n_samples, n_classes) 18 | input class label matrix, each row is a one-hot-coding class label 19 | kwargs: {dictionary} 20 | gamma: {float} 21 | parameter in RFS 22 | verbose: boolean 23 | True if want to display the objective function value, false if not 24 | 25 | Output 26 | ------ 27 | W: {numpy array}, shape(n_samples, n_features) 28 | feature weight matrix 29 | 30 | Reference 31 | --------- 32 | Nie, Feiping et al. "Efficient and Robust Feature Selection via Joint l2,1-Norms Minimization" NIPS 2010. 
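Example (illustrative sketch; the toy data, the gamma value and the row-norm ranking convention are assumptions, not part of the original interface):
import numpy as np
from skfeature.function.sparse_learning_based.RFS import rfs
X = np.random.rand(20, 10)                    # 20 samples, 10 features
y = np.random.randint(0, 2, 20)               # binary labels
Y = np.eye(2)[y]                              # one-hot label matrix, shape (n_samples, n_classes)
W = rfs(X, Y, gamma=0.1)                      # feature weight matrix, shape (n_features, n_classes)
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]   # larger row norm = more important feature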
33 | """ 34 | 35 | # default gamma is 1 36 | if 'gamma' not in kwargs: 37 | gamma = 1 38 | else: 39 | gamma = kwargs['gamma'] 40 | if 'verbose' not in kwargs: 41 | verbose = False 42 | else: 43 | verbose = kwargs['verbose'] 44 | 45 | n_samples, n_features = X.shape 46 | A = np.zeros((n_samples, n_samples + n_features)) 47 | A[:, 0:n_features] = X 48 | A[:, n_features:n_features+n_samples] = gamma*np.eye(n_samples) 49 | D = np.eye(n_features+n_samples) 50 | 51 | max_iter = 1000 52 | obj = np.zeros(max_iter) 53 | for iter_step in range(max_iter): 54 | # update U as U = D^{-1} A^T (A D^-1 A^T)^-1 Y 55 | D_inv = LA.inv(D) 56 | temp = LA.inv(np.dot(np.dot(A, D_inv), A.T) + 1e-6*np.eye(n_samples)) # (A D^-1 A^T)^-1 57 | U = np.dot(np.dot(np.dot(D_inv, A.T), temp), Y) 58 | # update D as D_ii = 1 / 2 / ||U(i,:)|| 59 | D = generate_diagonal_matrix(U) 60 | 61 | obj[iter_step] = calculate_obj(X, Y, U[0:n_features, :], gamma) 62 | 63 | if verbose: 64 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 65 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 66 | break 67 | 68 | # the first d rows of U are the feature weights 69 | W = U[0:n_features, :] 70 | return W 71 | 72 | 73 | def calculate_obj(X, Y, W, gamma): 74 | """ 75 | This function calculates the objective function of rfs 76 | """ 77 | temp = np.dot(X, W) - Y 78 | return calculate_l21_norm(temp) + gamma*calculate_l21_norm(W) -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/UDFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import math 4 | from skfeature.utility.sparse_learning import generate_diagonal_matrix, calculate_l21_norm 5 | from sklearn.metrics.pairwise import pairwise_distances 6 | 7 | 8 | def udfs(X, **kwargs): 9 | """ 10 | This function implements l2,1-norm regularized discriminative feature 11 | selection for unsupervised learning, i.e., min_W Tr(W^T M W) + gamma ||W||_{2,1}, s.t. W^T W = I 12 | 13 | Input 14 | ----- 15 | X: {numpy array}, shape (n_samples, n_features) 16 | input data 17 | kwargs: {dictionary} 18 | gamma: {float} 19 | parameter in the objective function of UDFS (default is 1) 20 | n_clusters: {int} 21 | Number of clusters 22 | k: {int} 23 | number of nearest neighbor 24 | verbose: {boolean} 25 | True if want to display the objective function value, false if not 26 | 27 | Output 28 | ------ 29 | W: {numpy array}, shape(n_features, n_clusters) 30 | feature weight matrix 31 | 32 | Reference 33 | Yang, Yi et al. "l2,1-Norm Regularized Discriminative Feature Selection for Unsupervised Learning." AAAI 2012. 
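Example (illustrative sketch; the toy data and the row-norm ranking convention are assumptions):
import numpy as np
from skfeature.function.sparse_learning_based.UDFS import udfs
X = np.random.rand(30, 8)                     # 30 unlabeled samples, 8 features
W = udfs(X, gamma=0.1, k=5, n_clusters=3)     # feature weight matrix, shape (n_features, n_clusters)
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]   # larger row norm = more important feature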
34 | """ 35 | 36 | # default gamma is 0.1 37 | if 'gamma' not in kwargs: 38 | gamma = 0.1 39 | else: 40 | gamma = kwargs['gamma'] 41 | # default k is set to be 5 42 | if 'k' not in kwargs: 43 | k = 5 44 | else: 45 | k = kwargs['k'] 46 | if 'n_clusters' not in kwargs: 47 | n_clusters = 5 48 | else: 49 | n_clusters = kwargs['n_clusters'] 50 | if 'verbose' not in kwargs: 51 | verbose = False 52 | else: 53 | verbose = kwargs['verbose'] 54 | 55 | # construct M 56 | n_sample, n_feature = X.shape 57 | M = construct_M(X, k, gamma) 58 | 59 | D = np.eye(n_feature) 60 | max_iter = 1000 61 | obj = np.zeros(max_iter) 62 | for iter_step in range(max_iter): 63 | # update W as the eigenvectors of P corresponding to the first n_clusters 64 | # smallest eigenvalues 65 | P = M + gamma*D 66 | eigen_value, eigen_vector = scipy.linalg.eigh(a=P) 67 | W = eigen_vector[:, 0:n_clusters] 68 | # update D as D_ii = 1 / 2 / ||W(i,:)|| 69 | D = generate_diagonal_matrix(W) 70 | 71 | obj[iter_step] = calculate_obj(X, W, M, gamma) 72 | if verbose: 73 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 74 | 75 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 76 | break 77 | return W 78 | 79 | 80 | def construct_M(X, k, gamma): 81 | """ 82 | This function constructs the M matrix described in the paper 83 | """ 84 | n_sample, n_feature = X.shape 85 | Xt = X.T 86 | D = pairwise_distances(X) 87 | # sort the distance matrix D in ascending order 88 | idx = np.argsort(D, axis=1) 89 | # choose the k-nearest neighbors for each instance 90 | idx_new = idx[:, 0:k+1] 91 | H = np.eye(k+1) - 1/(k+1) * np.ones((k+1, k+1)) 92 | I = np.eye(k+1) 93 | Mi = np.zeros((n_sample, n_sample)) 94 | for i in range(n_sample): 95 | Xi = Xt[:, idx_new[i, :]] 96 | Xi_tilde =np.dot(Xi, H) 97 | Bi = np.linalg.inv(np.dot(Xi_tilde.T, Xi_tilde) + gamma*I) 98 | Si = np.zeros((n_sample, k+1)) 99 | for q in range(k+1): 100 | Si[idx_new[q], q] = 1 101 | Mi = Mi + np.dot(np.dot(Si, np.dot(np.dot(H, Bi), H)), Si.T) 102 | M = np.dot(np.dot(X.T, Mi), X) 103 | return M 104 | 105 | 106 | def calculate_obj(X, W, M, gamma): 107 | """ 108 | This function calculates the objective function of ls_l21 described in the paper 109 | """ 110 | return np.trace(np.dot(np.dot(W.T, M), W)) + gamma*calculate_l21_norm(W) -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/sparse_learning_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/ll_l21.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import euclidean_projection, calculate_l21_norm 5 | 6 | 7 | def proximal_gradient_descent(X, Y, z, **kwargs): 8 | """ 9 | This function implements supervised sparse feature selection via l2,1 norm, i.e., 10 | min_{W} sum_{i}log(1+exp(-yi*(W'*x+C))) + z*||W||_{2,1} 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | Y: {numpy array}, shape (n_samples, n_classes) 17 | input class labels, each row is a one-hot-coding class label, guaranteed to be a numpy array 18 | z: {float} 19 | 
regularization parameter 20 | kwargs: {dictionary} 21 | verbose: {boolean} 22 | True if user want to print out the objective function value in each iteration, false if not 23 | 24 | Output 25 | ------ 26 | W: {numpy array}, shape (n_features, n_classes) 27 | weight matrix 28 | obj: {numpy array}, shape (n_iterations,) 29 | objective function value during iterations 30 | value_gamma: {numpy array}, shape (n_iterations,s) 31 | suitable step size during iterations 32 | 33 | 34 | Reference: 35 | Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009. 36 | """ 37 | 38 | if 'verbose' not in kwargs: 39 | verbose = False 40 | else: 41 | verbose = kwargs['verbose'] 42 | 43 | # Starting point initialization # 44 | n_samples, n_features = X.shape 45 | n_samples, n_classes = Y.shape 46 | 47 | # the indices of positive samples 48 | p_flag = (Y == 1) 49 | # the total number of positive samples 50 | n_positive_samples = np.sum(p_flag, 0) 51 | # the total number of negative samples 52 | n_negative_samples = n_samples - n_positive_samples 53 | n_positive_samples = n_positive_samples.astype(float) 54 | n_negative_samples = n_negative_samples.astype(float) 55 | 56 | # initialize a starting point 57 | W = np.zeros((n_features, n_classes)) 58 | C = np.log(np.divide(n_positive_samples, n_negative_samples)) 59 | 60 | # compute XW = X*W 61 | XW = np.dot(X, W) 62 | 63 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 64 | # the intial guess of the Lipschitz continuous gradient 65 | gamma = 1.0/(n_samples*n_classes) 66 | 67 | # assign Wp with W, and XWp with XW 68 | XWp = XW 69 | WWp =np.zeros((n_features, n_classes)) 70 | CCp = np.zeros((1, n_classes)) 71 | 72 | alphap = 0 73 | alpha = 1 74 | 75 | # indicates whether the gradient step only changes a little 76 | flag = False 77 | 78 | max_iter = 1000 79 | value_gamma = np.zeros(max_iter) 80 | obj = np.zeros(max_iter) 81 | for iter_step in range(max_iter): 82 | # step1: compute search point S based on Wp and W (with beta) 83 | beta = (alphap-1)/alpha 84 | S = W + beta*WWp 85 | SC = C + beta*CCp 86 | 87 | # step2: line search for gamma and compute the new approximation solution W 88 | XS = XW + beta*(XW - XWp) 89 | aa = -np.multiply(Y, XS+np.tile(SC, (n_samples, 1))) 90 | # fun_S is the logistic loss at the search point 91 | bb = np.maximum(aa, 0) 92 | fun_S = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes) 93 | # compute prob = [p_1;p_2;...;p_m] 94 | prob = 1.0/(1+np.exp(aa)) 95 | 96 | b = np.multiply(-Y, (1-prob))/(n_samples*n_classes) 97 | # compute the gradient of C 98 | GC = np.sum(b, 0) 99 | # compute the gradient of W as X'*b 100 | G = np.dot(np.transpose(X), b) 101 | 102 | # copy W and XW to Wp and XWp 103 | Wp = W 104 | XWp = XW 105 | Cp = C 106 | 107 | while True: 108 | # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection 109 | V = S - G/gamma 110 | C = SC - GC/gamma 111 | W = euclidean_projection(V, n_features, n_classes, z, gamma) 112 | 113 | # the difference between the new approximate solution W and the search point S 114 | V = W - S 115 | # compute XW = X*W 116 | XW = np.dot(X, W) 117 | aa = -np.multiply(Y, XW+np.tile(C, (n_samples, 1))) 118 | # fun_W is the logistic loss at the new approximate solution 119 | bb = np.maximum(aa, 0) 120 | fun_W = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes) 121 | 122 | r_sum = (LA.norm(V, 'fro')**2 + LA.norm(C-SC, 2)**2) / 2 123 | l_sum 
= fun_W - fun_S - np.sum(np.multiply(V, G)) - np.inner((C-SC), GC) 124 | 125 | # determine weather the gradient step makes little improvement 126 | if r_sum <= 1e-20: 127 | flag = True 128 | break 129 | 130 | # the condition is fun_W <= fun_S + + + gamma/2 * ( + ) 131 | if l_sum < r_sum*gamma: 132 | break 133 | else: 134 | gamma = max(2*gamma, l_sum/r_sum) 135 | value_gamma[iter_step] = gamma 136 | 137 | # step3: update alpha and alphap, and check weather converge 138 | alphap = alpha 139 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 140 | 141 | WWp = W - Wp 142 | CCp = C - Cp 143 | 144 | # calculate obj 145 | obj[iter_step] = fun_W 146 | obj[iter_step] += z*calculate_l21_norm(W) 147 | 148 | if verbose: 149 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 150 | 151 | if flag is True: 152 | break 153 | 154 | # determine weather converge 155 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 156 | break 157 | return W, obj, value_gamma 158 | -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/ls_l21.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import euclidean_projection, calculate_l21_norm 5 | 6 | 7 | def proximal_gradient_descent(X, Y, z, **kwargs): 8 | """ 9 | This function implements supervised sparse feature selection via l2,1 norm, i.e., 10 | min_{W} ||XW-Y||_F^2 + z*||W||_{2,1} 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data, guaranteed to be a numpy array 16 | Y: {numpy array}, shape (n_samples, n_classes) 17 | input class labels, each row is a one-hot-coding class label 18 | z: {float} 19 | regularization parameter 20 | kwargs: {dictionary} 21 | verbose: {boolean} 22 | True if user want to print out the objective function value in each iteration, false if not 23 | 24 | Output 25 | ------ 26 | W: {numpy array}, shape (n_features, n_classes) 27 | weight matrix 28 | obj: {numpy array}, shape (n_iterations,) 29 | objective function value during iterations 30 | value_gamma: {numpy array}, shape (n_iterations,) 31 | suitable step size during iterations 32 | 33 | Reference 34 | --------- 35 | Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009. 
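Example (illustrative sketch; the toy data and the regularization value z are assumptions):
import numpy as np
from skfeature.function.sparse_learning_based.ls_l21 import proximal_gradient_descent
X = np.random.rand(50, 20)
y = np.random.randint(0, 3, 50)
Y = np.eye(3)[y]                              # one-hot label matrix, shape (n_samples, n_classes)
W, obj, value_gamma = proximal_gradient_descent(X, Y, 0.1)   # z = 0.1
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]        # rank features by row-wise l2 norm of W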
36 | """ 37 | 38 | if 'verbose' not in kwargs: 39 | verbose = False 40 | else: 41 | verbose = kwargs['verbose'] 42 | 43 | # starting point initialization 44 | n_samples, n_features = X.shape 45 | n_samples, n_classes = Y.shape 46 | 47 | # compute X'Y 48 | XtY = np.dot(np.transpose(X), Y) 49 | 50 | # initialize a starting point 51 | W = XtY 52 | 53 | # compute XW = X*W 54 | XW = np.dot(X, W) 55 | 56 | # compute l2,1 norm of W 57 | W_norm = calculate_l21_norm(W) 58 | 59 | if W_norm >= 1e-6: 60 | ratio = init_factor(W_norm, XW, Y, z) 61 | W = ratio*W 62 | XW = ratio*XW 63 | 64 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 65 | # initialize step size gamma = 1 66 | gamma = 1 67 | 68 | # assign Wp with W, and XWp with XW 69 | XWp = XW 70 | WWp =np.zeros((n_features, n_classes)) 71 | alphap = 0 72 | alpha = 1 73 | 74 | # indicate whether the gradient step only changes a little 75 | flag = False 76 | 77 | max_iter = 1000 78 | value_gamma = np.zeros(max_iter) 79 | obj = np.zeros(max_iter) 80 | for iter_step in range(max_iter): 81 | # step1: compute search point S based on Wp and W (with beta) 82 | beta = (alphap-1)/alpha 83 | S = W + beta*WWp 84 | 85 | # step2: line search for gamma and compute the new approximation solution W 86 | XS = XW + beta*(XW - XWp) 87 | # compute X'* XS 88 | XtXS = np.dot(np.transpose(X), XS) 89 | # obtain the gradient g 90 | G = XtXS - XtY 91 | # copy W and XW to Wp and XWp 92 | Wp = W 93 | XWp = XW 94 | 95 | while True: 96 | # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection 97 | V = S - G/gamma 98 | W = euclidean_projection(V, n_features, n_classes, z, gamma) 99 | # the difference between the new approximate solution W and the search point S 100 | V = W - S 101 | # compute XW = X*W 102 | XW = np.dot(X, W) 103 | XV = XW - XS 104 | r_sum = LA.norm(V, 'fro')**2 105 | l_sum = LA.norm(XV, 'fro')**2 106 | 107 | # determine weather the gradient step makes little improvement 108 | if r_sum <= 1e-20: 109 | flag = True 110 | break 111 | 112 | # the condition is ||XV||_2^2 <= gamma * ||V||_2^2 113 | if l_sum < r_sum*gamma: 114 | break 115 | else: 116 | gamma = max(2*gamma, l_sum/r_sum) 117 | value_gamma[iter_step] = gamma 118 | 119 | # step3: update alpha and alphap, and check weather converge 120 | alphap = alpha 121 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 122 | 123 | WWp = W - Wp 124 | XWY = XW -Y 125 | 126 | # calculate obj 127 | obj[iter_step] = LA.norm(XWY, 'fro')**2/2 128 | obj[iter_step] += z*calculate_l21_norm(W) 129 | 130 | if verbose: 131 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 132 | 133 | if flag is True: 134 | break 135 | 136 | # determine weather converge 137 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 138 | break 139 | return W, obj, value_gamma 140 | 141 | 142 | def init_factor(W_norm, XW, Y, z): 143 | """ 144 | Initialize the starting point of W, according to the author's code 145 | """ 146 | n_samples, n_classes = XW.shape 147 | a = np.inner(np.reshape(XW, n_samples*n_classes), np.reshape(Y, n_samples*n_classes)) - z*W_norm 148 | b = LA.norm(XW, 'fro')**2 149 | ratio = a / b 150 | return ratio -------------------------------------------------------------------------------- /skfeature/function/statistical_based/CFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.mutual_information import su_calculation 3 | 
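# Note on merit_calculation below: rcf and rff are accumulated as sums over the k = n_features
# columns rather than as the averages used in the docstring formula; since those sums equal
# k*rcf_avg and k*(k-1)*rff_avg, the expression rcf / sqrt(k + rff) evaluates to the same merit
# value as merits = (k*rcf_avg)/sqrt(k + k*(k-1)*rff_avg).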
4 | 5 | def merit_calculation(X, y): 6 | """ 7 | This function calculates the merit of X given class labels y, where 8 | merits = (k * rcf)/sqrt(k+k*(k-1)*rff) 9 | rcf = (1/k)*sum(su(fi,y)) for all fi in X 10 | rff = (1/(k*(k-1)))*sum(su(fi,fj)) for all fi and fj in X 11 | 12 | Input 13 | ---------- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | y: {numpy array}, shape (n_samples,) 17 | input class labels 18 | 19 | Output 20 | ---------- 21 | merits: {float} 22 | merit of a feature subset X 23 | """ 24 | 25 | n_samples, n_features = X.shape 26 | rff = 0 27 | rcf = 0 28 | for i in range(n_features): 29 | fi = X[:, i] 30 | rcf += su_calculation(fi, y) 31 | for j in range(n_features): 32 | if j > i: 33 | fj = X[:, j] 34 | rff += su_calculation(fi, fj) 35 | rff *= 2 36 | merits = rcf / np.sqrt(n_features + rff) 37 | return merits 38 | 39 | 40 | def cfs(X, y): 41 | """ 42 | This function uses a correlation based heuristic to evaluate the worth of features which is called CFS 43 | 44 | Input 45 | ----- 46 | X: {numpy array}, shape (n_samples, n_features) 47 | input data 48 | y: {numpy array}, shape (n_samples,) 49 | input class labels 50 | 51 | Output 52 | ------ 53 | F: {numpy array} 54 | index of selected features 55 | 56 | Reference 57 | --------- 58 | Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010. 59 | """ 60 | 61 | n_samples, n_features = X.shape 62 | F = [] 63 | # M stores the merit values 64 | M = [] 65 | while True: 66 | merit = -100000000000 67 | idx = -1 68 | for i in range(n_features): 69 | if i not in F: 70 | F.append(i) 71 | # calculate the merit of current selected features 72 | t = merit_calculation(X[:, F], y) 73 | if t > merit: 74 | merit = t 75 | idx = i 76 | F.pop() 77 | F.append(idx) 78 | M.append(merit) 79 | if len(M) > 5: 80 | if M[len(M)-1] <= M[len(M)-2]: 81 | if M[len(M)-2] <= M[len(M)-3]: 82 | if M[len(M)-3] <= M[len(M)-4]: 83 | if M[len(M)-4] <= M[len(M)-5]: 84 | break 85 | return np.array(F) 86 | 87 | -------------------------------------------------------------------------------- /skfeature/function/statistical_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/statistical_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/statistical_based/chi_square.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_selection import chi2 3 | 4 | 5 | def chi_square(X, y): 6 | """ 7 | This function implements the chi-square feature selection (existing method for classification in scikit-learn) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array},shape (n_samples,) 14 | input class labels 15 | 16 | Output 17 | ------ 18 | F: {numpy array}, shape (n_features,) 19 | chi-square score for each feature 20 | """ 21 | F, pval = chi2(X, y) 22 | return F 23 | 24 | 25 | def feature_ranking(F): 26 | """ 27 | Rank features in descending order according to chi2-score, the higher the chi2-score, the more important the feature is 28 | """ 29 | idx = np.argsort(F) 30 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/statistical_based/f_score.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_selection import f_classif 3 | 4 | 5 | def f_score(X, y): 6 | """ 7 | This function implements the anova f_value feature selection (existing method for classification in scikit-learn), 8 | where f_score = sum((ni/(c-1))*(mean_i - mean)^2)/((1/(n - c))*sum((ni-1)*std_i^2)) 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data 14 | y : {numpy array},shape (n_samples,) 15 | input class labels 16 | 17 | Output 18 | ------ 19 | F: {numpy array}, shape (n_features,) 20 | f-score for each feature 21 | """ 22 | 23 | F, pval = f_classif(X, y) 24 | return F 25 | 26 | 27 | def feature_ranking(F): 28 | """ 29 | Rank features in descending order according to f-score, the higher the f-score, the more important the feature is 30 | """ 31 | idx = np.argsort(F) 32 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/statistical_based/gini_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def gini_index(X, y): 5 | """ 6 | This function implements the gini index feature selection. 7 | 8 | Input 9 | ---------- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | 15 | Output 16 | ---------- 17 | gini: {numpy array}, shape (n_features, ) 18 | gini index value of each feature 19 | """ 20 | 21 | n_samples, n_features = X.shape 22 | 23 | # initialize gini_index for all features to be 0.5 24 | gini = np.ones(n_features) * 0.5 25 | 26 | # For i-th feature we define fi = x[:,i] ,v include all unique values in fi 27 | for i in range(n_features): 28 | v = np.unique(X[:, i]) 29 | for j in range(len(v)): 30 | # left_y contains labels of instances whose i-th feature value is less than or equal to v[j] 31 | left_y = y[X[:, i] <= v[j]] 32 | # right_y contains labels of instances whose i-th feature value is larger than v[j] 33 | right_y = y[X[:, i] > v[j]] 34 | 35 | # gini_left is sum of square of probability of occurrence of v[i] in left_y 36 | # gini_right is sum of square of probability of occurrence of v[i] in right_y 37 | gini_left = 0 38 | gini_right = 0 39 | 40 | for k in range(np.min(y), np.max(y)+1): 41 | if len(left_y) != 0: 42 | # t1_left is probability of occurrence of k in left_y 43 | t1_left = np.true_divide(len(left_y[left_y == k]), len(left_y)) 44 | t2_left = np.power(t1_left, 2) 45 | gini_left += t2_left 46 | 47 | if len(right_y) != 0: 48 | # t1_right is probability of occurrence of k in left_y 49 | t1_right = np.true_divide(len(right_y[right_y == k]), len(right_y)) 50 | t2_right = np.power(t1_right, 2) 51 | gini_right += t2_right 52 | 53 | gini_left = 1 - gini_left 54 | gini_right = 1 - gini_right 55 | 56 | # weighted average of len(left_y) and len(right_y) 57 | t1_gini = (len(left_y) * gini_left + len(right_y) * gini_right) 58 | 59 | # compute the gini_index for the i-th feature 60 | value = np.true_divide(t1_gini, len(y)) 61 | 62 | if value < gini[i]: 63 | gini[i] = value 64 | return gini 65 | 66 | 67 | def feature_ranking(W): 68 | """ 69 | Rank features in descending order according to their gini index values, the smaller the gini index, 70 | the more important the feature is 71 | """ 72 | idx = np.argsort(W) 73 | return idx 74 | 75 | 76 | 77 | 78 | 79 | 80 | 
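Usage sketch (illustrative; assumes integer class labels, which the range(np.min(y), np.max(y)+1) loop above requires):
import numpy as np
from skfeature.function.statistical_based.gini_index import gini_index, feature_ranking
X = np.random.rand(40, 6)
y = np.random.randint(0, 3, 40)        # integer labels 0, 1, 2
gini = gini_index(X, y)                # one gini index value per feature
idx = feature_ranking(gini)            # ascending order: smaller gini index = more important feature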
-------------------------------------------------------------------------------- /skfeature/function/statistical_based/low_variance.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import VarianceThreshold 2 | 3 | 4 | def low_variance_feature_selection(X, threshold): 5 | """ 6 | This function implements the low_variance feature selection (existing method in scikit-learn) 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data 12 | p:{float} 13 | parameter used to calculate the threshold(threshold = p*(1-p)) 14 | 15 | Output 16 | ------ 17 | X_new: {numpy array}, shape (n_samples, n_selected_features) 18 | data with selected features 19 | """ 20 | sel = VarianceThreshold(threshold) 21 | return sel.fit_transform(X) -------------------------------------------------------------------------------- /skfeature/function/statistical_based/t_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def t_score(X, y): 5 | """ 6 | This function calculates t_score for each feature, where t_score is only used for binary problem 7 | t_score = |mean1-mean2|/sqrt(((std1^2)/n1)+((std2^2)/n2))) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | 16 | Output 17 | ------ 18 | F: {numpy array}, shape (n_features,) 19 | t-score for each feature 20 | """ 21 | 22 | n_samples, n_features = X.shape 23 | F = np.zeros(n_features) 24 | c = np.unique(y) 25 | if len(c) == 2: 26 | for i in range(n_features): 27 | f = X[:, i] 28 | # class0 contains instances belonging to the first class 29 | # class1 contains instances belonging to the second class 30 | class0 = f[y == c[0]] 31 | class1 = f[y == c[1]] 32 | mean0 = np.mean(class0) 33 | mean1 = np.mean(class1) 34 | std0 = np.std(class0) 35 | std1 = np.std(class1) 36 | n0 = len(class0) 37 | n1 = len(class1) 38 | t = mean0 - mean1 39 | t0 = np.true_divide(std0**2, n0) 40 | t1 = np.true_divide(std1**2, n1) 41 | F[i] = np.true_divide(t, (t0 + t1)**0.5) 42 | else: 43 | print('y should be guaranteed to a binary class vector') 44 | exit(0) 45 | return np.abs(F) 46 | 47 | 48 | def feature_ranking(F): 49 | """ 50 | Rank features in descending order according to t-score, the higher the t-score, the more important the feature is 51 | """ 52 | idx = np.argsort(F) 53 | return idx[::-1] 54 | 55 | -------------------------------------------------------------------------------- /skfeature/function/streaming/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jundongl' 2 | -------------------------------------------------------------------------------- /skfeature/function/streaming/alpha_investing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | 4 | 5 | def alpha_investing(X, y, w0, dw): 6 | """ 7 | This function implements streamwise feature selection (SFS) algorithm alpha_investing for binary regression or 8 | univariate regression 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data, assume feature arrives one at each time step 14 | y: {numpy array}, shape (n_samples,) 15 | input class labels or regression target 16 | 17 | Output 18 | ------ 19 | F: {numpy array}, shape (n_selected_features,) 20 | index of 
selected features in a streamwise way 21 | 22 | Reference 23 | --------- 24 | Zhou, Jing et al. "Streaming Feature Selection using Alpha-investing." KDD 2006. 25 | """ 26 | 27 | n_samples, n_features = X.shape 28 | w = w0 29 | F = [] # selected features 30 | for i in range(n_features): 31 | x_can = X[:, i] # generate next feature 32 | alpha = w/2/(i+1) 33 | X_old = X[:, F] 34 | if i is 0: 35 | X_old = np.ones((n_samples, 1)) 36 | linreg_old = linear_model.LinearRegression() 37 | linreg_old.fit(X_old, y) 38 | error_old = 1 - linreg_old.score(X_old, y) 39 | if i is not 0: 40 | # model built with only X_old 41 | linreg_old = linear_model.LinearRegression() 42 | linreg_old.fit(X_old, y) 43 | error_old = 1 - linreg_old.score(X_old, y) 44 | 45 | # model built with X_old & {x_can} 46 | X_new = np.concatenate((X_old, x_can.reshape(n_samples, 1)), axis=1) 47 | logreg_new = linear_model.LinearRegression() 48 | logreg_new.fit(X_new, y) 49 | error_new = 1 - logreg_new.score(X_new, y) 50 | 51 | # calculate p-value 52 | pval = np.exp((error_new - error_old)/(2*error_old/n_samples)) 53 | if pval < alpha: 54 | F.append(i) 55 | w = w + dw - alpha 56 | else: 57 | w -= alpha 58 | return np.array(F) 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/structure/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /skfeature/function/structure/graph_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def soft_threshold(A,b): 5 | """ 6 | This function implement the soft-threshold operator 7 | Input: 8 | A: {numpy scalar, vector, or matrix} 9 | b: scalar} 10 | """ 11 | res = np.zeros(A.shape) 12 | res[A > b] = A[A > b] - b 13 | res[A < -b] = A[A < -b] + b 14 | return res 15 | 16 | 17 | def calculate_obj(X, y, w, lambda1, lambda2, T): 18 | return 1/2 * (np.linalg.norm(y- np.dot(X, w), 'fro'))**2 + lambda1*np.abs(w).sum() + lambda2*np.abs(np.dot(T, w)).sum() 19 | 20 | 21 | def graph_fs(X, y, **kwargs): 22 | """ 23 | This function implement the graph structural feature selection algorithm GOSCAR 24 | 25 | Objective Function 26 | min_{w} 1/2 ||X*w - y||_F^2 + lambda1 ||w||_1 + lambda2 \sum_{(i,j) \in E} max{|w_i|, |w|_j} 27 | 28 | Input: 29 | X: {numpy array}, shape (n_samples, n_features) 30 | Input data, guaranteed to be a numpy array 31 | y: {numpy array}, shape (n_samples, 1) 32 | Input data, the label matrix 33 | edge_list: {numpy array}, shape (n_edges, 2) 34 | Input data, each row is a pair of linked features, note feature index should start from 0 35 | lambda1: {float} 36 | Parameter lambda1 in objective function 37 | lambda2: {float} 38 | Parameter labmda2 in objective function 39 | rho: {flot} 40 | parameter used for optimization 41 | max_iter: {int} 42 | maximal iteration 43 | verbose: {boolean} True or False 44 | True if we want to print out the objective function value in each iteration, False if not 45 | 46 | Output: 47 | w: the weights of the features 48 | obj: the value of the objective function in each iteration 49 | """ 50 | 51 | if 'lambda1' not in kwargs: 52 | lambda1 = 0.8 53 | else: 54 | lambda1 = kwargs['lambda1'] 55 | if 'lambda2' not in kwargs: 56 | lambda2 = 0.8 57 | else: 58 | lambda2 = kwargs['lambda2'] 59 | if 'edge_list' not in kwargs: 60 | print('Error using function, the network structure E is required') 61 | raise() 62 | else : 
63 | edge_list = kwargs['edge_list'] 64 | if 'max_iter' not in kwargs: 65 | max_iter = 300 66 | else: 67 | max_iter = kwargs['max_iter'] 68 | if 'verbose' not in kwargs: 69 | verbose = 0 70 | else: 71 | verbose = kwargs['verbose'] 72 | if 'rho' not in kwargs: 73 | rho = 5 74 | else: 75 | rho = kwargs['rho'] 76 | 77 | n_samples, n_features = X.shape 78 | 79 | # construct T from E 80 | ind1 = edge_list[:, 0] 81 | ind2 = edge_list[:, 1] 82 | num_edge = ind1.shape[0] 83 | T = np.zeros((num_edge*2, n_features)) 84 | for i in range(num_edge): 85 | T[i, ind1[i]] = 0.5 86 | T[i, ind2[i]] = 0.5 87 | T[i+num_edge, ind1[i]] = 0.5 88 | T[i+num_edge, ind2[i]] = -0.5 89 | 90 | # calculate F = X^T X + rho(I + T^T * T) 91 | F = np.dot(X.T, X) + rho*(np.identity(n_features) + np.dot(T.T, T)) 92 | 93 | # Cholesky factorization of F = R^T R 94 | R = np.linalg.cholesky(F) # NOTE, this return F = R R^T 95 | R = R.T 96 | Rinv = np.linalg.inv(R) 97 | Rtinv = Rinv.T 98 | 99 | # initialize p, q, mu , v to be zero vectors 100 | p = np.zeros((2*num_edge, 1)) 101 | q = np.zeros((n_features, 1)) 102 | mu = np.zeros((n_features, 1)) 103 | v = np.zeros((2*num_edge, 1)) 104 | 105 | # start the main loop 106 | iter = 0 107 | obj = np.zeros((max_iter,1)) 108 | while iter < max_iter: 109 | print(iter) 110 | # update w 111 | b = np.dot(X.T, y) - mu - np.dot(T.T, v) + rho*np.dot(T.T,p) + rho*q 112 | w_hat = np.dot(Rtinv, b) 113 | w = np.dot(Rinv, w_hat) 114 | 115 | # update q 116 | q = soft_threshold(w + 1/rho*mu, lambda1/rho) 117 | # update p 118 | 119 | p = soft_threshold(np.dot(T, w)+1/rho*v, lambda2/rho) 120 | # update mu, v 121 | mu += rho*(w - q) 122 | v += rho*(np.dot(T, w) - p) 123 | 124 | # calculate objective function 125 | obj[iter] = calculate_obj(X, y, w, lambda1, lambda2, T) 126 | if verbose: 127 | print('obj at iter {0}: {1}'.format(iter, obj[iter])) 128 | iter += 1 129 | return w, obj, q 130 | 131 | def feature_ranking(w): 132 | T = w.abs() 133 | idx = np.argsort(T, 0) 134 | return idx[::-1] 135 | -------------------------------------------------------------------------------- /skfeature/function/structure/group_fs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from skfeature.utility.sparse_learning import tree_lasso_projection, tree_norm 4 | 5 | 6 | def group_fs(X, y, z1, z2, idx, **kwargs): 7 | """ 8 | This function implements supervised sparse group feature selection with least square loss, i.e., 9 | min_{w} ||Xw-y||_2^2 + z_1||w||_1 + z_2*sum_{i} h_{i}||w_{G_{i}}|| where h_i is the weight for the i-th group 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels or regression target 17 | z1: {float} 18 | regularization parameter of L1 norm for each element 19 | z2: {float} 20 | regularization parameter of L2 norm for the non-overlapping group 21 | idx: {numpy array}, shape (3, n_nodes) 22 | 3*nodes matrix, where nodes denotes the number of groups 23 | idx[1,:] contains the starting index of a group 24 | idx[2,: contains the ending index of a group 25 | idx[3,:] contains the corresponding weight (w_{j}) 26 | kwargs: {dictionary} 27 | verbose: {boolean} 28 | True if user want to print out the objective function value in each iteration, false if not 29 | 30 | Output 31 | ------ 32 | w: {numpy array}, shape (n_features, ) 33 | weight matrix 34 | obj: {numpy array}, shape (n_iterations, ) 35 | objective function value during 
iterations 36 | value_gamma: {numpy array}, shape (n_iterations, ) 37 | suitable step size during iterations 38 | 39 | Reference 40 | --------- 41 | Liu, Jun, et al. "Moreau-Yosida Regularization for Grouped Tree Structure Learning." NIPS. 2010. 42 | Liu, Jun, et al. "SLEP: Sparse Learning with Efficient Projections." http://www.public.asu.edu/~jye02/Software/SLEP, 2009. 43 | """ 44 | if 'verbose' not in kwargs: 45 | verbose = False 46 | else: 47 | verbose = kwargs['verbose'] 48 | 49 | # starting point initialization 50 | n_samples, n_features = X.shape 51 | 52 | # compute X'y 53 | Xty = np.dot(np.transpose(X), y) 54 | 55 | # initialize a starting point 56 | w = np.zeros(n_features) 57 | 58 | # compute Xw = X*w 59 | Xw = np.dot(X, w) 60 | 61 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 62 | # initialize step size gamma = 1 63 | gamma = 1 64 | 65 | # assign wp with w, and Xwp with Xw 66 | Xwp = Xw 67 | wwp = np.zeros(n_features) 68 | alphap = 0 69 | alpha = 1 70 | 71 | # indicates whether the gradient step only changes a little 72 | flag = False 73 | 74 | max_iter = 1000 75 | value_gamma = np.zeros(max_iter) 76 | obj = np.zeros(max_iter) 77 | for iter_step in range(max_iter): 78 | # step1: compute search point s based on wp and w (with beta) 79 | beta = (alphap-1)/alpha 80 | s = w + beta*wwp 81 | 82 | # step2: line search for gamma and compute the new approximation solution w 83 | Xs = Xw + beta*(Xw - Xwp) 84 | # compute X'* Xs 85 | XtXs = np.dot(np.transpose(X), Xs) 86 | # obtain the gradient g 87 | G = XtXs - Xty 88 | # copy w and Xw to wp and Xwp 89 | wp = w 90 | Xwp = Xw 91 | 92 | while True: 93 | # let s walk in a step in the antigradient of s to get v and then do the L1/L2-norm regularized projection 94 | v = s - G/gamma 95 | # tree overlapping group lasso projection 96 | n_nodes = int(idx.shape[1]) 97 | idx_tmp = np.zeros((3, n_nodes+1)) 98 | idx_tmp[0:2, :] = np.concatenate((np.array([[-1], [-1]]), idx[0:2, :]), axis=1) 99 | idx_tmp[2, :] = np.concatenate((np.array([z1/gamma]), z2/gamma*idx[2, :]), axis=1) 100 | w = tree_lasso_projection(v, n_features, idx_tmp, n_nodes+1) 101 | # the difference between the new approximate solution w and the search point s 102 | v = w - s 103 | # compute Xw = X*w 104 | Xw = np.dot(X, w) 105 | Xv = Xw - Xs 106 | r_sum = np.inner(v, v) 107 | l_sum = np.inner(Xv, Xv) 108 | # determine weather the gradient step makes little improvement 109 | if r_sum <= 1e-20: 110 | flag = True 111 | break 112 | 113 | # the condition is ||Xv||_2^2 <= gamma * ||v||_2^2 114 | if l_sum <= r_sum*gamma: 115 | break 116 | else: 117 | gamma = max(2*gamma, l_sum/r_sum) 118 | value_gamma[iter_step] = gamma 119 | 120 | # step3: update alpha and alphap, and check weather converge 121 | alphap = alpha 122 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 123 | 124 | wwp = w - wp 125 | Xwy = Xw -y 126 | 127 | # calculate the regularization part 128 | idx_tmp = np.zeros((3, n_nodes+1)) 129 | idx_tmp[0:2, :] = np.concatenate((np.array([[-1], [-1]]), idx[0:2, :]), axis=1) 130 | idx_tmp[2, :] = np.concatenate((np.array([z1]), z2*idx[2, :]), axis=1) 131 | tree_norm_val = tree_norm(w, n_features, idx_tmp, n_nodes+1) 132 | 133 | # function value = loss + regularization 134 | obj[iter_step] = np.inner(Xwy, Xwy)/2 + tree_norm_val 135 | 136 | if verbose: 137 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 138 | 139 | if flag is True: 140 | break 141 | 142 | # determine weather converge 143 | if iter_step >= 2 and 
math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 144 | break 145 | 146 | return w, obj, value_gamma 147 | 148 | 149 | -------------------------------------------------------------------------------- /skfeature/function/structure/tree_fs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from skfeature.utility.sparse_learning import tree_lasso_projection, tree_norm 4 | 5 | 6 | def tree_fs(X, y, z, idx, **kwargs): 7 | """ 8 | This function implements tree structured group lasso regularization with least square loss, i.e., 9 | min_{w} ||Xw-Y||_2^2 + z\sum_{i}\sum_{j} h_{j}^{i}|||w_{G_{j}^{i}}|| where h_{j}^{i} is the weight for the j-th group 10 | from the i-th level (the root node is in level 0) 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | y: {numpy array}, shape (n_samples,) 17 | input class labels or regression target 18 | z: {float} 19 | regularization parameter of L2 norm for the non-overlapping group 20 | idx: {numpy array}, shape (3, n_nodes) 21 | 3*nodes matrix, where nodes denotes the number of nodes of the tree 22 | idx(1,:) contains the starting index 23 | idx(2,:) contains the ending index 24 | idx(3,:) contains the corresponding weight (w_{j}) 25 | kwargs: {dictionary} 26 | verbose: {boolean} 27 | True if user want to print out the objective function value in each iteration, false if not 28 | 29 | Output 30 | ------ 31 | w: {numpy array}, shape (n_features,) 32 | weight vector 33 | obj: {numpy array}, shape (n_iterations,) 34 | objective function value during iterations 35 | value_gamma: {numpy array}, shape (n_iterations,) 36 | suitable step size during iterations 37 | 38 | Note for input parameter idx: 39 | (1) For idx, if each entry in w is a leaf node of the tree and the weight for this leaf node are the same, then 40 | idx[0,0] = -1 and idx[1,0] = -1, idx[2,0] denotes the common weight 41 | (2) In idx, the features of the left tree is smaller than the right tree (idx[0,i] is always smaller than idx[1,i]) 42 | 43 | Reference: 44 | Liu, Jun, et al. "Moreau-Yosida Regularization for Grouped Tree Structure Learning." NIPS. 2010. 45 | Liu, Jun, et al. "SLEP: Sparse Learning with Efficient Projections." http://www.public.asu.edu/~jye02/Software/SLEP, 2009. 
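Illustrative sketch of the 3 x n_nodes idx layout described in the Note above (the concrete group boundaries and sqrt-of-group-size weights are assumptions, and whether group indices are 0- or 1-based is determined by tree_lasso_projection in skfeature.utility.sparse_learning, so check that helper before use):
import numpy as np
# one column per tree node: row 0 = start index, row 1 = end index, row 2 = weight h_j^i;
# the first column uses the (-1, -1, weight) convention for the level where every feature is its own leaf
idx = np.array([[-1.0, 1.0, 4.0],
                [-1.0, 3.0, 6.0],
                [ 1.0, np.sqrt(3.0), np.sqrt(3.0)]])   # two internal groups of 3 features each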
46 | """ 47 | 48 | if 'verbose' not in kwargs: 49 | verbose = False 50 | else: 51 | verbose = kwargs['verbose'] 52 | 53 | # starting point initialization 54 | n_samples, n_features = X.shape 55 | 56 | # compute X'y 57 | Xty = np.dot(np.transpose(X), y) 58 | 59 | # initialize a starting point 60 | w = np.zeros(n_features) 61 | 62 | # compute Xw = X*w 63 | Xw = np.dot(X, w) 64 | 65 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 66 | # initialize step size gamma = 1 67 | gamma = 1 68 | 69 | # assign wp with w, and Xwp with Xw 70 | Xwp = Xw 71 | wwp = np.zeros(n_features) 72 | alphap = 0 73 | alpha = 1 74 | 75 | # indicates whether the gradient step only changes a little 76 | flag = False 77 | 78 | max_iter = 1000 79 | value_gamma = np.zeros(max_iter) 80 | obj = np.zeros(max_iter) 81 | for iter_step in range(max_iter): 82 | # step1: compute search point s based on wp and w (with beta) 83 | beta = (alphap-1)/alpha 84 | s = w + beta*wwp 85 | 86 | # step2: line search for gamma and compute the new approximation solution w 87 | Xs = Xw + beta*(Xw - Xwp) 88 | # compute X'* Xs 89 | XtXs = np.dot(np.transpose(X), Xs) 90 | 91 | # obtain the gradient g 92 | G = XtXs - Xty 93 | 94 | # copy w and Xw to wp and Xwp 95 | wp = w 96 | Xwp = Xw 97 | 98 | while True: 99 | # let s walk in a step in the antigradient of s to get v and then do the L1/L2-norm regularized projection 100 | v = s - G/gamma 101 | # tree overlapping group lasso projection 102 | n_nodes = int(idx.shape[1]) 103 | idx_tmp = idx.copy() 104 | idx_tmp[2, :] = idx[2, :] * z / gamma 105 | w = tree_lasso_projection(v, n_features, idx_tmp, n_nodes) 106 | # the difference between the new approximate solution w and the search point s 107 | v = w - s 108 | # compute Xw = X*w 109 | Xw = np.dot(X, w) 110 | Xv = Xw - Xs 111 | r_sum = np.inner(v, v) 112 | l_sum = np.inner(Xv, Xv) 113 | # determine weather the gradient step makes little improvement 114 | if r_sum <= 1e-20: 115 | flag = True 116 | break 117 | 118 | # the condition is ||Xv||_2^2 <= gamma * ||v||_2^2 119 | if l_sum <= r_sum*gamma: 120 | break 121 | else: 122 | gamma = max(2*gamma, l_sum/r_sum) 123 | value_gamma[iter_step] = gamma 124 | 125 | # step3: update alpha and alphap, and check weather converge 126 | alphap = alpha 127 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 128 | 129 | wwp = w - wp 130 | Xwy = Xw -y 131 | # calculate the regularization part 132 | tree_norm_val = tree_norm(w, n_features, idx, n_nodes) 133 | 134 | # function value = loss + regularization 135 | obj[iter_step] = np.inner(Xwy, Xwy)/2 + z*tree_norm_val 136 | 137 | if verbose: 138 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 139 | 140 | if flag is True: 141 | break 142 | 143 | # determine whether converge 144 | if iter_step >= 2 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 145 | break 146 | 147 | return w, obj, value_gamma 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/wrapper/__init__.py -------------------------------------------------------------------------------- /skfeature/function/wrapper/decision_tree_backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def decision_tree_backward(X, y, n_selected_features): 8 | """ 9 | This function implements the backward feature selection algorithm based on decision tree 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features : {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose decision tree as the classifier 30 | clf = DecisionTreeClassifier() 31 | 32 | # selected feature set, initialized to contain all features 33 | F = range(n_features) 34 | count = n_features 35 | 36 | while count > n_selected_features: 37 | max_acc = 0 38 | for i in range(n_features): 39 | if i in F: 40 | F.remove(i) 41 | X_tmp = X[:, F] 42 | acc = 0 43 | for train, test in cv: 44 | clf.fit(X_tmp[train], y[train]) 45 | y_predict = clf.predict(X_tmp[test]) 46 | acc_tmp = accuracy_score(y[test], y_predict) 47 | acc += acc_tmp 48 | acc = float(acc)/10 49 | F.append(i) 50 | # record the feature which results in the largest accuracy 51 | if acc > max_acc: 52 | max_acc = acc 53 | idx = i 54 | # delete the feature which results in the largest accuracy 55 | F.remove(idx) 56 | count -= 1 57 | return np.array(F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/decision_tree_forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def decision_tree_forward(X, y, n_selected_features): 8 | """ 9 | This function implements the forward feature selection algorithm based on decision tree 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples, ) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features,) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose decision tree as the classifier 30 | clf = DecisionTreeClassifier() 31 | 32 | # selected feature set, initialized to be empty 33 | F = [] 34 | count = 0 35 | while count < n_selected_features: 36 | max_acc = 0 37 | for i in range(n_features): 38 | if i not in F: 39 | F.append(i) 40 | X_tmp = X[:, F] 41 | acc = 0 42 | for train, test in cv: 43 | clf.fit(X_tmp[train], y[train]) 44 | y_predict = clf.predict(X_tmp[test]) 45 | acc_tmp = accuracy_score(y[test], y_predict) 46 | acc += acc_tmp 47 | acc = float(acc)/10 48 | F.pop() 49 | # record the feature which results in the largest accuracy 50 | if acc > max_acc: 51 | max_acc = acc 52 | idx = i 53 | # add the feature which results in the largest accuracy 54 | F.append(idx) 55 | count += 1 56 | return np.array(F) 57 | 58 | -------------------------------------------------------------------------------- 
/skfeature/function/wrapper/svm_backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.svm import SVC 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def svm_backward(X, y, n_selected_features): 8 | """ 9 | This function implements the backward feature selection algorithm based on SVM 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose SVM as the classifier 30 | clf = SVC() 31 | 32 | # selected feature set, initialized to contain all features 33 | F = range(n_features) 34 | count = n_features 35 | 36 | while count > n_selected_features: 37 | max_acc = 0 38 | for i in range(n_features): 39 | if i in F: 40 | F.remove(i) 41 | X_tmp = X[:, F] 42 | acc = 0 43 | for train, test in cv: 44 | clf.fit(X_tmp[train], y[train]) 45 | y_predict = clf.predict(X_tmp[test]) 46 | acc_tmp = accuracy_score(y[test], y_predict) 47 | acc += acc_tmp 48 | acc = float(acc)/10 49 | F.append(i) 50 | # record the feature which results in the largest accuracy 51 | if acc > max_acc: 52 | max_acc = acc 53 | idx = i 54 | # delete the feature which results in the largest accuracy 55 | F.remove(idx) 56 | count -= 1 57 | return np.array(F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/svm_forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.svm import SVC 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def svm_forward(X, y, n_selected_features): 8 | """ 9 | This function implements the forward feature selection algorithm based on SVM 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose SVM as the classifier 30 | clf = SVC() 31 | 32 | # selected feature set, initialized to be empty 33 | F = [] 34 | count = 0 35 | while count < n_selected_features: 36 | max_acc = 0 37 | for i in range(n_features): 38 | if i not in F: 39 | F.append(i) 40 | X_tmp = X[:, F] 41 | acc = 0 42 | for train, test in cv: 43 | clf.fit(X_tmp[train], y[train]) 44 | y_predict = clf.predict(X_tmp[test]) 45 | acc_tmp = accuracy_score(y[test], y_predict) 46 | acc += acc_tmp 47 | acc = float(acc)/10 48 | F.pop() 49 | # record the feature which results in the largest accuracy 50 | if acc > max_acc: 51 | max_acc = acc 52 | idx = i 53 | # add the feature which results in the largest accuracy 54 | F.append(idx) 55 | count += 1 56 | return np.array(F) -------------------------------------------------------------------------------- /skfeature/utility/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/utility/__init__.py -------------------------------------------------------------------------------- /skfeature/utility/construct_W.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from sklearn.metrics.pairwise import pairwise_distances 4 | 5 | 6 | def construct_W(X, **kwargs): 7 | """ 8 | Construct the affinity matrix W through different ways 9 | 10 | Notes 11 | ----- 12 | if kwargs is null, use the default parameter settings; 13 | if kwargs is not null, construct the affinity matrix according to parameters in kwargs 14 | 15 | Input 16 | ----- 17 | X: {numpy array}, shape (n_samples, n_features) 18 | input data 19 | kwargs: {dictionary} 20 | parameters to construct different affinity matrix W: 21 | y: {numpy array}, shape (n_samples, 1) 22 | the true label information needed under the 'supervised' neighbor mode 23 | metric: {string} 24 | choices for different distance measures 25 | 'euclidean' - use euclidean distance 26 | 'cosine' - use cosine distance (default) 27 | neighbor_mode: {string} 28 | indicates how to construct the graph 29 | 'knn' - put an edge between two nodes if and only if they are among the 30 | k nearest neighbors of each other (default) 31 | 'supervised' - put an edge between two nodes if they belong to same class 32 | and they are among the k nearest neighbors of each other 33 | weight_mode: {string} 34 | indicates how to assign weights for each edge in the graph 35 | 'binary' - 0-1 weighting, every edge receives weight of 1 (default) 36 | 'heat_kernel' - if nodes i and j are connected, put weight W_ij = exp(-norm(x_i - x_j)/2t^2) 37 | this weight mode can only be used under 'euclidean' metric and you are required 38 | to provide the parameter t 39 | 'cosine' - if nodes i and j are connected, put weight cosine(x_i,x_j). 40 | this weight mode can only be used under 'cosine' metric 41 | k: {int} 42 | choices for the number of neighbors (default k = 5) 43 | t: {float} 44 | parameter for the 'heat_kernel' weight_mode 45 | fisher_score: {boolean} 46 | indicates whether to build the affinity matrix in a fisher score way, in which W_ij = 1/n_l if yi = yj = l; 47 | otherwise W_ij = 0 (default fisher_score = false) 48 | reliefF: {boolean} 49 | indicates whether to build the affinity matrix in a reliefF way, NH(x) and NM(x,y) denotes a set of 50 | k nearest points to x with the same class as x, and a different class (the class y), respectively. 
51 | W_ij = 1 if i = j; W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) (default reliefF = false) 52 | 53 | Output 54 | ------ 55 | W: {sparse matrix}, shape (n_samples, n_samples) 56 | output affinity matrix W 57 | """ 58 | 59 | # default metric is 'cosine' 60 | if 'metric' not in kwargs.keys(): 61 | kwargs['metric'] = 'cosine' 62 | 63 | # default neighbor mode is 'knn' and default neighbor size is 5 64 | if 'neighbor_mode' not in kwargs.keys(): 65 | kwargs['neighbor_mode'] = 'knn' 66 | if kwargs['neighbor_mode'] == 'knn' and 'k' not in kwargs.keys(): 67 | kwargs['k'] = 5 68 | if kwargs['neighbor_mode'] == 'supervised' and 'k' not in kwargs.keys(): 69 | kwargs['k'] = 5 70 | if kwargs['neighbor_mode'] == 'supervised' and 'y' not in kwargs.keys(): 71 | print ('Warning: label is required in the supervised neighborMode!!!') 72 | exit(0) 73 | 74 | # default weight mode is 'binary', default t in heat kernel mode is 1 75 | if 'weight_mode' not in kwargs.keys(): 76 | kwargs['weight_mode'] = 'binary' 77 | if kwargs['weight_mode'] == 'heat_kernel': 78 | if kwargs['metric'] != 'euclidean': 79 | kwargs['metric'] = 'euclidean' 80 | if 't' not in kwargs.keys(): 81 | kwargs['t'] = 1 82 | elif kwargs['weight_mode'] == 'cosine': 83 | if kwargs['metric'] != 'cosine': 84 | kwargs['metric'] = 'cosine' 85 | 86 | # default fisher_score and reliefF mode are 'false' 87 | if 'fisher_score' not in kwargs.keys(): 88 | kwargs['fisher_score'] = False 89 | if 'reliefF' not in kwargs.keys(): 90 | kwargs['reliefF'] = False 91 | 92 | n_samples, n_features = np.shape(X) 93 | 94 | # choose 'knn' neighbor mode 95 | if kwargs['neighbor_mode'] == 'knn': 96 | k = kwargs['k'] 97 | if kwargs['weight_mode'] == 'binary': 98 | if kwargs['metric'] == 'euclidean': 99 | # compute pairwise euclidean distances 100 | D = pairwise_distances(X) 101 | D **= 2 102 | # sort the distance matrix D in ascending order 103 | dump = np.sort(D, axis=1) 104 | idx = np.argsort(D, axis=1) 105 | # choose the k-nearest neighbors for each instance 106 | idx_new = idx[:, 0:k+1] 107 | G = np.zeros((n_samples*(k+1), 3)) 108 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 109 | G[:, 1] = np.ravel(idx_new, order='F') 110 | G[:, 2] = 1 111 | # build the sparse affinity matrix W 112 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 113 | bigger = np.transpose(W) > W 114 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 115 | return W 116 | 117 | elif kwargs['metric'] == 'cosine': 118 | # normalize the data first 119 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 120 | for i in range(n_samples): 121 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 122 | # compute pairwise cosine distances 123 | D_cosine = np.dot(X, np.transpose(X)) 124 | # sort the distance matrix D in descending order 125 | dump = np.sort(-D_cosine, axis=1) 126 | idx = np.argsort(-D_cosine, axis=1) 127 | idx_new = idx[:, 0:k+1] 128 | G = np.zeros((n_samples*(k+1), 3)) 129 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 130 | G[:, 1] = np.ravel(idx_new, order='F') 131 | G[:, 2] = 1 132 | # build the sparse affinity matrix W 133 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 134 | bigger = np.transpose(W) > W 135 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 136 | return W 137 | 138 | elif kwargs['weight_mode'] == 'heat_kernel': 139 | t = kwargs['t'] 140 | # compute pairwise euclidean distances 141 | D = pairwise_distances(X) 142 | D **= 2 143 | # 
sort the distance matrix D in ascending order 144 | dump = np.sort(D, axis=1) 145 | idx = np.argsort(D, axis=1) 146 | idx_new = idx[:, 0:k+1] 147 | dump_new = dump[:, 0:k+1] 148 | # compute the pairwise heat kernel distances 149 | dump_heat_kernel = np.exp(-dump_new/(2*t*t)) 150 | G = np.zeros((n_samples*(k+1), 3)) 151 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 152 | G[:, 1] = np.ravel(idx_new, order='F') 153 | G[:, 2] = np.ravel(dump_heat_kernel, order='F') 154 | # build the sparse affinity matrix W 155 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 156 | bigger = np.transpose(W) > W 157 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 158 | return W 159 | 160 | elif kwargs['weight_mode'] == 'cosine': 161 | # normalize the data first 162 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 163 | for i in range(n_samples): 164 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 165 | # compute pairwise cosine distances 166 | D_cosine = np.dot(X, np.transpose(X)) 167 | # sort the distance matrix D in ascending order 168 | dump = np.sort(-D_cosine, axis=1) 169 | idx = np.argsort(-D_cosine, axis=1) 170 | idx_new = idx[:, 0:k+1] 171 | dump_new = -dump[:, 0:k+1] 172 | G = np.zeros((n_samples*(k+1), 3)) 173 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 174 | G[:, 1] = np.ravel(idx_new, order='F') 175 | G[:, 2] = np.ravel(dump_new, order='F') 176 | # build the sparse affinity matrix W 177 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 178 | bigger = np.transpose(W) > W 179 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 180 | return W 181 | 182 | # choose supervised neighborMode 183 | elif kwargs['neighbor_mode'] == 'supervised': 184 | k = kwargs['k'] 185 | # get true labels and the number of classes 186 | y = kwargs['y'] 187 | label = np.unique(y) 188 | n_classes = np.unique(y).size 189 | # construct the weight matrix W in a fisherScore way, W_ij = 1/n_l if yi = yj = l, otherwise W_ij = 0 190 | if kwargs['fisher_score'] is True: 191 | W = lil_matrix((n_samples, n_samples)) 192 | for i in range(n_classes): 193 | class_idx = (y == label[i]) 194 | class_idx_all = (class_idx[:, np.newaxis] & class_idx[np.newaxis, :]) 195 | W[class_idx_all] = 1.0/np.sum(np.sum(class_idx)) 196 | return W 197 | 198 | # construct the weight matrix W in a reliefF way, NH(x) and NM(x,y) denotes a set of k nearest 199 | # points to x with the same class as x, a different class (the class y), respectively. 
W_ij = 1 if i = j; 200 | # W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) 201 | if kwargs['reliefF'] is True: 202 | # when xj in NH(xi) 203 | G = np.zeros((n_samples*(k+1), 3)) 204 | id_now = 0 205 | for i in range(n_classes): 206 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 207 | D = pairwise_distances(X[class_idx, :]) 208 | D **= 2 209 | idx = np.argsort(D, axis=1) 210 | idx_new = idx[:, 0:k+1] 211 | n_smp_class = (class_idx[idx_new[:]]).size 212 | if len(class_idx) <= k: 213 | k = len(class_idx) - 1 214 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 215 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 216 | G[id_now:n_smp_class+id_now, 2] = 1.0/k 217 | id_now += n_smp_class 218 | W1 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 219 | # when i = j, W_ij = 1 220 | for i in range(n_samples): 221 | W1[i, i] = 1 222 | # when x_j in NM(x_i, y) 223 | G = np.zeros((n_samples*k*(n_classes - 1), 3)) 224 | id_now = 0 225 | for i in range(n_classes): 226 | class_idx1 = np.column_stack(np.where(y == label[i]))[:, 0] 227 | X1 = X[class_idx1, :] 228 | for j in range(n_classes): 229 | if label[j] != label[i]: 230 | class_idx2 = np.column_stack(np.where(y == label[j]))[:, 0] 231 | X2 = X[class_idx2, :] 232 | D = pairwise_distances(X1, X2) 233 | idx = np.argsort(D, axis=1) 234 | idx_new = idx[:, 0:k] 235 | n_smp_class = len(class_idx1)*k 236 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx1, (k, 1)).reshape(-1) 237 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx2[idx_new[:]], order='F') 238 | G[id_now:n_smp_class+id_now, 2] = -1.0/((n_classes-1)*k) 239 | id_now += n_smp_class 240 | W2 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 241 | bigger = np.transpose(W2) > W2 242 | W2 = W2 - W2.multiply(bigger) + np.transpose(W2).multiply(bigger) 243 | W = W1 + W2 244 | return W 245 | 246 | if kwargs['weight_mode'] == 'binary': 247 | if kwargs['metric'] == 'euclidean': 248 | G = np.zeros((n_samples*(k+1), 3)) 249 | id_now = 0 250 | for i in range(n_classes): 251 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 252 | # compute pairwise euclidean distances for instances in class i 253 | D = pairwise_distances(X[class_idx, :]) 254 | D **= 2 255 | # sort the distance matrix D in ascending order for instances in class i 256 | idx = np.argsort(D, axis=1) 257 | idx_new = idx[:, 0:k+1] 258 | n_smp_class = len(class_idx)*(k+1) 259 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 260 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 261 | G[id_now:n_smp_class+id_now, 2] = 1 262 | id_now += n_smp_class 263 | # build the sparse affinity matrix W 264 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 265 | bigger = np.transpose(W) > W 266 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 267 | return W 268 | 269 | if kwargs['metric'] == 'cosine': 270 | # normalize the data first 271 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 272 | for i in range(n_samples): 273 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 274 | G = np.zeros((n_samples*(k+1), 3)) 275 | id_now = 0 276 | for i in range(n_classes): 277 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 278 | # compute pairwise cosine distances for instances in class i 279 | D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) 280 | # sort the distance matrix D in descending 
order for instances in class i 281 | idx = np.argsort(-D_cosine, axis=1) 282 | idx_new = idx[:, 0:k+1] 283 | n_smp_class = len(class_idx)*(k+1) 284 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 285 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 286 | G[id_now:n_smp_class+id_now, 2] = 1 287 | id_now += n_smp_class 288 | # build the sparse affinity matrix W 289 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 290 | bigger = np.transpose(W) > W 291 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 292 | return W 293 | 294 | elif kwargs['weight_mode'] == 'heat_kernel': 295 | G = np.zeros((n_samples*(k+1), 3)) 296 | id_now = 0 297 | for i in range(n_classes): 298 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 299 | # compute pairwise cosine distances for instances in class i 300 | D = pairwise_distances(X[class_idx, :]) 301 | D **= 2 302 | # sort the distance matrix D in ascending order for instances in class i 303 | dump = np.sort(D, axis=1) 304 | idx = np.argsort(D, axis=1) 305 | idx_new = idx[:, 0:k+1] 306 | dump_new = dump[:, 0:k+1] 307 | t = kwargs['t'] 308 | # compute pairwise heat kernel distances for instances in class i 309 | dump_heat_kernel = np.exp(-dump_new/(2*t*t)) 310 | n_smp_class = len(class_idx)*(k+1) 311 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 312 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 313 | G[id_now:n_smp_class+id_now, 2] = np.ravel(dump_heat_kernel, order='F') 314 | id_now += n_smp_class 315 | # build the sparse affinity matrix W 316 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 317 | bigger = np.transpose(W) > W 318 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 319 | return W 320 | 321 | elif kwargs['weight_mode'] == 'cosine': 322 | # normalize the data first 323 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 324 | for i in range(n_samples): 325 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 326 | G = np.zeros((n_samples*(k+1), 3)) 327 | id_now = 0 328 | for i in range(n_classes): 329 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 330 | # compute pairwise cosine distances for instances in class i 331 | D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) 332 | # sort the distance matrix D in descending order for instances in class i 333 | dump = np.sort(-D_cosine, axis=1) 334 | idx = np.argsort(-D_cosine, axis=1) 335 | idx_new = idx[:, 0:k+1] 336 | dump_new = -dump[:, 0:k+1] 337 | n_smp_class = len(class_idx)*(k+1) 338 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 339 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 340 | G[id_now:n_smp_class+id_now, 2] = np.ravel(dump_new, order='F') 341 | id_now += n_smp_class 342 | # build the sparse affinity matrix W 343 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 344 | bigger = np.transpose(W) > W 345 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 346 | return W -------------------------------------------------------------------------------- /skfeature/utility/data_discretization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.preprocessing 3 | 4 | 5 | def data_discretization(X, n_bins): 6 | """ 7 | This function implements the data discretization function to discrete data 
into n_bins 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | n_bins: {int} 14 | number of bins to be discretized 15 | 16 | Output 17 | ------ 18 | X_discretized: {numpy array}, shape (n_samples, n_features) 19 | output discretized data, where features are digitized to n_bins 20 | """ 21 | 22 | # normalize each feature 23 | min_max_scaler = sklearn.preprocessing.MinMaxScaler() 24 | X_normalized = min_max_scaler.fit_transform(X) 25 | 26 | # discretize X 27 | n_samples, n_features = X.shape 28 | X_discretized = np.zeros((n_samples, n_features)) 29 | bins = np.linspace(0, 1, n_bins) 30 | for i in range(n_features): 31 | X_discretized[:, i] = np.digitize(X_normalized[:, i], bins) 32 | 33 | return X_discretized 34 | -------------------------------------------------------------------------------- /skfeature/utility/entropy_estimators.py: -------------------------------------------------------------------------------- 1 | # Written by Greg Ver Steeg (http://www.isi.edu/~gregv/npeet.html) 2 | 3 | import scipy.spatial as ss 4 | from scipy.special import digamma 5 | from math import log 6 | import numpy.random as nr 7 | import numpy as np 8 | import random 9 | 10 | 11 | # continuous estimators 12 | 13 | def entropy(x, k=3, base=2): 14 | """ 15 | The classic K-L k-nearest neighbor continuous entropy estimator x should be a list of vectors, 16 | e.g. x = [[1.3],[3.7],[5.1],[2.4]] if x is a one-dimensional scalar and we have four samples 17 | """ 18 | 19 | assert k <= len(x)-1, "Set k smaller than num. samples - 1" 20 | d = len(x[0]) 21 | N = len(x) 22 | intens = 1e-10 # small noise to break degeneracy, see doc. 23 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 24 | tree = ss.cKDTree(x) 25 | nn = [tree.query(point, k+1, p=float('inf'))[0][k] for point in x] 26 | const = digamma(N)-digamma(k) + d*log(2) 27 | return (const + d*np.mean(map(log, nn)))/log(base) 28 | 29 | 30 | def mi(x, y, k=3, base=2): 31 | """ 32 | Mutual information of x and y; x, y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 33 | if x is a one-dimensional scalar and we have four samples 34 | """ 35 | 36 | assert len(x) == len(y), "Lists should have same length" 37 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 38 | intens = 1e-10 # small noise to break degeneracy, see doc. 39 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 40 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 41 | points = zip2(x, y) 42 | # Find nearest neighbors in joint space, p=inf means max-norm 43 | tree = ss.cKDTree(points) 44 | dvec = [tree.query(point, k+1, p=float('inf'))[0][k] for point in points] 45 | a, b, c, d = avgdigamma(x, dvec), avgdigamma(y, dvec), digamma(k), digamma(len(x)) 46 | return (-a-b+c+d)/log(base) 47 | 48 | 49 | def cmi(x, y, z, k=3, base=2): 50 | """ 51 | Mutual information of x and y, conditioned on z; x, y, z should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 52 | if x is a one-dimensional scalar and we have four samples 53 | """ 54 | 55 | assert len(x) == len(y), "Lists should have same length" 56 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 57 | intens = 1e-10 # small noise to break degeneracy, see doc. 
58 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 59 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 60 | z = [list(p + intens * nr.rand(len(z[0]))) for p in z] 61 | points = zip2(x, y, z) 62 | # Find nearest neighbors in joint space, p=inf means max-norm 63 | tree = ss.cKDTree(points) 64 | dvec = [tree.query(point, k+1, p=float('inf'))[0][k] for point in points] 65 | a, b, c, d = avgdigamma(zip2(x, z), dvec), avgdigamma(zip2(y, z), dvec), avgdigamma(z, dvec), digamma(k) 66 | return (-a-b+c+d)/log(base) 67 | 68 | 69 | def kldiv(x, xp, k=3, base=2): 70 | """ 71 | KL Divergence between p and q for x~p(x), xp~q(x); x, xp should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 72 | if x is a one-dimensional scalar and we have four samples 73 | """ 74 | 75 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 76 | assert k <= len(xp) - 1, "Set k smaller than num. samples - 1" 77 | assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." 78 | d = len(x[0]) 79 | n = len(x) 80 | m = len(xp) 81 | const = log(m) - log(n-1) 82 | tree = ss.cKDTree(x) 83 | treep = ss.cKDTree(xp) 84 | nn = [tree.query(point, k+1, p=float('inf'))[0][k] for point in x] 85 | nnp = [treep.query(point, k, p=float('inf'))[0][k-1] for point in x] 86 | return (const + d*np.mean(map(log, nnp))-d*np.mean(map(log, nn)))/log(base) 87 | 88 | 89 | # Discrete estimators 90 | def entropyd(sx, base=2): 91 | """ 92 | Discrete entropy estimator given a list of samples which can be any hashable object 93 | """ 94 | 95 | return entropyfromprobs(hist(sx), base=base) 96 | 97 | 98 | def midd(x, y): 99 | """ 100 | Discrete mutual information estimator given a list of samples which can be any hashable object 101 | """ 102 | 103 | return -entropyd(list(zip(x, y)))+entropyd(x)+entropyd(y) 104 | 105 | 106 | def cmidd(x, y, z): 107 | """ 108 | Discrete mutual information estimator given a list of samples which can be any hashable object 109 | """ 110 | 111 | return entropyd(list(zip(y, z)))+entropyd(list(zip(x, z)))-entropyd(list(zip(x, y, z)))-entropyd(z) 112 | 113 | 114 | def hist(sx): 115 | # Histogram from list of samples 116 | d = dict() 117 | for s in sx: 118 | d[s] = d.get(s, 0) + 1 119 | return map(lambda z: float(z)/len(sx), d.values()) 120 | 121 | 122 | def entropyfromprobs(probs, base=2): 123 | # Turn a normalized list of probabilities of discrete outcomes into entropy (base 2) 124 | return -sum(map(elog, probs))/log(base) 125 | 126 | 127 | def elog(x): 128 | # for entropy, 0 log 0 = 0. but we get an error for putting log 0 129 | if x <= 0. or x >= 1.: 130 | return 0 131 | else: 132 | return x*log(x) 133 | 134 | 135 | # Mixed estimators 136 | def micd(x, y, k=3, base=2, warning=True): 137 | """ If x is continuous and y is discrete, compute mutual information 138 | """ 139 | 140 | overallentropy = entropy(x, k, base) 141 | n = len(y) 142 | word_dict = dict() 143 | for sample in y: 144 | word_dict[sample] = word_dict.get(sample, 0) + 1./n 145 | yvals = list(set(word_dict.keys())) 146 | 147 | mi = overallentropy 148 | for yval in yvals: 149 | xgiveny = [x[i] for i in range(n) if y[i] == yval] 150 | if k <= len(xgiveny) - 1: 151 | mi -= word_dict[yval]*entropy(xgiveny, k, base) 152 | else: 153 | if warning: 154 | print("Warning, after conditioning, on y={0} insufficient data. 
Assuming maximal entropy in this case.".format(yval)) 155 | mi -= word_dict[yval]*overallentropy 156 | return mi # units already applied 157 | 158 | 159 | # Utility functions 160 | def vectorize(scalarlist): 161 | """ 162 | Turn a list of scalars into a list of one-d vectors 163 | """ 164 | 165 | return [(x,) for x in scalarlist] 166 | 167 | 168 | def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): 169 | """ 170 | Shuffle test 171 | Repeatedly shuffle the x-values and then estimate measure(x,y,[z]). 172 | Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs, 'measure' could me mi,cmi, 173 | e.g. Keyword arguments can be passed. Mutual information and CMI should have a mean near zero. 174 | """ 175 | 176 | xp = x[:] # A copy that we can shuffle 177 | outputs = [] 178 | for i in range(ns): 179 | random.shuffle(xp) 180 | if z: 181 | outputs.append(measure(xp, y, z, **kwargs)) 182 | else: 183 | outputs.append(measure(xp, y, **kwargs)) 184 | outputs.sort() 185 | return np.mean(outputs), (outputs[int((1.-ci)/2*ns)], outputs[int((1.+ci)/2*ns)]) 186 | 187 | 188 | # Internal functions 189 | def avgdigamma(points, dvec): 190 | # This part finds number of neighbors in some radius in the marginal space 191 | # returns expectation value of 192 | N = len(points) 193 | tree = ss.cKDTree(points) 194 | avg = 0. 195 | for i in range(N): 196 | dist = dvec[i] 197 | # subtlety, we don't include the boundary point, 198 | # but we are implicitly adding 1 to kraskov def bc center point is included 199 | num_points = len(tree.query_ball_point(points[i], dist-1e-15, p=float('inf'))) 200 | avg += digamma(num_points)/N 201 | return avg 202 | 203 | 204 | def zip2(*args): 205 | # zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space 206 | # E.g. 
zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]] 207 | return [sum(sublist, []) for sublist in zip(*args)] 208 | -------------------------------------------------------------------------------- /skfeature/utility/mutual_information.py: -------------------------------------------------------------------------------- 1 | import skfeature.utility.entropy_estimators as ee 2 | 3 | 4 | def information_gain(f1, f2): 5 | """ 6 | This function calculates the information gain, where ig(f1,f2) = H(f1) - H(f1|f2) 7 | 8 | Input 9 | ----- 10 | f1: {numpy array}, shape (n_samples,) 11 | f2: {numpy array}, shape (n_samples,) 12 | 13 | Output 14 | ------ 15 | ig: {float} 16 | """ 17 | 18 | ig = ee.entropyd(f1) - conditional_entropy(f1, f2) 19 | return ig 20 | 21 | 22 | def conditional_entropy(f1, f2): 23 | """ 24 | This function calculates the conditional entropy, where ce = H(f1) - I(f1;f2) 25 | 26 | Input 27 | ----- 28 | f1: {numpy array}, shape (n_samples,) 29 | f2: {numpy array}, shape (n_samples,) 30 | 31 | Output 32 | ------ 33 | ce: {float} 34 | ce is conditional entropy of f1 and f2 35 | """ 36 | 37 | ce = ee.entropyd(f1) - ee.midd(f1, f2) 38 | return ce 39 | 40 | 41 | def su_calculation(f1, f2): 42 | """ 43 | This function calculates the symmetrical uncertainty, where su(f1,f2) = 2*IG(f1,f2)/(H(f1)+H(f2)) 44 | 45 | Input 46 | ----- 47 | f1: {numpy array}, shape (n_samples,) 48 | f2: {numpy array}, shape (n_samples,) 49 | 50 | Output 51 | ------ 52 | su: {float} 53 | su is the symmetrical uncertainty of f1 and f2 54 | 55 | """ 56 | 57 | # calculate information gain of f1 and f2, t1 = ig(f1,f2) 58 | t1 = information_gain(f1, f2) 59 | # calculate entropy of f1, t2 = H(f1) 60 | t2 = ee.entropyd(f1) 61 | # calculate entropy of f2, t3 = H(f2) 62 | t3 = ee.entropyd(f2) 63 | # su(f1,f2) = 2*t1/(t2+t3) 64 | su = 2.0*t1/(t2+t3) 65 | 66 | return su 67 | -------------------------------------------------------------------------------- /skfeature/utility/sparse_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | 4 | 5 | def feature_ranking(W): 6 | """ 7 | This function ranks features according to the feature weights matrix W 8 | 9 | Input: 10 | ----- 11 | W: {numpy array}, shape (n_features, n_classes) 12 | feature weights matrix 13 | 14 | Output: 15 | ------ 16 | idx: {numpy array}, shape {n_features,} 17 | feature index ranked in descending order by feature importance 18 | """ 19 | T = (W*W).sum(1) 20 | idx = np.argsort(T, 0) 21 | return idx[::-1] 22 | 23 | 24 | def generate_diagonal_matrix(U): 25 | """ 26 | This function generates a diagonal matrix D from an input matrix U as D_ii = 0.5 / ||U[i,:]|| 27 | 28 | Input: 29 | ----- 30 | U: {numpy array}, shape (n_samples, n_features) 31 | 32 | Output: 33 | ------ 34 | D: {numpy array}, shape (n_samples, n_samples) 35 | """ 36 | temp = np.sqrt(np.multiply(U, U).sum(1)) 37 | temp[temp < 1e-16] = 1e-16 38 | temp = 0.5 / temp 39 | D = np.diag(temp) 40 | return D 41 | 42 | 43 | def calculate_l21_norm(X): 44 | """ 45 | This function calculates the l21 norm of a matrix X, i.e., \sum ||X[i,:]||_2 46 | 47 | Input: 48 | ----- 49 | X: {numpy array}, shape (n_samples, n_features) 50 | 51 | Output: 52 | ------ 53 | l21_norm: {float} 54 | """ 55 | return (np.sqrt(np.multiply(X, X).sum(1))).sum() 56 | 57 | 58 | def construct_label_matrix(label): 59 | """ 60 | This function converts a 1d numpy array to a 2d array, for each instance, the class label is 1 or 0 61 | 62 | 
Input: 63 | ----- 64 | label: {numpy array}, shape(n_samples,) 65 | 66 | Output: 67 | ------ 68 | label_matrix: {numpy array}, shape(n_samples, n_classes) 69 | """ 70 | 71 | n_samples = label.shape[0] 72 | unique_label = np.unique(label) 73 | n_classes = unique_label.shape[0] 74 | label_matrix = np.zeros((n_samples, n_classes)) 75 | for i in range(n_classes): 76 | label_matrix[label == unique_label[i], i] = 1 77 | 78 | return label_matrix.astype(int) 79 | 80 | 81 | def construct_label_matrix_pan(label): 82 | """ 83 | This function converts a 1d numpy array to a 2d array, for each instance, the class label is 1 or -1 84 | 85 | Input: 86 | ----- 87 | label: {numpy array}, shape(n_samples,) 88 | 89 | Output: 90 | ------ 91 | label_matrix: {numpy array}, shape(n_samples, n_classes) 92 | """ 93 | n_samples = label.shape[0] 94 | unique_label = np.unique(label) 95 | n_classes = unique_label.shape[0] 96 | label_matrix = np.zeros((n_samples, n_classes)) 97 | for i in range(n_classes): 98 | label_matrix[label == unique_label[i], i] = 1 99 | label_matrix[label_matrix == 0] = -1 100 | 101 | return label_matrix.astype(int) 102 | 103 | 104 | def euclidean_projection(V, n_features, n_classes, z, gamma): 105 | """ 106 | L2 Norm regularized euclidean projection min_W 1/2 ||W- V||_2^2 + z * ||W||_2 107 | """ 108 | W_projection = np.zeros((n_features, n_classes)) 109 | for i in range(n_features): 110 | if LA.norm(V[i, :]) > z/gamma: 111 | W_projection[i, :] = (1-z/(gamma*LA.norm(V[i, :])))*V[i, :] 112 | else: 113 | W_projection[i, :] = np.zeros(n_classes) 114 | return W_projection 115 | 116 | 117 | def tree_lasso_projection(v, n_features, idx, n_nodes): 118 | """ 119 | This functions solves the following optimization problem min_w 1/2 ||w-v||_2^2 + \sum z_i||w_{G_{i}}|| 120 | where w and v are of dimensions of n_features; z_i >=0, and G_{i} follows the tree structure 121 | """ 122 | # test whether the first node is special 123 | if idx[0, 0] == -1 and idx[1, 0] == -1: 124 | w_projection = np.zeros(n_features) 125 | z = idx[2, 0] 126 | for j in range(n_features): 127 | if v[j] > z: 128 | w_projection[j] = v[j] - z 129 | else: 130 | if v[j] < -z: 131 | w_projection[j] = v[j] + z 132 | else: 133 | w_projection[j] = 0 134 | i = 1 135 | 136 | else: 137 | w = v.copy() 138 | i = 0 139 | 140 | # sequentially process each node 141 | while i < n_nodes: 142 | # compute the L2 norm of this group 143 | two_norm = 0 144 | start_idx = int(idx[0, i] - 1) 145 | end_idx = int(idx[1, i]) 146 | for j in range(start_idx, end_idx): 147 | two_norm += w_projection[j] * w_projection[j] 148 | two_norm = np.sqrt(two_norm) 149 | z = idx[2, i] 150 | if two_norm > z: 151 | ratio = (two_norm - z) / two_norm 152 | # shrinkage this group by ratio 153 | for j in range(start_idx, end_idx): 154 | w_projection[j] *= ratio 155 | else: 156 | for j in range(start_idx, end_idx): 157 | w_projection[j] = 0 158 | i += 1 159 | return w_projection 160 | 161 | 162 | def tree_norm(w, n_features, idx, n_nodes): 163 | """ 164 | This function computes \sum z_i||w_{G_{i}}|| 165 | """ 166 | obj = 0 167 | # test whether the first node is special 168 | if idx[0, 0] == -1 and idx[1, 0] == -1: 169 | z = idx[2, 0] 170 | for j in range(n_features): 171 | obj += np.abs(w[j]) 172 | obj *= z 173 | i = 1 174 | else: 175 | i = 0 176 | 177 | # sequentially process each node 178 | while i < n_nodes: 179 | two_norm = 0 180 | start_idx = int(idx[0, i] - 1) 181 | end_idx = int(idx[1, i]) 182 | for j in range(start_idx, end_idx): 183 | two_norm += w[j] * w[j] 184 | two_norm = 
np.sqrt(two_norm) 185 | z = idx[2, i] 186 | obj += z*two_norm 187 | i += 1 188 | return obj 189 | 190 | -------------------------------------------------------------------------------- /skfeature/utility/unsupervised_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.utils.linear_assignment_ as la 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.metrics.cluster import normalized_mutual_info_score 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def best_map(l1, l2): 9 | """ 10 | Permute labels of l2 to match l1 as much as possible 11 | """ 12 | if len(l1) != len(l2): 13 | print("L1.shape must == L2.shape") 14 | exit(0) 15 | 16 | label1 = np.unique(l1) 17 | n_class1 = len(label1) 18 | 19 | label2 = np.unique(l2) 20 | n_class2 = len(label2) 21 | 22 | n_class = max(n_class1, n_class2) 23 | G = np.zeros((n_class, n_class)) 24 | 25 | for i in range(0, n_class1): 26 | for j in range(0, n_class2): 27 | ss = l1 == label1[i] 28 | tt = l2 == label2[j] 29 | G[i, j] = np.count_nonzero(ss & tt) 30 | 31 | A = la.linear_assignment(-G) 32 | 33 | new_l2 = np.zeros(l2.shape) 34 | for i in range(0, n_class2): 35 | new_l2[l2 == label2[A[i][1]]] = label1[A[i][0]] 36 | return new_l2.astype(int) 37 | 38 | 39 | def evaluation(X_selected, n_clusters, y): 40 | """ 41 | This function calculates ARI, ACC and NMI of clustering results 42 | 43 | Input 44 | ----- 45 | X_selected: {numpy array}, shape (n_samples, n_selected_features} 46 | input data on the selected features 47 | n_clusters: {int} 48 | number of clusters 49 | y: {numpy array}, shape (n_samples,) 50 | true labels 51 | 52 | Output 53 | ------ 54 | nmi: {float} 55 | Normalized Mutual Information 56 | acc: {float} 57 | Accuracy 58 | """ 59 | k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, 60 | tol=0.0001, precompute_distances=True, verbose=0, 61 | random_state=None, copy_x=True, n_jobs=1) 62 | 63 | k_means.fit(X_selected) 64 | y_predict = k_means.labels_ 65 | 66 | # calculate NMI 67 | nmi = normalized_mutual_info_score(y, y_predict) 68 | 69 | # calculate ACC 70 | y_permuted_predict = best_map(y, y_predict) 71 | acc = accuracy_score(y, y_permuted_predict) 72 | 73 | return nmi, acc --------------------------------------------------------------------------------
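A final compatibility note: best_map in unsupervised_evaluation.py depends on sklearn.utils.linear_assignment_, which has been removed from recent scikit-learn releases, and the precompute_distances and n_jobs arguments passed to KMeans have likewise been deprecated and dropped in newer versions. As a hedged illustration (not the shipped implementation; the name best_map_scipy is mine), the label-matching step can be reproduced with scipy.optimize.linear_sum_assignment, which solves the same assignment problem:

import numpy as np
from scipy.optimize import linear_sum_assignment


def best_map_scipy(l1, l2):
    # Permute the labels in l2 so that they agree with l1 as much as possible.
    l1, l2 = np.asarray(l1), np.asarray(l2)
    label1, label2 = np.unique(l1), np.unique(l2)
    n_class = max(len(label1), len(label2))
    # G[i, j] counts how often class label1[i] in l1 co-occurs with label2[j] in l2
    G = np.zeros((n_class, n_class))
    for i, a in enumerate(label1):
        for j, b in enumerate(label2):
            G[i, j] = np.count_nonzero((l1 == a) & (l2 == b))
    # maximizing the total overlap is the same as minimizing -G
    row_ind, col_ind = linear_sum_assignment(-G)
    new_l2 = np.zeros(l2.shape)
    for r, c in zip(row_ind, col_ind):
        if r < len(label1) and c < len(label2):
            new_l2[l2 == label2[c]] = label1[r]
    return new_l2.astype(int)

The NMI and accuracy computations in evaluation() are unaffected; only the assignment solver and the KMeans constructor arguments need updating on newer library versions.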