├── .gitignore
├── LICENSE
├── PKG-INFO
├── README.md
├── setup.py
└── skfeature
    ├── __init__.py
    ├── data
    │   ├── ALLAML.mat
    │   ├── BASEHOCK.mat
    │   ├── CLL-SUB-111.mat
    │   ├── COIL20.mat
    │   ├── Carcinom.mat
    │   ├── GLI-85.mat
    │   ├── GLIOMA.mat
    │   ├── Isolet.mat
    │   ├── ORL.mat
    │   ├── PCMAC.mat
    │   ├── Prostate-GE.mat
    │   ├── RELATHE.mat
    │   ├── SMK-CAN-187.mat
    │   ├── TOX-171.mat
    │   ├── USPS.mat
    │   ├── Yale.mat
    │   ├── arcene.mat
    │   ├── colon.mat
    │   ├── gisette.mat
    │   ├── leukemia.mat
    │   ├── lung.mat
    │   ├── lung_small.mat
    │   ├── lymphoma.mat
    │   ├── madelon.mat
    │   ├── nci9.mat
    │   ├── orlraws10P.mat
    │   ├── pixraw10P.mat
    │   ├── warpAR10P.mat
    │   └── warpPIE10P.mat
    ├── example
    │   ├── test_CFS.py
    │   ├── test_CIFE.py
    │   ├── test_CMIM.py
    │   ├── test_DISR.py
    │   ├── test_FCBF.py
    │   ├── test_ICAP.py
    │   ├── test_JMI.py
    │   ├── test_MCFS.py
    │   ├── test_MIFS.py
    │   ├── test_MIM.py
    │   ├── test_MRMR.py
    │   ├── test_NDFS.py
    │   ├── test_RFS.py
    │   ├── test_SPEC.py
    │   ├── test_UDFS.py
    │   ├── test_alpha_investing.py
    │   ├── test_chi_square.py
    │   ├── test_decision_tree_backward.py
    │   ├── test_decision_tree_forward.py
    │   ├── test_f_score.py
    │   ├── test_fisher_score.py
    │   ├── test_gini_index.py
    │   ├── test_group_fs.py
    │   ├── test_lap_score.py
    │   ├── test_ll_l21.py
    │   ├── test_low_variance.py
    │   ├── test_ls_l21.py
    │   ├── test_reliefF.py
    │   ├── test_svm_backward.py
    │   ├── test_svm_forward.py
    │   ├── test_t_score.py
    │   ├── test_trace_ratio.py
    │   └── test_tree_fs.py
    ├── function
    │   ├── __init__.py
    │   ├── information_theoretical_based
    │   │   ├── CIFE.py
    │   │   ├── CMIM.py
    │   │   ├── DISR.py
    │   │   ├── FCBF.py
    │   │   ├── ICAP.py
    │   │   ├── JMI.py
    │   │   ├── LCSI.py
    │   │   ├── MIFS.py
    │   │   ├── MIM.py
    │   │   ├── MRMR.py
    │   │   └── __init__.py
    │   ├── similarity_based
    │   │   ├── SPEC.py
    │   │   ├── __init__.py
    │   │   ├── fisher_score.py
    │   │   ├── lap_score.py
    │   │   ├── reliefF.py
    │   │   └── trace_ratio.py
    │   ├── sparse_learning_based
    │   │   ├── MCFS.py
    │   │   ├── NDFS.py
    │   │   ├── RFS.py
    │   │   ├── UDFS.py
    │   │   ├── __init__.py
    │   │   ├── ll_l21.py
    │   │   └── ls_l21.py
    │   ├── statistical_based
    │   │   ├── CFS.py
    │   │   ├── __init__.py
    │   │   ├── chi_square.py
    │   │   ├── f_score.py
    │   │   ├── gini_index.py
    │   │   ├── low_variance.py
    │   │   └── t_score.py
    │   ├── streaming
    │   │   ├── __init__.py
    │   │   └── alpha_investing.py
    │   ├── structure
    │   │   ├── __init__.py
    │   │   ├── graph_fs.py
    │   │   ├── group_fs.py
    │   │   └── tree_fs.py
    │   └── wrapper
    │       ├── __init__.py
    │       ├── decision_tree_backward.py
    │       ├── decision_tree_forward.py
    │       ├── svm_backward.py
    │       └── svm_forward.py
    └── utility
        ├── __init__.py
        ├── construct_W.py
        ├── data_discretization.py
        ├── entropy_estimators.py
        ├── mutual_information.py
        ├── sparse_learning.py
        └── unsupervised_evaluation.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
--------------------------------------------------------------------------------
/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 1.0
2 | Name: skfeature
3 | Version: 1.0.0
4 | Summary: Feature Selection Repository in Python (DMML Lab@ASU)
5 | Home-page: https://github.com/jundongl/scikit-feature
6 | Author: Jundong Li, Kewei Cheng, Suhang Wang
7 | Author-email: jundong.li@asu.edu, kcheng18@asu.edu, suhang.wang@asu.edu
8 | License: UNKNOWN
9 | Description: UNKNOWN
10 | Keywords: Feature Selection Repository
11 | Platform: UNKNOWN
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | scikit-feature
2 | ===============================
3 | Feature selection repository scikit-feature in Python.
4 |
5 | scikit-feature is an open-source feature selection repository in Python developed by the Data Mining and Machine Learning Lab at Arizona State University. It is built upon the widely used machine learning package scikit-learn and two scientific computing packages, NumPy and SciPy. scikit-feature contains around 40 popular feature selection algorithms, including traditional feature selection algorithms as well as structural and streaming feature selection algorithms.
6 |
7 | It serves as a platform for facilitating feature selection application, research, and comparative study. It is designed to share widely used algorithms from the feature selection literature and to make it convenient for researchers and practitioners to perform empirical evaluations when developing new feature selection algorithms.
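A minimal sketch of the typical supervised workflow (adapted from the bundled `skfeature/example/test_fisher_score.py` shown later in this listing): load one of the packaged `.mat` datasets, rank features on the training split, and evaluate a classifier on the top-ranked features. The sketch assumes Python 3 and scikit-learn >= 0.20, where `KFold` lives in `sklearn.model_selection`; the shipped example scripts instead use the older `sklearn.cross_validation` module and Python 2 `print` statements. The dataset path assumes the script is run from the repository root.

```python
# Minimal sketch (assumptions: Python 3, scikit-learn >= 0.20, run from the repo root).
# Adapted from skfeature/example/test_fisher_score.py.
import scipy.io
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from skfeature.function.similarity_based import fisher_score

mat = scipy.io.loadmat('skfeature/data/COIL20.mat')
X = mat['X'].astype(float)   # data matrix, shape (n_samples, n_features)
y = mat['Y'][:, 0]           # class labels
num_fea = 100                # number of features to keep

correct = 0.0
for train, test in KFold(n_splits=10, shuffle=True).split(X):
    # score and rank features using the training split only
    score = fisher_score.fisher_score(X[train], y[train])
    idx = fisher_score.feature_ranking(score)
    selected = X[:, idx[:num_fea]]

    # evaluate a linear SVM on the selected features
    clf = LinearSVC()
    clf.fit(selected[train], y[train])
    correct += accuracy_score(y[test], clf.predict(selected[test]))

print('Average accuracy over 10 folds:', correct / 10)
```

The unsupervised examples in the repository (`test_lap_score.py`, `test_MCFS.py`, `test_NDFS.py`, and so on) follow the same pattern but rank features without labels and evaluate the selection with k-means clustering instead of a classifier.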
8 |
9 | ## Installing scikit-feature
10 | ### Prerequisites:
11 | Python 2.7 *and Python 3*
12 |
13 | NumPy
14 |
15 | SciPy
16 |
17 | Scikit-learn
18 |
19 | ### Steps:
20 | For Linux users, you can install the repository with the following command:
21 |
22 |     python setup.py install
23 |
24 | For Windows users, you can install the repository with the following command:
25 |
26 |     setup.py install
27 |
28 | ## Project website
29 | Instructions for using this repository can be found on our project webpage at http://featureselection.asu.edu/
30 |
31 | ## Citation
32 |
33 | If you find the scikit-feature feature selection repository useful in your research, please consider citing the following paper:
34 |
35 |     @article{li2018feature,
36 |       title={Feature selection: A data perspective},
37 |       author={Li, Jundong and Cheng, Kewei and Wang, Suhang and Morstatter, Fred and Trevino, Robert P and Tang, Jiliang and Liu, Huan},
38 |       journal={ACM Computing Surveys (CSUR)},
39 |       volume={50},
40 |       number={6},
41 |       pages={94},
42 |       year={2018},
43 |       publisher={ACM}
44 |     }
45 |
46 | ## Contact
47 | Jundong Li
48 | E-mail: jundong@virginia.edu
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 |
3 | NAME = "skfeature"
4 |
5 | DESCRIPTION = "Feature Selection Repository in Python (DMML Lab@ASU)"
6 |
7 | KEYWORDS = "Feature Selection Repository"
8 |
9 | AUTHOR = "Jundong Li, Kewei Cheng, Suhang Wang"
10 |
11 | AUTHOR_EMAIL = "jundong.li@asu.edu, kcheng18@asu.edu, suhang.wang@asu.edu"
12 |
13 | URL = "https://github.com/jundongl/scikit-feature"
14 |
15 | VERSION = "1.0.0"
16 |
17 |
18 | setup(
19 |     name = NAME,
20 |     version = VERSION,
21 |     description = DESCRIPTION,
22 |     keywords = KEYWORDS,
23 |     author = AUTHOR,
24 |     author_email = AUTHOR_EMAIL,
25 |     url = URL,
26 |     packages =['skfeature', 'skfeature.utility','skfeature.function','skfeature.function.information_theoretical_based','skfeature.function.similarity_based','skfeature.function.sparse_learning_based','skfeature.function.statistical_based','skfeature.function.streaming','skfeature.function.structure','skfeature.function.wrapper',] ,
27 | )
--------------------------------------------------------------------------------
/skfeature/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/__init__.py
--------------------------------------------------------------------------------
/skfeature/data/ALLAML.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/ALLAML.mat
--------------------------------------------------------------------------------
/skfeature/data/BASEHOCK.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/BASEHOCK.mat
--------------------------------------------------------------------------------
/skfeature/data/CLL-SUB-111.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/CLL-SUB-111.mat
-------------------------------------------------------------------------------- /skfeature/data/COIL20.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/COIL20.mat -------------------------------------------------------------------------------- /skfeature/data/Carcinom.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Carcinom.mat -------------------------------------------------------------------------------- /skfeature/data/GLI-85.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/GLI-85.mat -------------------------------------------------------------------------------- /skfeature/data/GLIOMA.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/GLIOMA.mat -------------------------------------------------------------------------------- /skfeature/data/Isolet.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Isolet.mat -------------------------------------------------------------------------------- /skfeature/data/ORL.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/ORL.mat -------------------------------------------------------------------------------- /skfeature/data/PCMAC.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/PCMAC.mat -------------------------------------------------------------------------------- /skfeature/data/Prostate-GE.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Prostate-GE.mat -------------------------------------------------------------------------------- /skfeature/data/RELATHE.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/RELATHE.mat -------------------------------------------------------------------------------- /skfeature/data/SMK-CAN-187.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/SMK-CAN-187.mat -------------------------------------------------------------------------------- /skfeature/data/TOX-171.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/TOX-171.mat 
-------------------------------------------------------------------------------- /skfeature/data/USPS.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/USPS.mat -------------------------------------------------------------------------------- /skfeature/data/Yale.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/Yale.mat -------------------------------------------------------------------------------- /skfeature/data/arcene.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/arcene.mat -------------------------------------------------------------------------------- /skfeature/data/colon.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/colon.mat -------------------------------------------------------------------------------- /skfeature/data/gisette.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/gisette.mat -------------------------------------------------------------------------------- /skfeature/data/leukemia.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/leukemia.mat -------------------------------------------------------------------------------- /skfeature/data/lung.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lung.mat -------------------------------------------------------------------------------- /skfeature/data/lung_small.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lung_small.mat -------------------------------------------------------------------------------- /skfeature/data/lymphoma.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/lymphoma.mat -------------------------------------------------------------------------------- /skfeature/data/madelon.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/madelon.mat -------------------------------------------------------------------------------- /skfeature/data/nci9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/nci9.mat -------------------------------------------------------------------------------- 
/skfeature/data/orlraws10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/orlraws10P.mat -------------------------------------------------------------------------------- /skfeature/data/pixraw10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/pixraw10P.mat -------------------------------------------------------------------------------- /skfeature/data/warpAR10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/warpAR10P.mat -------------------------------------------------------------------------------- /skfeature/data/warpPIE10P.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/data/warpPIE10P.mat -------------------------------------------------------------------------------- /skfeature/example/test_CFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.statistical_based import CFS 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features on training set 27 | idx = CFS.cfs(X[train], y[train]) 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() -------------------------------------------------------------------------------- /skfeature/example/test_CIFE.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import CIFE 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and 
number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = CIFE.cife(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_CMIM.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import CMIM 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = CMIM.cmim(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_DISR.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import DISR 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, 
n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = DISR.disr(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_FCBF.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import FCBF 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx = FCBF.fcbf(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_ICAP.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import ICAP 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of 
selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = ICAP.icap(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_JMI.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import JMI 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = JMI.jmi(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_MCFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import MCFS 3 | from skfeature.utility import construct_W 4 | from skfeature.utility import unsupervised_evaluation 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # construct affinity matrix 16 | kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1} 17 | W = construct_W.construct_W(X, **kwargs) 18 | 19 | num_fea = 100 # specify the number of selected features 20 | num_cluster = 20 # specify the number of clusters, it is usually set as the number of classes in the ground truth 21 | 22 | # obtain the 
feature weight matrix 23 | Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=20) 24 | 25 | # sort the feature scores in an ascending order according to the feature scores 26 | idx = MCFS.feature_ranking(Weight) 27 | 28 | # obtain the dataset on the selected features 29 | selected_features = X[:, idx[0:num_fea]] 30 | 31 | # perform kmeans clustering based on the selected features and repeats 20 times 32 | nmi_total = 0 33 | acc_total = 0 34 | for i in range(0, 20): 35 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 36 | nmi_total += nmi 37 | acc_total += acc 38 | 39 | # output the average NMI and average ACC 40 | print 'NMI:', float(nmi_total)/20 41 | print 'ACC:', float(acc_total)/20 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /skfeature/example/test_MIFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MIFS 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx = MIFS.mifs(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | print acc 41 | correct = correct + acc 42 | 43 | # output the average classification accuracy over all 10 folds 44 | print 'Accuracy:', float(correct)/10 45 | 46 | if __name__ == '__main__': 47 | main() -------------------------------------------------------------------------------- /skfeature/example/test_MIM.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MIM 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on 
the training set 27 | idx,_,_ = MIM.mim(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_MRMR.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.information_theoretical_based import MRMR 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 10 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of each feature on the training set 27 | idx,_,_ = MRMR.mrmr(X[train], y[train], n_selected_features=num_fea) 28 | 29 | # obtain the dataset on the selected features 30 | features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_NDFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import NDFS 3 | from skfeature.utility import construct_W 4 | from skfeature.utility.sparse_learning import feature_ranking 5 | from skfeature.utility import unsupervised_evaluation 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | 16 | # construct affinity matrix 17 | kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1} 18 | W = construct_W.construct_W(X, **kwargs) 19 | 20 | # obtain the feature weight matrix 21 | Weight = NDFS.ndfs(X, W=W, n_clusters=20) 22 | 23 | # sort the feature scores in an ascending order according to the feature scores 24 | idx = feature_ranking(Weight) 25 | 26 | # perform evaluation on clustering task 27 | num_fea = 100 # number 
of selected features 28 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 29 | 30 | # obtain the dataset on the selected features 31 | selected_features = X[:, idx[0:num_fea]] 32 | 33 | # perform kmeans clustering based on the selected features and repeats 20 times 34 | nmi_total = 0 35 | acc_total = 0 36 | for i in range(0, 20): 37 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 38 | nmi_total += nmi 39 | acc_total += acc 40 | 41 | # output the average NMI and average ACC 42 | print 'NMI:', float(nmi_total)/20 43 | print 'ACC:', float(acc_total)/20 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_RFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.sparse_learning_based import RFS 6 | from skfeature.utility.sparse_learning import construct_label_matrix, feature_ranking 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix(y) 17 | n_samples, n_features = X.shape 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight = RFS.rfs(X[train, :], Y[train, :], gamma=0.1) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | print acc 46 | correct = correct + acc 47 | 48 | # output the average classification accuracy over all 10 folds 49 | print 'Accuracy:', float(correct)/10 50 | 51 | if __name__ == '__main__': 52 | main() -------------------------------------------------------------------------------- /skfeature/example/test_SPEC.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.similarity_based import SPEC 3 | from skfeature.utility import unsupervised_evaluation 4 | 5 | 6 | def main(): 7 | # load data 8 | mat = scipy.io.loadmat('../data/COIL20.mat') 9 | X = mat['X'] # data 10 | X = X.astype(float) 11 | y = mat['Y'] # label 12 | y = y[:, 0] 13 | 14 | # specify the second ranking function which uses all except the 1st eigenvalue 15 | kwargs = {'style': 0} 16 | 17 | # obtain the scores of features 18 | score = SPEC.spec(X, **kwargs) 19 | 20 | # sort the feature scores in an descending order according to the feature scores 21 | idx = SPEC.feature_ranking(score, **kwargs) 22 | 23 | # perform evaluation on clustering task 24 | 
num_fea = 100 # number of selected features 25 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 26 | 27 | # obtain the dataset on the selected features 28 | selected_features = X[:, idx[0:num_fea]] 29 | 30 | # perform kmeans clustering based on the selected features and repeats 20 times 31 | nmi_total = 0 32 | acc_total = 0 33 | for i in range(0, 20): 34 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 35 | nmi_total += nmi 36 | acc_total += acc 37 | 38 | # output the average NMI and average ACC 39 | print 'NMI:', float(nmi_total)/20 40 | print 'ACC:', float(acc_total)/20 41 | 42 | if __name__ == '__main__': 43 | main() -------------------------------------------------------------------------------- /skfeature/example/test_UDFS.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.sparse_learning_based import UDFS 3 | from skfeature.utility import unsupervised_evaluation 4 | from skfeature.utility.sparse_learning import feature_ranking 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # perform evaluation on clustering task 16 | num_fea = 100 # number of selected features 17 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 18 | 19 | # obtain the feature weight matrix 20 | Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster) 21 | 22 | # sort the feature scores in an ascending order according to the feature scores 23 | idx = feature_ranking(Weight) 24 | 25 | # obtain the dataset on the selected features 26 | selected_features = X[:, idx[0:num_fea]] 27 | 28 | # perform kmeans clustering based on the selected features and repeats 20 times 29 | nmi_total = 0 30 | acc_total = 0 31 | for i in range(0, 20): 32 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 33 | nmi_total += nmi 34 | acc_total += acc 35 | 36 | # output the average NMI and average ACC 37 | print 'NMI:', float(nmi_total)/20 38 | print 'ACC:', float(acc_total)/20 39 | 40 | if __name__ == '__main__': 41 | main() -------------------------------------------------------------------------------- /skfeature/example/test_alpha_investing.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn.metrics import accuracy_score 4 | from skfeature.function.streaming import alpha_investing 5 | from sklearn import svm 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | y = y.astype(float) 16 | n_samples, n_features = X.shape # number of samples and number of features 17 | 18 | # split data into 10 folds 19 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 20 | 21 | # perform evaluation on classification task 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features 27 | idx = alpha_investing.alpha_investing(X[train], y[train], 0.05, 0.05) 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx] 31 | 32 | # train a classification model with the 
selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | main() -------------------------------------------------------------------------------- /skfeature/example/test_chi_square.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import chi_square 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the chi-square score of each feature 27 | score = chi_square.chi_square(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = chi_square.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_decision_tree_backward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import decision_tree_backward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = decision_tree_backward.decision_tree_backward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the 
training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_decision_tree_forward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import decision_tree_forward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = decision_tree_forward.decision_tree_forward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /skfeature/example/test_f_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import f_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the f-score of each feature 27 | score = f_score.f_score(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = f_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict 
the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_fisher_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import fisher_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the score of each feature on the training set 27 | score = fisher_score.fisher_score(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = fisher_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() 50 | 51 | -------------------------------------------------------------------------------- /skfeature/example/test_gini_index.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn.metrics import accuracy_score 4 | from skfeature.function.statistical_based import gini_index 5 | from sklearn import cross_validation 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/colon.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the gini_index score of each feature 27 | score = gini_index.gini_index(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = gini_index.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a 
classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() 50 | 51 | -------------------------------------------------------------------------------- /skfeature/example/test_group_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import rand 3 | from skfeature.function.structure import group_fs 4 | 5 | 6 | def main(): 7 | n_samples = 50 # specify the number of samples in the simulated data 8 | n_features = 100 # specify the number of features in the simulated data 9 | 10 | # simulate the dataset 11 | X = np.random.rand(n_samples, n_features) 12 | 13 | # simulate the feature weight 14 | w_orin = rand(n_features, 1, 1).toarray() 15 | w_orin[0:50] = 0 16 | 17 | # obtain the ground truth of the simulated dataset 18 | noise = np.random.rand(n_samples, 1) 19 | y = np.dot(X, w_orin) + 0.01 * noise 20 | y = y[:, 0] 21 | 22 | z1 = 0.1 # specify the regularization parameter of L1 norm 23 | z2 = 0.1 # specify the regularization parameter of L2 norm for the non-overlapping group 24 | 25 | # specify the group structure among features 26 | idx = np.array([[1, 20, np.sqrt(20)], [21, 40, np.sqrt(20)], [41, 50, np.sqrt(10)], 27 | [51, 70, np.sqrt(20)], [71, 100, np.sqrt(30)]]).T 28 | idx = idx.astype(int) 29 | 30 | # perform feature selection and obtain the feature weight of all the features 31 | w, obj, value_gamma = group_fs.group_fs(X, y, z1, z2, idx, verbose=True) 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /skfeature/example/test_lap_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.similarity_based import lap_score 3 | from skfeature.utility import construct_W 4 | from skfeature.utility import unsupervised_evaluation 5 | 6 | 7 | def main(): 8 | # load data 9 | mat = scipy.io.loadmat('../data/COIL20.mat') 10 | X = mat['X'] # data 11 | X = X.astype(float) 12 | y = mat['Y'] # label 13 | y = y[:, 0] 14 | 15 | # construct affinity matrix 16 | kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1} 17 | W = construct_W.construct_W(X, **kwargs_W) 18 | 19 | # obtain the scores of features 20 | score = lap_score.lap_score(X, W=W) 21 | 22 | # sort the feature scores in an ascending order according to the feature scores 23 | idx = lap_score.feature_ranking(score) 24 | 25 | # perform evaluation on clustering task 26 | num_fea = 100 # number of selected features 27 | num_cluster = 20 # number of clusters, it is usually set as the number of classes in the ground truth 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # perform kmeans clustering based on the selected features and repeats 20 times 33 | nmi_total = 0 34 | acc_total = 0 35 | for i in range(0, 20): 36 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 37 | nmi_total += 
nmi 38 | acc_total += acc 39 | 40 | # output the average NMI and average ACC 41 | print 'NMI:', float(nmi_total)/20 42 | print 'ACC:', float(acc_total)/20 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_ll_l21.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.utility.sparse_learning import * 6 | from skfeature.function.sparse_learning_based import ll_l21 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix_pan(y) 17 | n_samples, n_features = X.shape # number of samples and number of features 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight, obj, value_gamma = ll_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | correct = correct + acc 46 | 47 | # output the average classification accuracy over all 10 folds 48 | print 'Accuracy:', float(correct)/10 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /skfeature/example/test_low_variance.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from skfeature.function.statistical_based import low_variance 3 | from skfeature.utility import unsupervised_evaluation 4 | 5 | 6 | def main(): 7 | # load data 8 | mat = scipy.io.loadmat('../data/BASEHOCK.mat') 9 | X = mat['X'] # data 10 | X = X.astype(float) 11 | y = mat['Y'] # label 12 | y = y[:, 0] 13 | 14 | p = 0.1 # specify the threshold p to be 0.1 15 | num_cluster = 2 # specify the number of clusters to be 2 16 | 17 | # perform feature selection and obtain the dataset on the selected features 18 | selected_features = low_variance.low_variance_feature_selection(X, p*(1-p)) 19 | 20 | # perform kmeans clustering based on the selected features and repeats 20 times 21 | nmi_total = 0 22 | acc_total = 0 23 | for i in range(0, 20): 24 | nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y) 25 | nmi_total += nmi 26 | acc_total += acc 27 | 28 | # output the average NMI and average ACC 29 | print 'NMI:', float(nmi_total)/20 30 | print 'ACC:', float(acc_total)/20 31 | 32 | if __name__ == '__main__': 33 | main() 
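As an aside, the variance filter used in test_low_variance.py above has a close analogue in scikit-learn itself; the sketch below is illustrative only (it is not part of the repository and uses randomly generated placeholder 0/1 data instead of BASEHOCK.mat) and shows how the same p*(1-p) threshold could be cross-checked with sklearn.feature_selection.VarianceThreshold.

# Illustrative sketch, not part of the repository: the same variance threshold via
# scikit-learn's VarianceThreshold. X here is placeholder Boolean-style data.
import numpy as np
from sklearn.feature_selection import VarianceThreshold

X = (np.random.rand(200, 50) > 0.5).astype(float)   # placeholder 0/1 feature matrix
p = 0.1                                              # same threshold parameter as in the example above
selector = VarianceThreshold(threshold=p * (1 - p))
X_selected = selector.fit_transform(X)               # analogous to low_variance_feature_selection(X, p*(1-p))
kept = selector.get_support(indices=True)            # indices of the features that survive the filter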
-------------------------------------------------------------------------------- /skfeature/example/test_ls_l21.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import svm 3 | from sklearn import cross_validation 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.utility.sparse_learning import * 6 | from skfeature.function.sparse_learning_based import ls_l21 7 | 8 | 9 | def main(): 10 | # load data 11 | mat = scipy.io.loadmat('../data/COIL20.mat') 12 | X = mat['X'] # data 13 | X = X.astype(float) 14 | y = mat['Y'] # label 15 | y = y[:, 0] 16 | Y = construct_label_matrix_pan(y) 17 | n_samples, n_features = X.shape # number of samples and number of features 18 | 19 | # split data into 10 folds 20 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 21 | 22 | # perform evaluation on classification task 23 | num_fea = 100 # number of selected features 24 | clf = svm.LinearSVC() # linear SVM 25 | 26 | correct = 0 27 | for train, test in ss: 28 | # obtain the feature weight matrix 29 | Weight, obj, value_gamma = ls_l21.proximal_gradient_descent(X[train], Y[train], 0.1, verbose=False) 30 | 31 | # sort the feature scores in an ascending order according to the feature scores 32 | idx = feature_ranking(Weight) 33 | 34 | # obtain the dataset on the selected features 35 | selected_features = X[:, idx[0:num_fea]] 36 | 37 | # train a classification model with the selected features on the training dataset 38 | clf.fit(selected_features[train], y[train]) 39 | 40 | # predict the class labels of test data 41 | y_predict = clf.predict(selected_features[test]) 42 | 43 | # obtain the classification accuracy on the test data 44 | acc = accuracy_score(y[test], y_predict) 45 | correct = correct + acc 46 | 47 | # output the average classification accuracy over all 10 folds 48 | print 'Accuracy:', float(correct)/10 49 | 50 | if __name__ == '__main__': 51 | main() -------------------------------------------------------------------------------- /skfeature/example/test_reliefF.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import reliefF 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the score of each feature on the training set 27 | score = reliefF.reliefF(X[train], y[train]) 28 | 29 | # rank features in descending order according to score 30 | idx = reliefF.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test 
data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_svm_backward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import svm_backward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = svm_backward.svm_backward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /skfeature/example/test_svm_forward.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.cross_validation import KFold 3 | from skfeature.function.wrapper import svm_forward 4 | from sklearn import svm 5 | from sklearn.metrics import accuracy_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | clf = svm.LinearSVC() # linear SVM 22 | 23 | correct = 0 24 | for train, test in ss: 25 | # obtain the idx of selected features from the training set 26 | idx = svm_forward.svm_forward(X[train], y[train], n_features) 27 | 28 | # obtain the dataset on the selected features 29 | X_selected = X[:, idx] 30 | 31 | # train a classification model with the selected features on the training dataset 32 | clf.fit(X_selected[train], y[train]) 33 | 34 | # predict the class labels of test data 35 | y_predict = clf.predict(X_selected[test]) 36 | 37 | # obtain the classification accuracy on the test data 38 | acc = accuracy_score(y[test], y_predict) 39 | correct = correct + acc 40 | 41 | # output the average classification accuracy over all 10 folds 42 | print 'Accuracy:', float(correct)/10 43 | 44 | if __name__ == '__main__': 45 | main() 46 | 
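The classification examples above all follow the same pattern: rank features on the training folds, keep the top num_fea, train a linear SVM, and average accuracy over the 10 folds. They target the old sklearn.cross_validation API (removed in scikit-learn 0.20) and Python 2 print statements. The sketch below is illustrative only: it reruns the fisher score example with the current sklearn.model_selection API, and assumes X and y have already been loaded from one of the .mat files as in the scripts above.

# Sketch of the shared 10-fold evaluation loop on a modern scikit-learn stack.
# X, y are assumed to be the data/label arrays loaded as in the example scripts.
from sklearn.model_selection import KFold    # replaces sklearn.cross_validation.KFold
from sklearn import svm
from sklearn.metrics import accuracy_score
from skfeature.function.similarity_based import fisher_score

num_fea = 100                                # number of selected features
clf = svm.LinearSVC()                        # linear SVM
kf = KFold(n_splits=10, shuffle=True)        # n_folds is now called n_splits
correct = 0
for train, test in kf.split(X):              # split() yields the train/test index arrays
    score = fisher_score.fisher_score(X[train], y[train])   # score features on the training folds
    idx = fisher_score.feature_ranking(score)               # rank them in descending order
    selected_features = X[:, idx[0:num_fea]]
    clf.fit(selected_features[train], y[train])
    correct += accuracy_score(y[test], clf.predict(selected_features[test]))
print('Accuracy:', correct / 10)             # print is a function in Python 3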
-------------------------------------------------------------------------------- /skfeature/example/test_t_score.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn.metrics import accuracy_score 3 | from sklearn import cross_validation 4 | from sklearn import svm 5 | from skfeature.function.statistical_based import t_score 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the t-score of each feature 27 | score = t_score.t_score(X, y) 28 | 29 | # rank features in descending order according to score 30 | idx = t_score.feature_ranking(score) 31 | 32 | # obtain the dataset on the selected features 33 | selected_features = X[:, idx[0:num_fea]] 34 | 35 | # train a classification model with the selected features on the training dataset 36 | clf.fit(selected_features[train], y[train]) 37 | 38 | # predict the class labels of test data 39 | y_predict = clf.predict(selected_features[test]) 40 | 41 | # obtain the classification accuracy on the test data 42 | acc = accuracy_score(y[test], y_predict) 43 | correct = correct + acc 44 | 45 | # output the average classification accuracy over all 10 folds 46 | print 'Accuracy:', float(correct)/10 47 | 48 | if __name__ == '__main__': 49 | main() -------------------------------------------------------------------------------- /skfeature/example/test_trace_ratio.py: -------------------------------------------------------------------------------- 1 | import scipy.io 2 | from sklearn import cross_validation 3 | from sklearn import svm 4 | from sklearn.metrics import accuracy_score 5 | from skfeature.function.similarity_based import trace_ratio 6 | 7 | 8 | def main(): 9 | # load data 10 | mat = scipy.io.loadmat('../data/COIL20.mat') 11 | X = mat['X'] # data 12 | X = X.astype(float) 13 | y = mat['Y'] # label 14 | y = y[:, 0] 15 | n_samples, n_features = X.shape # number of samples and number of features 16 | 17 | # split data into 10 folds 18 | ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True) 19 | 20 | # perform evaluation on classification task 21 | num_fea = 100 # number of selected features 22 | clf = svm.LinearSVC() # linear SVM 23 | 24 | correct = 0 25 | for train, test in ss: 26 | # obtain the index of selected features 27 | idx, feature_score, subset_score = trace_ratio.trace_ratio(X[train], y[train], num_fea, style='fisher') 28 | 29 | # obtain the dataset on the selected features 30 | selected_features = X[:, idx[0:num_fea]] 31 | 32 | # train a classification model with the selected features on the training dataset 33 | clf.fit(selected_features[train], y[train]) 34 | 35 | # predict the class labels of test data 36 | y_predict = clf.predict(selected_features[test]) 37 | 38 | # obtain the classification accuracy on the test data 39 | acc = accuracy_score(y[test], y_predict) 40 | correct = correct + acc 41 | 42 | # output the average classification accuracy over all 10 folds 43 | print 'Accuracy:', float(correct)/10 44 | 45 | if __name__ == '__main__': 46 | 
main() 47 | -------------------------------------------------------------------------------- /skfeature/example/test_tree_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import rand 3 | from skfeature.function.structure import tree_fs 4 | 5 | 6 | def main(): 7 | n_samples = 50 # specify the number of samples in the simulated data 8 | n_features = 100 # specify the number of features in the simulated data 9 | 10 | # simulate the dataset 11 | X = np.random.rand(n_samples, n_features) 12 | 13 | # simulate the feature weight 14 | w_orin = rand(n_features, 1, 1).toarray() 15 | w_orin[0:50] = 0 16 | 17 | # obtain the ground truth of the simulated dataset 18 | noise = np.random.rand(n_samples, 1) 19 | y = np.dot(X, w_orin) + 0.01 * noise 20 | y = y[:, 0] 21 | 22 | 23 | z = 0.01 # specify the regularization parameter of regularization parameter of L2 norm for the non-overlapping group 24 | 25 | # specify the tree structure among features 26 | idx = np.array([[-1, -1, 1], [1, 20, np.sqrt(20)], [21, 40, np.sqrt(20)], [41, 50, np.sqrt(10)], 27 | [51, 70, np.sqrt(20)], [71, 100, np.sqrt(30)], [1, 50, np.sqrt(50)], [51, 100, np.sqrt(50)]]).T 28 | idx = idx.astype(int) 29 | 30 | # perform feature selection and obtain the feature weight of all the features 31 | w, obj, value_gamma = tree_fs.tree_fs(X, y, z, idx, verbose=True) 32 | 33 | 34 | if __name__ == '__main__': 35 | main() 36 | -------------------------------------------------------------------------------- /skfeature/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/__init__.py -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/CIFE.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def cife(X, y, **kwargs): 5 | """ 6 | This function implements the CIFE feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
30 | """ 31 | 32 | if 'n_selected_features' in kwargs.keys(): 33 | n_selected_features = kwargs['n_selected_features'] 34 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=1, gamma=1, n_selected_features=n_selected_features) 35 | else: 36 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=1, gamma=1) 37 | return F, J_CMI, MIfy 38 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/CMIM.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def cmim(X, y, **kwargs): 5 | """ 6 | This function implements the CMIM feature selection. 7 | The scoring criteria is calculated based on the formula j_cmim=I(f;y)-max_j(I(fj;f)-I(fj;f|y)) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | Input data, guaranteed to be a discrete numpy array 13 | y: {numpy array}, shape (n_samples,) 14 | guaranteed to be a numpy array 15 | kwargs: {dictionary} 16 | n_selected_features: {int} 17 | number of features to select 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | J_CMIM: {numpy array}, shape: (n_features,) 24 | corresponding objective function value of selected features 25 | MIfy: {numpy array}, shape: (n_features,) 26 | corresponding mutual information between selected features and response 27 | 28 | Reference 29 | --------- 30 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 31 | """ 32 | 33 | n_samples, n_features = X.shape 34 | # index of selected features, initialized to be empty 35 | F = [] 36 | # Objective function value for selected features 37 | J_CMIM = [] 38 | # Mutual information between feature and response 39 | MIfy = [] 40 | # indicate whether the user specifies the number of features 41 | is_n_selected_features_specified = False 42 | 43 | if 'n_selected_features' in kwargs.keys(): 44 | n_selected_features = kwargs['n_selected_features'] 45 | is_n_selected_features_specified = True 46 | 47 | # t1 stores I(f;y) for each feature f 48 | t1 = np.zeros(n_features) 49 | 50 | # max stores max(I(fj;f)-I(fj;f|y)) for each feature f 51 | # we assign an extreme small value to max[i] ito make it is smaller than possible value of max(I(fj;f)-I(fj;f|y)) 52 | max = -10000000*np.ones(n_features) 53 | for i in range(n_features): 54 | f = X[:, i] 55 | t1[i] = midd(f, y) 56 | 57 | # make sure that j_cmi is positive at the very beginning 58 | j_cmim = 1 59 | 60 | while True: 61 | if len(F) == 0: 62 | # select the feature whose mutual information is the largest 63 | idx = np.argmax(t1) 64 | F.append(idx) 65 | J_CMIM.append(t1[idx]) 66 | MIfy.append(t1[idx]) 67 | f_select = X[:, idx] 68 | 69 | if is_n_selected_features_specified: 70 | if len(F) == n_selected_features: 71 | break 72 | else: 73 | if j_cmim <= 0: 74 | break 75 | 76 | # we assign an extreme small value to j_cmim to ensure it is smaller than all possible values of j_cmim 77 | j_cmim = -1000000000000 78 | for i in range(n_features): 79 | if i not in F: 80 | f = X[:, i] 81 | t2 = midd(f_select, f) 82 | t3 = cmidd(f_select, f, y) 83 | if t2-t3 > max[i]: 84 | max[i] = t2-t3 85 | # calculate j_cmim for feature i (not in F) 86 | t = t1[i] - max[i] 87 | # record the largest j_cmim and the corresponding feature index 88 | if t > j_cmim: 89 | j_cmim = t 90 | idx = i 91 | F.append(idx) 92 | 
J_CMIM.append(j_cmim) 93 | MIfy.append(t1[idx]) 94 | f_select = X[:, idx] 95 | 96 | return np.array(F), np.array(J_CMIM), np.array(MIfy) -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/DISR.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | from skfeature.utility.mutual_information import conditional_entropy 3 | 4 | 5 | def disr(X, y, **kwargs): 6 | """ 7 | This function implement the DISR feature selection. 8 | The scoring criteria is calculated based on the formula j_disr=sum_j(I(f,fj;y)/H(f,fj,y)) 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data, guaranteed to be a discrete data matrix 14 | y: {numpy array}, shape (n_samples,) 15 | input class labels 16 | 17 | kwargs: {dictionary} 18 | n_selected_features: {int} 19 | number of features to select 20 | 21 | Output 22 | ------ 23 | F: {numpy array}, shape (n_features, ) 24 | index of selected features, F[0] is the most important feature 25 | J_DISR: {numpy array}, shape: (n_features,) 26 | corresponding objective function value of selected features 27 | MIfy: {numpy array}, shape: (n_features,) 28 | corresponding mutual information between selected features and response 29 | 30 | Reference 31 | --------- 32 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 33 | """ 34 | 35 | n_samples, n_features = X.shape 36 | # index of selected features, initialized to be empty 37 | F = [] 38 | # Objective function value for selected features 39 | J_DISR = [] 40 | # Mutual information between feature and response 41 | MIfy = [] 42 | # indicate whether the user specifies the number of features 43 | is_n_selected_features_specified = False 44 | 45 | if 'n_selected_features' in kwargs.keys(): 46 | n_selected_features = kwargs['n_selected_features'] 47 | is_n_selected_features_specified = True 48 | 49 | # sum stores sum_j(I(f,fj;y)/H(f,fj,y)) for each feature f 50 | sum = np.zeros(n_features) 51 | 52 | # make sure that j_cmi is positive at the very beginning 53 | j_disr = 1 54 | 55 | while True: 56 | if len(F) == 0: 57 | # t1 stores I(f;y) for each feature f 58 | t1 = np.zeros(n_features) 59 | for i in range(n_features): 60 | f = X[:, i] 61 | t1[i] = midd(f, y) 62 | # select the feature whose mutual information is the largest 63 | idx = np.argmax(t1) 64 | F.append(idx) 65 | J_DISR.append(t1[idx]) 66 | MIfy.append(t1[idx]) 67 | f_select = X[:, idx] 68 | 69 | if is_n_selected_features_specified is True: 70 | if len(F) == n_selected_features: 71 | break 72 | if is_n_selected_features_specified is not True: 73 | if j_disr <= 0: 74 | break 75 | 76 | # we assign an extreme small value to j_disr to ensure that it is smaller than all possible value of j_disr 77 | j_disr = -1E30 78 | for i in range(n_features): 79 | if i not in F: 80 | f = X[:, i] 81 | t2 = midd(f_select, y) + cmidd(f, y, f_select) 82 | t3 = entropyd(f) + conditional_entropy(f_select, f) + (conditional_entropy(y, f_select) - cmidd(y, f, f_select)) 83 | sum[i] += np.true_divide(t2, t3) 84 | # record the largest j_disr and the corresponding feature index 85 | if sum[i] > j_disr: 86 | j_disr = sum[i] 87 | idx = i 88 | F.append(idx) 89 | J_DISR.append(j_disr) 90 | MIfy.append(t1[idx]) 91 | f_select = X[:, idx] 92 | 93 | return np.array(F), np.array(J_DISR), np.array(MIfy) 94 | 95 | 
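Like the other information-theoretic criteria in this package, disr expects an already-discrete feature matrix and can be called directly. The minimal usage sketch below is illustrative only: X_discrete and y are random placeholders standing in for a discretized dataset (e.g. the output of skfeature.utility.data_discretization or one of the bundled .mat files).

# Usage sketch for disr(); X_discrete and y are placeholder arrays, not real data.
import numpy as np
from skfeature.function.information_theoretical_based import DISR

X_discrete = np.random.randint(0, 3, size=(80, 20))   # 80 samples, 20 discrete features
y = np.random.randint(0, 2, size=80)                   # binary class labels
F, J_DISR, MIfy = DISR.disr(X_discrete, y, n_selected_features=5)
print(F)       # indices of the 5 selected features, F[0] is the most informative one
print(J_DISR)  # objective value j_disr = sum_j(I(f,fj;y)/H(f,fj,y)) recorded at each selection step
print(MIfy)    # mutual information I(f;y) for each selected feature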
-------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/FCBF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.mutual_information import su_calculation 3 | 4 | 5 | def fcbf(X, y, **kwargs): 6 | """ 7 | This function implements Fast Correlation Based Filter algorithm 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be discrete 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | delta: {float} 17 | delta is a threshold parameter, the default value of delta is 0 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | SU: {numpy array}, shape (n_features,) 24 | symmetrical uncertainty of selected features 25 | 26 | Reference 27 | --------- 28 | Yu, Lei and Liu, Huan. "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution." ICML 2003. 29 | """ 30 | 31 | n_samples, n_features = X.shape 32 | if 'delta' in kwargs.keys(): 33 | delta = kwargs['delta'] 34 | else: 35 | # the default value of delta is 0 36 | delta = 0 37 | 38 | # t1[:,0] stores index of features, t1[:,1] stores symmetrical uncertainty of features 39 | t1 = np.zeros((n_features, 2), dtype='object') 40 | for i in range(n_features): 41 | f = X[:, i] 42 | t1[i, 0] = i 43 | t1[i, 1] = su_calculation(f, y) 44 | s_list = t1[t1[:, 1] > delta, :] 45 | # index of selected features, initialized to be empty 46 | F = [] 47 | # Symmetrical uncertainty of selected features 48 | SU = [] 49 | while len(s_list) != 0: 50 | # select the largest su inside s_list 51 | idx = np.argmax(s_list[:, 1]) 52 | # record the index of the feature with the largest su 53 | fp = X[:, s_list[idx, 0]] 54 | np.delete(s_list, idx, 0) 55 | F.append(s_list[idx, 0]) 56 | SU.append(s_list[idx, 1]) 57 | for i in s_list[:, 0]: 58 | fi = X[:, i] 59 | if su_calculation(fp, fi) >= t1[i, 1]: 60 | # construct the mask for feature whose su is larger than su(fp,y) 61 | idx = s_list[:, 0] != i 62 | idx = np.array([idx, idx]) 63 | idx = np.transpose(idx) 64 | # delete the feature by using the mask 65 | s_list = s_list[idx] 66 | length = len(s_list)//2 67 | s_list = s_list.reshape((length, 2)) 68 | return np.array(F, dtype=int), np.array(SU) 69 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/ICAP.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def icap(X, y, **kwargs): 5 | """ 6 | This function implements the ICAP feature selection.
7 | The scoring criteria is calculated based on the formula j_icap = I(f;y) - max_j(0,(I(fj;f)-I(fj;f|y))) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be a discrete data matrix 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | n_selected_features: {int} 17 | number of features to select 18 | 19 | Output 20 | ------ 21 | F: {numpy array}, shape (n_features,) 22 | index of selected features, F[0] is the most important feature 23 | J_ICAP: {numpy array}, shape: (n_features,) 24 | corresponding objective function value of selected features 25 | MIfy: {numpy array}, shape: (n_features,) 26 | corresponding mutual information between selected features and response 27 | """ 28 | n_samples, n_features = X.shape 29 | # index of selected features, initialized to be empty 30 | F = [] 31 | # Objective function value for selected features 32 | J_ICAP = [] 33 | # Mutual information between feature and response 34 | MIfy = [] 35 | # indicate whether the user specifies the number of features 36 | is_n_selected_features_specified = False 37 | if 'n_selected_features' in kwargs.keys(): 38 | n_selected_features = kwargs['n_selected_features'] 39 | is_n_selected_features_specified = True 40 | 41 | # t1 contains I(f;y) for each feature f 42 | t1 = np.zeros(n_features) 43 | # max contains max_j(0,(I(fj;f)-I(fj;f|y))) for each feature f 44 | max = np.zeros(n_features) 45 | for i in range(n_features): 46 | f = X[:, i] 47 | t1[i] = midd(f, y) 48 | 49 | # make sure that j_cmi is positive at the very beginning 50 | j_icap = 1 51 | 52 | while True: 53 | if len(F) == 0: 54 | # select the feature whose mutual information is the largest 55 | idx = np.argmax(t1) 56 | F.append(idx) 57 | J_ICAP.append(t1[idx]) 58 | MIfy.append(t1[idx]) 59 | f_select = X[:, idx] 60 | 61 | if is_n_selected_features_specified is True: 62 | if len(F) == n_selected_features: 63 | break 64 | if is_n_selected_features_specified is not True: 65 | if j_icap <= 0: 66 | break 67 | 68 | # we assign an extreme small value to j_icap to ensure it is smaller than all possible values of j_icap 69 | j_icap = -1000000000000 70 | for i in range(n_features): 71 | if i not in F: 72 | f = X[:, i] 73 | t2 = midd(f_select, f) 74 | t3 = cmidd(f_select, f, y) 75 | if t2-t3 > max[i]: 76 | max[i] = t2-t3 77 | # calculate j_icap for feature i (not in F) 78 | t = t1[i] - max[i] 79 | # record the largest j_icap and the corresponding feature index 80 | if t > j_icap: 81 | j_icap = t 82 | idx = i 83 | F.append(idx) 84 | J_ICAP.append(j_icap) 85 | MIfy.append(t1[idx]) 86 | f_select = X[:, idx] 87 | 88 | return np.array(F), np.array(J_ICAP), np.array(MIfy) 89 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/JMI.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def jmi(X, y, **kwargs): 5 | """ 6 | This function implements the JMI feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: 
{numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | if 'n_selected_features' in kwargs.keys(): 32 | n_selected_features = kwargs['n_selected_features'] 33 | F, J_CMI, MIfy = LCSI.lcsi(X, y, function_name='JMI', n_selected_features=n_selected_features) 34 | else: 35 | F, J_CMI, MIfy = LCSI.lcsi(X, y, function_name='JMI') 36 | return F, J_CMI, MIfy -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/LCSI.py: -------------------------------------------------------------------------------- 1 | from skfeature.utility.entropy_estimators import * 2 | 3 | 4 | def lcsi(X, y, **kwargs): 5 | """ 6 | This function implements the basic scoring criteria for linear combination of shannon information term. 7 | The scoring criteria is calculated based on the formula j_cmi=I(f;y)-beta*sum_j(I(fj;f))+gamma*sum(I(fj;f|y)) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data, guaranteed to be a discrete data matrix 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | Parameters for different feature selection algorithms. 17 | beta: {float} 18 | beta is the parameter in j_cmi=I(f;y)-beta*sum(I(fj;f))+gamma*sum(I(fj;f|y)) 19 | gamma: {float} 20 | gamma is the parameter in j_cmi=I(f;y)-beta*sum(I(fj;f))+gamma*sum(I(fj;f|y)) 21 | function_name: {string} 22 | name of the feature selection function 23 | n_selected_features: {int} 24 | number of features to select 25 | 26 | Output 27 | ------ 28 | F: {numpy array}, shape: (n_features,) 29 | index of selected features, F[0] is the most important feature 30 | J_CMI: {numpy array}, shape: (n_features,) 31 | corresponding objective function value of selected features 32 | MIfy: {numpy array}, shape: (n_features,) 33 | corresponding mutual information between selected features and response 34 | 35 | Reference 36 | --------- 37 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
38 | """ 39 | 40 | n_samples, n_features = X.shape 41 | # index of selected features, initialized to be empty 42 | F = [] 43 | # Objective function value for selected features 44 | J_CMI = [] 45 | # Mutual information between feature and response 46 | MIfy = [] 47 | # indicate whether the user specifies the number of features 48 | is_n_selected_features_specified = False 49 | # initialize the parameters 50 | if 'beta' in kwargs.keys(): 51 | beta = kwargs['beta'] 52 | if 'gamma' in kwargs.keys(): 53 | gamma = kwargs['gamma'] 54 | if 'n_selected_features' in kwargs.keys(): 55 | n_selected_features = kwargs['n_selected_features'] 56 | is_n_selected_features_specified = True 57 | 58 | # select the feature whose j_cmi is the largest 59 | # t1 stores I(f;y) for each feature f 60 | t1 = np.zeros(n_features) 61 | # t2 stores sum_j(I(fj;f)) for each feature f 62 | t2 = np.zeros(n_features) 63 | # t3 stores sum_j(I(fj;f|y)) for each feature f 64 | t3 = np.zeros(n_features) 65 | for i in range(n_features): 66 | f = X[:, i] 67 | t1[i] = midd(f, y) 68 | 69 | # make sure that j_cmi is positive at the very beginning 70 | j_cmi = 1 71 | 72 | while True: 73 | if len(F) == 0: 74 | # select the feature whose mutual information is the largest 75 | idx = np.argmax(t1) 76 | F.append(idx) 77 | J_CMI.append(t1[idx]) 78 | MIfy.append(t1[idx]) 79 | f_select = X[:, idx] 80 | 81 | if is_n_selected_features_specified: 82 | if len(F) == n_selected_features: 83 | break 84 | else: 85 | if j_cmi < 0: 86 | break 87 | 88 | # we assign an extreme small value to j_cmi to ensure it is smaller than all possible values of j_cmi 89 | j_cmi = -1E30 90 | if 'function_name' in kwargs.keys(): 91 | if kwargs['function_name'] == 'MRMR': 92 | beta = 1.0 / len(F) 93 | elif kwargs['function_name'] == 'JMI': 94 | beta = 1.0 / len(F) 95 | gamma = 1.0 / len(F) 96 | for i in range(n_features): 97 | if i not in F: 98 | f = X[:, i] 99 | t2[i] += midd(f_select, f) 100 | t3[i] += cmidd(f_select, f, y) 101 | # calculate j_cmi for feature i (not in F) 102 | t = t1[i] - beta*t2[i] + gamma*t3[i] 103 | # record the largest j_cmi and the corresponding feature index 104 | if t > j_cmi: 105 | j_cmi = t 106 | idx = i 107 | F.append(idx) 108 | J_CMI.append(j_cmi) 109 | MIfy.append(t1[idx]) 110 | f_select = X[:, idx] 111 | 112 | return np.array(F), np.array(J_CMI), np.array(MIfy) 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MIFS.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mifs(X, y, **kwargs): 5 | """ 6 | This function implements the MIFS feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. 
"Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | 32 | if 'beta' not in kwargs.keys(): 33 | beta = 0.5 34 | else: 35 | beta = kwargs['beta'] 36 | if 'n_selected_features' in kwargs.keys(): 37 | n_selected_features = kwargs['n_selected_features'] 38 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0, n_selected_features=n_selected_features) 39 | else: 40 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0) 41 | return F, J_CMI, MIfy 42 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MIM.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mim(X, y, **kwargs): 5 | """ 6 | This function implements the MIM feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features, ) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 30 | """ 31 | 32 | if 'n_selected_features' in kwargs.keys(): 33 | n_selected_features = kwargs['n_selected_features'] 34 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=0, gamma=0, n_selected_features=n_selected_features) 35 | else: 36 | F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=0, gamma=0) 37 | return F, J_CMI, MIfy 38 | -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/MRMR.py: -------------------------------------------------------------------------------- 1 | from skfeature.function.information_theoretical_based import LCSI 2 | 3 | 4 | def mrmr(X, y, **kwargs): 5 | """ 6 | This function implements the MRMR feature selection 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data, guaranteed to be discrete 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | kwargs: {dictionary} 15 | n_selected_features: {int} 16 | number of features to select 17 | 18 | Output 19 | ------ 20 | F: {numpy array}, shape (n_features,) 21 | index of selected features, F[0] is the most important feature 22 | J_CMI: {numpy array}, shape: (n_features,) 23 | corresponding objective function value of selected features 24 | MIfy: {numpy array}, shape: (n_features,) 25 | corresponding mutual information between selected features and response 26 | 27 | Reference 28 | --------- 29 | Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. 
30 | """ 31 | if 'n_selected_features' in kwargs.keys(): 32 | n_selected_features = kwargs['n_selected_features'] 33 | F, J_CMI, MIfy = LCSI.lcsi(X, y, gamma=0, function_name='MRMR', n_selected_features=n_selected_features) 34 | else: 35 | F, J_CMI, MIfy = LCSI.lcsi(X, y, gamma=0, function_name='MRMR') 36 | return F, J_CMI, MIfy -------------------------------------------------------------------------------- /skfeature/function/information_theoretical_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/information_theoretical_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/similarity_based/SPEC.py: -------------------------------------------------------------------------------- 1 | import numpy.matlib 2 | import numpy as np 3 | from scipy.sparse import * 4 | from sklearn.metrics.pairwise import rbf_kernel 5 | from numpy import linalg as LA 6 | 7 | 8 | def spec(X, **kwargs): 9 | """ 10 | This function implements the SPEC feature selection 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | kwargs: {dictionary} 17 | style: {int} 18 | style == -1, the first feature ranking function, use all eigenvalues 19 | style == 0, the second feature ranking function, use all except the 1st eigenvalue 20 | style >= 2, the third feature ranking function, use the first k except 1st eigenvalue 21 | W: {sparse matrix}, shape (n_samples, n_samples} 22 | input affinity matrix 23 | 24 | Output 25 | ------ 26 | w_fea: {numpy array}, shape (n_features,) 27 | SPEC feature score for each feature 28 | 29 | Reference 30 | --------- 31 | Zhao, Zheng and Liu, Huan. "Spectral Feature Selection for Supervised and Unsupervised Learning." ICML 2007. 
32 | """ 33 | 34 | if 'style' not in kwargs: 35 | kwargs['style'] = 0 36 | if 'W' not in kwargs: 37 | kwargs['W'] = rbf_kernel(X, gamma=1) 38 | 39 | style = kwargs['style'] 40 | W = kwargs['W'] 41 | if type(W) is numpy.ndarray: 42 | W = csc_matrix(W) 43 | 44 | n_samples, n_features = X.shape 45 | 46 | # build the degree matrix 47 | X_sum = np.array(W.sum(axis=1)) 48 | D = np.zeros((n_samples, n_samples)) 49 | for i in range(n_samples): 50 | D[i, i] = X_sum[i] 51 | 52 | # build the laplacian matrix 53 | L = D - W 54 | d1 = np.power(np.array(W.sum(axis=1)), -0.5) 55 | d1[np.isinf(d1)] = 0 56 | d2 = np.power(np.array(W.sum(axis=1)), 0.5) 57 | v = np.dot(np.diag(d2[:, 0]), np.ones(n_samples)) 58 | v = v/LA.norm(v) 59 | 60 | # build the normalized laplacian matrix 61 | L_hat = (np.matlib.repmat(d1, 1, n_samples)) * np.array(L) * np.matlib.repmat(np.transpose(d1), n_samples, 1) 62 | 63 | # calculate and construct spectral information 64 | s, U = np.linalg.eigh(L_hat) 65 | s = np.flipud(s) 66 | U = np.fliplr(U) 67 | 68 | # begin to select features 69 | w_fea = np.ones(n_features)*1000 70 | 71 | for i in range(n_features): 72 | f = X[:, i] 73 | F_hat = np.dot(np.diag(d2[:, 0]), f) 74 | l = LA.norm(F_hat) 75 | if l < 100*np.spacing(1): 76 | w_fea[i] = 1000 77 | continue 78 | else: 79 | F_hat = F_hat/l 80 | a = np.array(np.dot(np.transpose(F_hat), U)) 81 | a = np.multiply(a, a) 82 | a = np.transpose(a) 83 | 84 | # use f'Lf formulation 85 | if style == -1: 86 | w_fea[i] = np.sum(a * s) 87 | # using all eigenvalues except the 1st 88 | elif style == 0: 89 | a1 = a[0:n_samples-1] 90 | w_fea[i] = np.sum(a1 * s[0:n_samples-1])/(1-np.power(np.dot(np.transpose(F_hat), v), 2)) 91 | # use first k except the 1st 92 | else: 93 | a1 = a[n_samples-style:n_samples-1] 94 | w_fea[i] = np.sum(a1 * (2-s[n_samples-style: n_samples-1])) 95 | 96 | if style != -1 and style != 0: 97 | w_fea[w_fea == 1000] = -1000 98 | 99 | return w_fea 100 | 101 | 102 | def feature_ranking(score, **kwargs): 103 | if 'style' not in kwargs: 104 | kwargs['style'] = 0 105 | style = kwargs['style'] 106 | 107 | # if style = -1 or 0, ranking features in descending order, the higher the score, the more important the feature is 108 | if style == -1 or style == 0: 109 | idx = np.argsort(score, 0) 110 | return idx[::-1] 111 | # if style != -1 and 0, ranking features in ascending order, the lower the score, the more important the feature is 112 | elif style != -1 and style != 0: 113 | idx = np.argsort(score, 0) 114 | return idx -------------------------------------------------------------------------------- /skfeature/function/similarity_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/similarity_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/similarity_based/fisher_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from skfeature.utility.construct_W import construct_W 4 | 5 | 6 | def fisher_score(X, y): 7 | """ 8 | This function implements the fisher score feature selection, steps are as follows: 9 | 1. Construct the affinity matrix W in fisher score way 10 | 2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W 11 | 3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones) 12 | 4. 
Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | y: {numpy array}, shape (n_samples,) 19 | input class labels 20 | 21 | Output 22 | ------ 23 | score: {numpy array}, shape (n_features,) 24 | fisher score for each feature 25 | 26 | Reference 27 | --------- 28 | He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005. 29 | Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012. 30 | """ 31 | 32 | # Construct weight matrix W in a fisherScore way 33 | kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y} 34 | W = construct_W(X, **kwargs) 35 | 36 | # build the diagonal D matrix from affinity matrix W 37 | D = np.array(W.sum(axis=1)) 38 | L = W 39 | tmp = np.dot(np.transpose(D), X) 40 | D = diags(np.transpose(D), [0]) 41 | Xt = np.transpose(X) 42 | t1 = np.transpose(np.dot(Xt, D.todense())) 43 | t2 = np.transpose(np.dot(Xt, L.todense())) 44 | # compute the numerator of Lr 45 | D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum() 46 | # compute the denominator of Lr 47 | L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum() 48 | # avoid the denominator of Lr to be 0 49 | D_prime[D_prime < 1e-12] = 10000 50 | lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :] 51 | 52 | # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1 53 | score = 1.0/lap_score - 1 54 | return np.transpose(score) 55 | 56 | 57 | def feature_ranking(score): 58 | """ 59 | Rank features in descending order according to fisher score, the larger the fisher score, the more important the 60 | feature is 61 | """ 62 | idx = np.argsort(score, 0) 63 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/similarity_based/lap_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from skfeature.utility.construct_W import construct_W 4 | 5 | 6 | def lap_score(X, **kwargs): 7 | """ 8 | This function implements the laplacian score feature selection, steps are as follows: 9 | 1. Construct the affinity matrix W if it is not specified 10 | 2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W 11 | 3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones) 12 | 4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat) 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | kwargs: {dictionary} 19 | W: {sparse matrix}, shape (n_samples, n_samples) 20 | input affinity matrix 21 | 22 | Output 23 | ------ 24 | score: {numpy array}, shape (n_features,) 25 | laplacian score for each feature 26 | 27 | Reference 28 | --------- 29 | He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005. 
30 | """ 31 | 32 | # if 'W' is not specified, use the default W 33 | if 'W' not in kwargs.keys(): 34 | W = construct_W(X) 35 | # construct the affinity matrix W 36 | W = kwargs['W'] 37 | # build the diagonal D matrix from affinity matrix W 38 | D = np.array(W.sum(axis=1)) 39 | L = W 40 | tmp = np.dot(np.transpose(D), X) 41 | D = diags(np.transpose(D), [0]) 42 | Xt = np.transpose(X) 43 | t1 = np.transpose(np.dot(Xt, D.todense())) 44 | t2 = np.transpose(np.dot(Xt, L.todense())) 45 | # compute the numerator of Lr 46 | D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum() 47 | # compute the denominator of Lr 48 | L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum() 49 | # avoid the denominator of Lr to be 0 50 | D_prime[D_prime < 1e-12] = 10000 51 | 52 | # compute laplacian score for all features 53 | score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :] 54 | return np.transpose(score) 55 | 56 | 57 | def feature_ranking(score): 58 | """ 59 | Rank features in ascending order according to their laplacian scores, the smaller the laplacian score is, the more 60 | important the feature is 61 | """ 62 | idx = np.argsort(score, 0) 63 | return idx 64 | -------------------------------------------------------------------------------- /skfeature/function/similarity_based/reliefF.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics.pairwise import pairwise_distances 3 | 4 | 5 | def reliefF(X, y, **kwargs): 6 | """ 7 | This function implements the reliefF feature selection 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | kwargs: {dictionary} 16 | parameters of reliefF: 17 | k: {int} 18 | choices for the number of neighbors (default k = 5) 19 | 20 | Output 21 | ------ 22 | score: {numpy array}, shape (n_features,) 23 | reliefF score for each feature 24 | 25 | Reference 26 | --------- 27 | Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003. 28 | Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013. 
29 | """ 30 | 31 | if "k" not in kwargs.keys(): 32 | k = 5 33 | else: 34 | k = kwargs["k"] 35 | n_samples, n_features = X.shape 36 | 37 | # calculate pairwise distances between instances 38 | distance = pairwise_distances(X, metric='manhattan') 39 | 40 | score = np.zeros(n_features) 41 | 42 | # the number of sampled instances is equal to the number of total instances 43 | for idx in range(n_samples): 44 | near_hit = [] 45 | near_miss = dict() 46 | 47 | self_fea = X[idx, :] 48 | c = np.unique(y).tolist() 49 | 50 | stop_dict = dict() 51 | for label in c: 52 | stop_dict[label] = 0 53 | del c[c.index(y[idx])] 54 | 55 | p_dict = dict() 56 | p_label_idx = float(len(y[y == y[idx]]))/float(n_samples) 57 | 58 | for label in c: 59 | p_label_c = float(len(y[y == label]))/float(n_samples) 60 | p_dict[label] = p_label_c/(1-p_label_idx) 61 | near_miss[label] = [] 62 | 63 | distance_sort = [] 64 | distance[idx, idx] = np.max(distance[idx, :]) 65 | 66 | for i in range(n_samples): 67 | distance_sort.append([distance[idx, i], int(i), y[i]]) 68 | distance_sort.sort(key=lambda x: x[0]) 69 | 70 | for i in range(n_samples): 71 | # find k nearest hit points 72 | if distance_sort[i][2] == y[idx]: 73 | if len(near_hit) < k: 74 | near_hit.append(distance_sort[i][1]) 75 | elif len(near_hit) == k: 76 | stop_dict[y[idx]] = 1 77 | else: 78 | # find k nearest miss points for each label 79 | if len(near_miss[distance_sort[i][2]]) < k: 80 | near_miss[distance_sort[i][2]].append(distance_sort[i][1]) 81 | else: 82 | if len(near_miss[distance_sort[i][2]]) == k: 83 | stop_dict[distance_sort[i][2]] = 1 84 | stop = True 85 | for (key, value) in stop_dict.items(): 86 | if value != 1: 87 | stop = False 88 | if stop: 89 | break 90 | 91 | # update reliefF score 92 | near_hit_term = np.zeros(n_features) 93 | for ele in near_hit: 94 | near_hit_term = np.array(abs(self_fea-X[ele, :]))+np.array(near_hit_term) 95 | 96 | near_miss_term = dict() 97 | for (label, miss_list) in near_miss.items(): 98 | near_miss_term[label] = np.zeros(n_features) 99 | for ele in miss_list: 100 | near_miss_term[label] = np.array(abs(self_fea-X[ele, :]))+np.array(near_miss_term[label]) 101 | score += near_miss_term[label]/(k*p_dict[label]) 102 | score -= near_hit_term/k 103 | return score 104 | 105 | 106 | def feature_ranking(score): 107 | """ 108 | Rank features in descending order according to reliefF score, the higher the reliefF score, the more important the 109 | feature is 110 | """ 111 | idx = np.argsort(score, 0) 112 | return idx[::-1] 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /skfeature/function/similarity_based/trace_ratio.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.construct_W import construct_W 3 | 4 | 5 | def trace_ratio(X, y, n_selected_features, **kwargs): 6 | """ 7 | This function implements the trace ratio criterion for feature selection 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | n_selected_features: {int} 16 | number of features to select 17 | kwargs: {dictionary} 18 | style: {string} 19 | style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way 20 | style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way 21 | verbose: {boolean} 22 | True if user want to print 
out the objective function value in each iteration, False if not 23 | 24 | Output 25 | ------ 26 | feature_idx: {numpy array}, shape (n_features,) 27 | the ranked (descending order) feature index based on subset-level score 28 | feature_score: {numpy array}, shape (n_features,) 29 | the feature-level score 30 | subset_score: {float} 31 | the subset-level score 32 | 33 | Reference 34 | --------- 35 | Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008. 36 | """ 37 | 38 | # if 'style' is not specified, use the fisher score way to build the two affinity matrices 39 | if 'style' not in kwargs.keys(): 40 | kwargs['style'] = 'fisher' 41 | # get the way to build affinity matrix, 'fisher' or 'laplacian' 42 | style = kwargs['style'] 43 | n_samples, n_features = X.shape 44 | 45 | # if 'verbose' is not specified, do not output the value of objective function 46 | if 'verbose' not in kwargs: 47 | kwargs['verbose'] = False 48 | verbose = kwargs['verbose'] 49 | 50 | if style == 'fisher': 51 | kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y} 52 | # build within class and between class laplacian matrix L_w and L_b 53 | W_within = construct_W(X, **kwargs_within) 54 | L_within = np.eye(n_samples) - W_within 55 | L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples 56 | L_between = L_within - L_tmp 57 | 58 | if style == 'laplacian': 59 | kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1} 60 | # build within class and between class laplacian matrix L_w and L_b 61 | W_within = construct_W(X, **kwargs_within) 62 | D_within = np.diag(np.array(W_within.sum(1))[:, 0]) 63 | L_within = D_within - W_within 64 | W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within) 65 | D_between = np.diag(np.array(W_between.sum(1))) 66 | L_between = D_between - W_between 67 | 68 | # build X'*L_within*X and X'*L_between*X 69 | L_within = (np.transpose(L_within) + L_within)/2 70 | L_between = (np.transpose(L_between) + L_between)/2 71 | S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X)) 72 | S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X)) 73 | 74 | # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X' 75 | S_within = (np.transpose(S_within) + S_within)/2 76 | # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X' 77 | S_between = (np.transpose(S_between) + S_between)/2 78 | 79 | # take the absolute values of diagonal 80 | s_within = np.absolute(S_within.diagonal()) 81 | s_between = np.absolute(S_between.diagonal()) 82 | s_between[s_between == 0] = 1e-14 # this number is from the authors' code 83 | 84 | # preprocessing 85 | fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1] 86 | k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features]) 87 | s_within = s_within[fs_idx[0:n_selected_features]] 88 | s_between = s_between[fs_idx[0:n_selected_features]] 89 | 90 | # iterate until convergence 91 | count = 0 92 | while True: 93 | score = np.sort(s_between-k*s_within)[::-1] 94 | I = np.argsort(s_between-k*s_within)[::-1] 95 | idx = I[0:n_selected_features] 96 | old_k = k 97 | k = np.sum(s_between[idx])/np.sum(s_within[idx]) 98 | if verbose: 99 | print('obj at iter {0}: {1}'.format(count+1, k)) 100 | count += 1 101 | if abs(k - old_k) < 1e-3: 102 | break 103 | 104 | # get feature index, feature-level score and subset-level score 105 | feature_idx = 
fs_idx[I] 106 | feature_score = score 107 | subset_score = k 108 | 109 | return feature_idx, feature_score, subset_score 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/MCFS.py: -------------------------------------------------------------------------------- 1 | import scipy 2 | import numpy as np 3 | from sklearn import linear_model 4 | from skfeature.utility.construct_W import construct_W 5 | 6 | 7 | def mcfs(X, n_selected_features, **kwargs): 8 | """ 9 | This function implements unsupervised feature selection for multi-cluster data. 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | n_selected_features: {int} 16 | number of features to select 17 | kwargs: {dictionary} 18 | W: {sparse matrix}, shape (n_samples, n_samples) 19 | affinity matrix 20 | n_clusters: {int} 21 | number of clusters (default is 5) 22 | 23 | Output 24 | ------ 25 | W: {numpy array}, shape(n_features, n_clusters) 26 | feature weight matrix 27 | 28 | Reference 29 | --------- 30 | Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010. 31 | """ 32 | 33 | # use the default affinity matrix 34 | if 'W' not in kwargs: 35 | W = construct_W(X) 36 | else: 37 | W = kwargs['W'] 38 | # default number of clusters is 5 39 | if 'n_clusters' not in kwargs: 40 | n_clusters = 5 41 | else: 42 | n_clusters = kwargs['n_clusters'] 43 | 44 | # solve the generalized eigen-decomposition problem and get the top K 45 | # eigen-vectors with respect to the smallest eigenvalues 46 | W = W.toarray() 47 | W = (W + W.T) / 2 48 | W_norm = np.diag(np.sqrt(1 / W.sum(1))) 49 | W = np.dot(W_norm, np.dot(W, W_norm)) 50 | WT = W.T 51 | W[W < WT] = WT[W < WT] 52 | eigen_value, ul = scipy.linalg.eigh(a=W) 53 | Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1]) 54 | 55 | # solve K L1-regularized regression problem using LARs algorithm with cardinality constraint being d 56 | n_sample, n_feature = X.shape 57 | W = np.zeros((n_feature, n_clusters)) 58 | for i in range(n_clusters): 59 | clf = linear_model.Lars(n_nonzero_coefs=n_selected_features) 60 | clf.fit(X, Y[:, i]) 61 | W[:, i] = clf.coef_ 62 | return W 63 | 64 | 65 | def feature_ranking(W): 66 | """ 67 | This function computes MCFS score and ranking features according to feature weights matrix W 68 | """ 69 | mcfs_score = W.max(1) 70 | idx = np.argsort(mcfs_score, 0) 71 | idx = idx[::-1] 72 | return idx -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/NDFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import math 4 | import sklearn.cluster 5 | from skfeature.utility.construct_W import construct_W 6 | 7 | 8 | def ndfs(X, **kwargs): 9 | """ 10 | This function implement unsupervised feature selection using nonnegative spectral analysis, i.e., 11 | min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2 12 | s.t. 
F >= 0 13 | 14 | Input 15 | ----- 16 | X: {numpy array}, shape (n_samples, n_features) 17 | input data 18 | kwargs: {dictionary} 19 | W: {sparse matrix}, shape {n_samples, n_samples} 20 | affinity matrix 21 | alpha: {float} 22 | Parameter alpha in objective function 23 | beta: {float} 24 | Parameter beta in objective function 25 | gamma: {float} 26 | a very large number used to force F^T F = I 27 | F0: {numpy array}, shape (n_samples, n_clusters) 28 | initialization of the pseudo label matirx F, if not provided 29 | n_clusters: {int} 30 | number of clusters 31 | verbose: {boolean} 32 | True if user want to print out the objective function value in each iteration, false if not 33 | 34 | Output 35 | ------ 36 | W: {numpy array}, shape(n_features, n_clusters) 37 | feature weight matrix 38 | 39 | Reference: 40 | Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012. 41 | """ 42 | 43 | # default gamma is 10e8 44 | if 'gamma' not in kwargs: 45 | gamma = 10e8 46 | else: 47 | gamma = kwargs['gamma'] 48 | # use the default affinity matrix 49 | if 'W' not in kwargs: 50 | W = construct_W(X) 51 | else: 52 | W = kwargs['W'] 53 | if 'alpha' not in kwargs: 54 | alpha = 1 55 | else: 56 | alpha = kwargs['alpha'] 57 | if 'beta' not in kwargs: 58 | beta = 1 59 | else: 60 | beta = kwargs['beta'] 61 | if 'F0' not in kwargs: 62 | if 'n_clusters' not in kwargs: 63 | print >>sys.stderr, "either F0 or n_clusters should be provided" 64 | else: 65 | # initialize F 66 | n_clusters = kwargs['n_clusters'] 67 | F = kmeans_initialization(X, n_clusters) 68 | else: 69 | F = kwargs['F0'] 70 | if 'verbose' not in kwargs: 71 | verbose = False 72 | else: 73 | verbose = kwargs['verbose'] 74 | 75 | n_samples, n_features = X.shape 76 | 77 | # initialize D as identity matrix 78 | D = np.identity(n_features) 79 | I = np.identity(n_samples) 80 | 81 | # build laplacian matrix 82 | L = np.array(W.sum(1))[:, 0] - W 83 | 84 | max_iter = 1000 85 | obj = np.zeros(max_iter) 86 | for iter_step in range(max_iter): 87 | # update W 88 | T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D + 1e-6*np.eye(n_features)) 89 | W = np.dot(np.dot(T, X.transpose()), F) 90 | # update D 91 | temp = np.sqrt((W*W).sum(1)) 92 | temp[temp < 1e-16] = 1e-16 93 | temp = 0.5 / temp 94 | D = np.diag(temp) 95 | # update M 96 | M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose())) 97 | M = (M + M.transpose())/2 98 | # update F 99 | denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F) 100 | temp = np.divide(gamma*F, denominator) 101 | F = F*np.array(temp) 102 | temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16)))) 103 | F = np.dot(F, temp) 104 | 105 | # calculate objective function 106 | obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) + gamma/4*np.linalg.norm(np.dot(F.transpose(), F)-np.identity(n_clusters), 'fro') 107 | if verbose: 108 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 109 | 110 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 111 | break 112 | return W 113 | 114 | 115 | def kmeans_initialization(X, n_clusters): 116 | """ 117 | This function uses kmeans to initialize the pseudo label 118 | 119 | Input 120 | ----- 121 | X: {numpy array}, shape (n_samples, n_features) 122 | input data 123 | n_clusters: {int} 124 | number of clusters 125 | 126 | Output 127 | ------ 128 | Y: {numpy array}, shape (n_samples, n_clusters) 129 | pseudo label matrix 130 | """ 131 | 132 | n_samples, n_features = X.shape 133 | 
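# Note: the KMeans call below passes precompute_distances and n_jobs, both of which were
# removed from sklearn.cluster.KMeans in scikit-learn 1.0; on a recent scikit-learn install
# (an assumption about the environment) those two arguments would likely need to be dropped.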
kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, 134 | tol=0.0001, precompute_distances=True, verbose=0, 135 | random_state=None, copy_x=True, n_jobs=1) 136 | kmeans.fit(X) 137 | labels = kmeans.labels_ 138 | Y = np.zeros((n_samples, n_clusters)) 139 | for row in range(0, n_samples): 140 | Y[row, labels[row]] = 1 141 | T = np.dot(Y.transpose(), Y) 142 | F = np.dot(Y, np.sqrt(np.linalg.inv(T))) 143 | F = F + 0.02*np.ones((n_samples, n_clusters)) 144 | return F 145 | 146 | 147 | def calculate_obj(X, W, F, L, alpha, beta): 148 | """ 149 | This function calculates the objective function of NDFS 150 | """ 151 | # Tr(F^T L F) 152 | T1 = np.trace(np.dot(np.dot(F.transpose(), L), F)) 153 | T2 = np.linalg.norm(np.dot(X, W) - F, 'fro') 154 | T3 = (np.sqrt((W*W).sum(1))).sum() 155 | obj = T1 + alpha*(T2 + beta*T3) 156 | return obj -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/RFS.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import generate_diagonal_matrix 5 | from skfeature.utility.sparse_learning import calculate_l21_norm 6 | 7 | 8 | def rfs(X, Y, **kwargs): 9 | """ 10 | This function implementS efficient and robust feature selection via joint l21-norms minimization 11 | min_W||X^T W - Y||_2,1 + gamma||W||_2,1 12 | 13 | Input 14 | ----- 15 | X: {numpy array}, shape (n_samples, n_features) 16 | input data 17 | Y: {numpy array}, shape (n_samples, n_classes) 18 | input class label matrix, each row is a one-hot-coding class label 19 | kwargs: {dictionary} 20 | gamma: {float} 21 | parameter in RFS 22 | verbose: boolean 23 | True if want to display the objective function value, false if not 24 | 25 | Output 26 | ------ 27 | W: {numpy array}, shape(n_samples, n_features) 28 | feature weight matrix 29 | 30 | Reference 31 | --------- 32 | Nie, Feiping et al. "Efficient and Robust Feature Selection via Joint l2,1-Norms Minimization" NIPS 2010. 
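Example (illustrative sketch; the toy data, the gamma value and the row-norm ranking convention are assumptions, not part of the original interface):
import numpy as np
from skfeature.function.sparse_learning_based.RFS import rfs
X = np.random.rand(20, 10)                    # 20 samples, 10 features
y = np.random.randint(0, 2, 20)               # binary labels
Y = np.eye(2)[y]                              # one-hot label matrix, shape (n_samples, n_classes)
W = rfs(X, Y, gamma=0.1)                      # feature weight matrix, shape (n_features, n_classes)
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]   # larger row norm = more important feature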
33 | """ 34 | 35 | # default gamma is 1 36 | if 'gamma' not in kwargs: 37 | gamma = 1 38 | else: 39 | gamma = kwargs['gamma'] 40 | if 'verbose' not in kwargs: 41 | verbose = False 42 | else: 43 | verbose = kwargs['verbose'] 44 | 45 | n_samples, n_features = X.shape 46 | A = np.zeros((n_samples, n_samples + n_features)) 47 | A[:, 0:n_features] = X 48 | A[:, n_features:n_features+n_samples] = gamma*np.eye(n_samples) 49 | D = np.eye(n_features+n_samples) 50 | 51 | max_iter = 1000 52 | obj = np.zeros(max_iter) 53 | for iter_step in range(max_iter): 54 | # update U as U = D^{-1} A^T (A D^-1 A^T)^-1 Y 55 | D_inv = LA.inv(D) 56 | temp = LA.inv(np.dot(np.dot(A, D_inv), A.T) + 1e-6*np.eye(n_samples)) # (A D^-1 A^T)^-1 57 | U = np.dot(np.dot(np.dot(D_inv, A.T), temp), Y) 58 | # update D as D_ii = 1 / 2 / ||U(i,:)|| 59 | D = generate_diagonal_matrix(U) 60 | 61 | obj[iter_step] = calculate_obj(X, Y, U[0:n_features, :], gamma) 62 | 63 | if verbose: 64 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 65 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 66 | break 67 | 68 | # the first d rows of U are the feature weights 69 | W = U[0:n_features, :] 70 | return W 71 | 72 | 73 | def calculate_obj(X, Y, W, gamma): 74 | """ 75 | This function calculates the objective function of rfs 76 | """ 77 | temp = np.dot(X, W) - Y 78 | return calculate_l21_norm(temp) + gamma*calculate_l21_norm(W) -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/UDFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import math 4 | from skfeature.utility.sparse_learning import generate_diagonal_matrix, calculate_l21_norm 5 | from sklearn.metrics.pairwise import pairwise_distances 6 | 7 | 8 | def udfs(X, **kwargs): 9 | """ 10 | This function implements l2,1-norm regularized discriminative feature 11 | selection for unsupervised learning, i.e., min_W Tr(W^T M W) + gamma ||W||_{2,1}, s.t. W^T W = I 12 | 13 | Input 14 | ----- 15 | X: {numpy array}, shape (n_samples, n_features) 16 | input data 17 | kwargs: {dictionary} 18 | gamma: {float} 19 | parameter in the objective function of UDFS (default is 1) 20 | n_clusters: {int} 21 | Number of clusters 22 | k: {int} 23 | number of nearest neighbor 24 | verbose: {boolean} 25 | True if want to display the objective function value, false if not 26 | 27 | Output 28 | ------ 29 | W: {numpy array}, shape(n_features, n_clusters) 30 | feature weight matrix 31 | 32 | Reference 33 | Yang, Yi et al. "l2,1-Norm Regularized Discriminative Feature Selection for Unsupervised Learning." AAAI 2012. 
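Example (illustrative sketch; the toy data and the row-norm ranking convention are assumptions):
import numpy as np
from skfeature.function.sparse_learning_based.UDFS import udfs
X = np.random.rand(30, 8)                     # 30 unlabeled samples, 8 features
W = udfs(X, gamma=0.1, k=5, n_clusters=3)     # feature weight matrix, shape (n_features, n_clusters)
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]   # larger row norm = more important feature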
34 | """ 35 | 36 | # default gamma is 0.1 37 | if 'gamma' not in kwargs: 38 | gamma = 0.1 39 | else: 40 | gamma = kwargs['gamma'] 41 | # default k is set to be 5 42 | if 'k' not in kwargs: 43 | k = 5 44 | else: 45 | k = kwargs['k'] 46 | if 'n_clusters' not in kwargs: 47 | n_clusters = 5 48 | else: 49 | n_clusters = kwargs['n_clusters'] 50 | if 'verbose' not in kwargs: 51 | verbose = False 52 | else: 53 | verbose = kwargs['verbose'] 54 | 55 | # construct M 56 | n_sample, n_feature = X.shape 57 | M = construct_M(X, k, gamma) 58 | 59 | D = np.eye(n_feature) 60 | max_iter = 1000 61 | obj = np.zeros(max_iter) 62 | for iter_step in range(max_iter): 63 | # update W as the eigenvectors of P corresponding to the first n_clusters 64 | # smallest eigenvalues 65 | P = M + gamma*D 66 | eigen_value, eigen_vector = scipy.linalg.eigh(a=P) 67 | W = eigen_vector[:, 0:n_clusters] 68 | # update D as D_ii = 1 / 2 / ||W(i,:)|| 69 | D = generate_diagonal_matrix(W) 70 | 71 | obj[iter_step] = calculate_obj(X, W, M, gamma) 72 | if verbose: 73 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 74 | 75 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 76 | break 77 | return W 78 | 79 | 80 | def construct_M(X, k, gamma): 81 | """ 82 | This function constructs the M matrix described in the paper 83 | """ 84 | n_sample, n_feature = X.shape 85 | Xt = X.T 86 | D = pairwise_distances(X) 87 | # sort the distance matrix D in ascending order 88 | idx = np.argsort(D, axis=1) 89 | # choose the k-nearest neighbors for each instance 90 | idx_new = idx[:, 0:k+1] 91 | H = np.eye(k+1) - 1/(k+1) * np.ones((k+1, k+1)) 92 | I = np.eye(k+1) 93 | Mi = np.zeros((n_sample, n_sample)) 94 | for i in range(n_sample): 95 | Xi = Xt[:, idx_new[i, :]] 96 | Xi_tilde =np.dot(Xi, H) 97 | Bi = np.linalg.inv(np.dot(Xi_tilde.T, Xi_tilde) + gamma*I) 98 | Si = np.zeros((n_sample, k+1)) 99 | for q in range(k+1): 100 | Si[idx_new[q], q] = 1 101 | Mi = Mi + np.dot(np.dot(Si, np.dot(np.dot(H, Bi), H)), Si.T) 102 | M = np.dot(np.dot(X.T, Mi), X) 103 | return M 104 | 105 | 106 | def calculate_obj(X, W, M, gamma): 107 | """ 108 | This function calculates the objective function of ls_l21 described in the paper 109 | """ 110 | return np.trace(np.dot(np.dot(W.T, M), W)) + gamma*calculate_l21_norm(W) -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/sparse_learning_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/ll_l21.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import euclidean_projection, calculate_l21_norm 5 | 6 | 7 | def proximal_gradient_descent(X, Y, z, **kwargs): 8 | """ 9 | This function implements supervised sparse feature selection via l2,1 norm, i.e., 10 | min_{W} sum_{i}log(1+exp(-yi*(W'*x+C))) + z*||W||_{2,1} 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | Y: {numpy array}, shape (n_samples, n_classes) 17 | input class labels, each row is a one-hot-coding class label, guaranteed to be a numpy array 18 | z: {float} 19 | 
regularization parameter 20 | kwargs: {dictionary} 21 | verbose: {boolean} 22 | True if user want to print out the objective function value in each iteration, false if not 23 | 24 | Output 25 | ------ 26 | W: {numpy array}, shape (n_features, n_classes) 27 | weight matrix 28 | obj: {numpy array}, shape (n_iterations,) 29 | objective function value during iterations 30 | value_gamma: {numpy array}, shape (n_iterations,s) 31 | suitable step size during iterations 32 | 33 | 34 | Reference: 35 | Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009. 36 | """ 37 | 38 | if 'verbose' not in kwargs: 39 | verbose = False 40 | else: 41 | verbose = kwargs['verbose'] 42 | 43 | # Starting point initialization # 44 | n_samples, n_features = X.shape 45 | n_samples, n_classes = Y.shape 46 | 47 | # the indices of positive samples 48 | p_flag = (Y == 1) 49 | # the total number of positive samples 50 | n_positive_samples = np.sum(p_flag, 0) 51 | # the total number of negative samples 52 | n_negative_samples = n_samples - n_positive_samples 53 | n_positive_samples = n_positive_samples.astype(float) 54 | n_negative_samples = n_negative_samples.astype(float) 55 | 56 | # initialize a starting point 57 | W = np.zeros((n_features, n_classes)) 58 | C = np.log(np.divide(n_positive_samples, n_negative_samples)) 59 | 60 | # compute XW = X*W 61 | XW = np.dot(X, W) 62 | 63 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 64 | # the intial guess of the Lipschitz continuous gradient 65 | gamma = 1.0/(n_samples*n_classes) 66 | 67 | # assign Wp with W, and XWp with XW 68 | XWp = XW 69 | WWp =np.zeros((n_features, n_classes)) 70 | CCp = np.zeros((1, n_classes)) 71 | 72 | alphap = 0 73 | alpha = 1 74 | 75 | # indicates whether the gradient step only changes a little 76 | flag = False 77 | 78 | max_iter = 1000 79 | value_gamma = np.zeros(max_iter) 80 | obj = np.zeros(max_iter) 81 | for iter_step in range(max_iter): 82 | # step1: compute search point S based on Wp and W (with beta) 83 | beta = (alphap-1)/alpha 84 | S = W + beta*WWp 85 | SC = C + beta*CCp 86 | 87 | # step2: line search for gamma and compute the new approximation solution W 88 | XS = XW + beta*(XW - XWp) 89 | aa = -np.multiply(Y, XS+np.tile(SC, (n_samples, 1))) 90 | # fun_S is the logistic loss at the search point 91 | bb = np.maximum(aa, 0) 92 | fun_S = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes) 93 | # compute prob = [p_1;p_2;...;p_m] 94 | prob = 1.0/(1+np.exp(aa)) 95 | 96 | b = np.multiply(-Y, (1-prob))/(n_samples*n_classes) 97 | # compute the gradient of C 98 | GC = np.sum(b, 0) 99 | # compute the gradient of W as X'*b 100 | G = np.dot(np.transpose(X), b) 101 | 102 | # copy W and XW to Wp and XWp 103 | Wp = W 104 | XWp = XW 105 | Cp = C 106 | 107 | while True: 108 | # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection 109 | V = S - G/gamma 110 | C = SC - GC/gamma 111 | W = euclidean_projection(V, n_features, n_classes, z, gamma) 112 | 113 | # the difference between the new approximate solution W and the search point S 114 | V = W - S 115 | # compute XW = X*W 116 | XW = np.dot(X, W) 117 | aa = -np.multiply(Y, XW+np.tile(C, (n_samples, 1))) 118 | # fun_W is the logistic loss at the new approximate solution 119 | bb = np.maximum(aa, 0) 120 | fun_W = np.sum(np.log(np.exp(-bb)+np.exp(aa-bb))+bb)/(n_samples*n_classes) 121 | 122 | r_sum = (LA.norm(V, 'fro')**2 + LA.norm(C-SC, 2)**2) / 2 123 | l_sum 
= fun_W - fun_S - np.sum(np.multiply(V, G)) - np.inner((C-SC), GC) 124 | 125 | # determine weather the gradient step makes little improvement 126 | if r_sum <= 1e-20: 127 | flag = True 128 | break 129 | 130 | # the condition is fun_W <= fun_S + + + gamma/2 * ( + ) 131 | if l_sum < r_sum*gamma: 132 | break 133 | else: 134 | gamma = max(2*gamma, l_sum/r_sum) 135 | value_gamma[iter_step] = gamma 136 | 137 | # step3: update alpha and alphap, and check weather converge 138 | alphap = alpha 139 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 140 | 141 | WWp = W - Wp 142 | CCp = C - Cp 143 | 144 | # calculate obj 145 | obj[iter_step] = fun_W 146 | obj[iter_step] += z*calculate_l21_norm(W) 147 | 148 | if verbose: 149 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 150 | 151 | if flag is True: 152 | break 153 | 154 | # determine weather converge 155 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 156 | break 157 | return W, obj, value_gamma 158 | -------------------------------------------------------------------------------- /skfeature/function/sparse_learning_based/ls_l21.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from numpy import linalg as LA 4 | from skfeature.utility.sparse_learning import euclidean_projection, calculate_l21_norm 5 | 6 | 7 | def proximal_gradient_descent(X, Y, z, **kwargs): 8 | """ 9 | This function implements supervised sparse feature selection via l2,1 norm, i.e., 10 | min_{W} ||XW-Y||_F^2 + z*||W||_{2,1} 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data, guaranteed to be a numpy array 16 | Y: {numpy array}, shape (n_samples, n_classes) 17 | input class labels, each row is a one-hot-coding class label 18 | z: {float} 19 | regularization parameter 20 | kwargs: {dictionary} 21 | verbose: {boolean} 22 | True if user want to print out the objective function value in each iteration, false if not 23 | 24 | Output 25 | ------ 26 | W: {numpy array}, shape (n_features, n_classes) 27 | weight matrix 28 | obj: {numpy array}, shape (n_iterations,) 29 | objective function value during iterations 30 | value_gamma: {numpy array}, shape (n_iterations,) 31 | suitable step size during iterations 32 | 33 | Reference 34 | --------- 35 | Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009. 
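Example (illustrative sketch; the toy data and the regularization value z are assumptions):
import numpy as np
from skfeature.function.sparse_learning_based.ls_l21 import proximal_gradient_descent
X = np.random.rand(50, 20)
y = np.random.randint(0, 3, 50)
Y = np.eye(3)[y]                              # one-hot label matrix, shape (n_samples, n_classes)
W, obj, value_gamma = proximal_gradient_descent(X, Y, 0.1)   # z = 0.1
ranking = np.argsort(np.linalg.norm(W, axis=1))[::-1]        # rank features by row-wise l2 norm of W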
36 | """ 37 | 38 | if 'verbose' not in kwargs: 39 | verbose = False 40 | else: 41 | verbose = kwargs['verbose'] 42 | 43 | # starting point initialization 44 | n_samples, n_features = X.shape 45 | n_samples, n_classes = Y.shape 46 | 47 | # compute X'Y 48 | XtY = np.dot(np.transpose(X), Y) 49 | 50 | # initialize a starting point 51 | W = XtY 52 | 53 | # compute XW = X*W 54 | XW = np.dot(X, W) 55 | 56 | # compute l2,1 norm of W 57 | W_norm = calculate_l21_norm(W) 58 | 59 | if W_norm >= 1e-6: 60 | ratio = init_factor(W_norm, XW, Y, z) 61 | W = ratio*W 62 | XW = ratio*XW 63 | 64 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 65 | # initialize step size gamma = 1 66 | gamma = 1 67 | 68 | # assign Wp with W, and XWp with XW 69 | XWp = XW 70 | WWp =np.zeros((n_features, n_classes)) 71 | alphap = 0 72 | alpha = 1 73 | 74 | # indicate whether the gradient step only changes a little 75 | flag = False 76 | 77 | max_iter = 1000 78 | value_gamma = np.zeros(max_iter) 79 | obj = np.zeros(max_iter) 80 | for iter_step in range(max_iter): 81 | # step1: compute search point S based on Wp and W (with beta) 82 | beta = (alphap-1)/alpha 83 | S = W + beta*WWp 84 | 85 | # step2: line search for gamma and compute the new approximation solution W 86 | XS = XW + beta*(XW - XWp) 87 | # compute X'* XS 88 | XtXS = np.dot(np.transpose(X), XS) 89 | # obtain the gradient g 90 | G = XtXS - XtY 91 | # copy W and XW to Wp and XWp 92 | Wp = W 93 | XWp = XW 94 | 95 | while True: 96 | # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection 97 | V = S - G/gamma 98 | W = euclidean_projection(V, n_features, n_classes, z, gamma) 99 | # the difference between the new approximate solution W and the search point S 100 | V = W - S 101 | # compute XW = X*W 102 | XW = np.dot(X, W) 103 | XV = XW - XS 104 | r_sum = LA.norm(V, 'fro')**2 105 | l_sum = LA.norm(XV, 'fro')**2 106 | 107 | # determine weather the gradient step makes little improvement 108 | if r_sum <= 1e-20: 109 | flag = True 110 | break 111 | 112 | # the condition is ||XV||_2^2 <= gamma * ||V||_2^2 113 | if l_sum < r_sum*gamma: 114 | break 115 | else: 116 | gamma = max(2*gamma, l_sum/r_sum) 117 | value_gamma[iter_step] = gamma 118 | 119 | # step3: update alpha and alphap, and check weather converge 120 | alphap = alpha 121 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 122 | 123 | WWp = W - Wp 124 | XWY = XW -Y 125 | 126 | # calculate obj 127 | obj[iter_step] = LA.norm(XWY, 'fro')**2/2 128 | obj[iter_step] += z*calculate_l21_norm(W) 129 | 130 | if verbose: 131 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 132 | 133 | if flag is True: 134 | break 135 | 136 | # determine weather converge 137 | if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 138 | break 139 | return W, obj, value_gamma 140 | 141 | 142 | def init_factor(W_norm, XW, Y, z): 143 | """ 144 | Initialize the starting point of W, according to the author's code 145 | """ 146 | n_samples, n_classes = XW.shape 147 | a = np.inner(np.reshape(XW, n_samples*n_classes), np.reshape(Y, n_samples*n_classes)) - z*W_norm 148 | b = LA.norm(XW, 'fro')**2 149 | ratio = a / b 150 | return ratio -------------------------------------------------------------------------------- /skfeature/function/statistical_based/CFS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skfeature.utility.mutual_information import su_calculation 3 | 
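# Note on merit_calculation below: rcf and rff are accumulated as sums over the k = n_features
# columns rather than as the averages used in the docstring formula; since those sums equal
# k*rcf_avg and k*(k-1)*rff_avg, the expression rcf / sqrt(k + rff) evaluates to the same merit
# value as merits = (k*rcf_avg)/sqrt(k + k*(k-1)*rff_avg).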
4 | 5 | def merit_calculation(X, y): 6 | """ 7 | This function calculates the merit of X given class labels y, where 8 | merits = (k * rcf)/sqrt(k+k*(k-1)*rff) 9 | rcf = (1/k)*sum(su(fi,y)) for all fi in X 10 | rff = (1/(k*(k-1)))*sum(su(fi,fj)) for all fi and fj in X 11 | 12 | Input 13 | ---------- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | y: {numpy array}, shape (n_samples,) 17 | input class labels 18 | 19 | Output 20 | ---------- 21 | merits: {float} 22 | merit of a feature subset X 23 | """ 24 | 25 | n_samples, n_features = X.shape 26 | rff = 0 27 | rcf = 0 28 | for i in range(n_features): 29 | fi = X[:, i] 30 | rcf += su_calculation(fi, y) 31 | for j in range(n_features): 32 | if j > i: 33 | fj = X[:, j] 34 | rff += su_calculation(fi, fj) 35 | rff *= 2 36 | merits = rcf / np.sqrt(n_features + rff) 37 | return merits 38 | 39 | 40 | def cfs(X, y): 41 | """ 42 | This function uses a correlation based heuristic to evaluate the worth of features which is called CFS 43 | 44 | Input 45 | ----- 46 | X: {numpy array}, shape (n_samples, n_features) 47 | input data 48 | y: {numpy array}, shape (n_samples,) 49 | input class labels 50 | 51 | Output 52 | ------ 53 | F: {numpy array} 54 | index of selected features 55 | 56 | Reference 57 | --------- 58 | Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010. 59 | """ 60 | 61 | n_samples, n_features = X.shape 62 | F = [] 63 | # M stores the merit values 64 | M = [] 65 | while True: 66 | merit = -100000000000 67 | idx = -1 68 | for i in range(n_features): 69 | if i not in F: 70 | F.append(i) 71 | # calculate the merit of current selected features 72 | t = merit_calculation(X[:, F], y) 73 | if t > merit: 74 | merit = t 75 | idx = i 76 | F.pop() 77 | F.append(idx) 78 | M.append(merit) 79 | if len(M) > 5: 80 | if M[len(M)-1] <= M[len(M)-2]: 81 | if M[len(M)-2] <= M[len(M)-3]: 82 | if M[len(M)-3] <= M[len(M)-4]: 83 | if M[len(M)-4] <= M[len(M)-5]: 84 | break 85 | return np.array(F) 86 | 87 | -------------------------------------------------------------------------------- /skfeature/function/statistical_based/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/statistical_based/__init__.py -------------------------------------------------------------------------------- /skfeature/function/statistical_based/chi_square.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_selection import chi2 3 | 4 | 5 | def chi_square(X, y): 6 | """ 7 | This function implements the chi-square feature selection (existing method for classification in scikit-learn) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array},shape (n_samples,) 14 | input class labels 15 | 16 | Output 17 | ------ 18 | F: {numpy array}, shape (n_features,) 19 | chi-square score for each feature 20 | """ 21 | F, pval = chi2(X, y) 22 | return F 23 | 24 | 25 | def feature_ranking(F): 26 | """ 27 | Rank features in descending order according to chi2-score, the higher the chi2-score, the more important the feature is 28 | """ 29 | idx = np.argsort(F) 30 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/statistical_based/f_score.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.feature_selection import f_classif 3 | 4 | 5 | def f_score(X, y): 6 | """ 7 | This function implements the anova f_value feature selection (existing method for classification in scikit-learn), 8 | where f_score = sum((ni/(c-1))*(mean_i - mean)^2)/((1/(n - c))*sum((ni-1)*std_i^2)) 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data 14 | y : {numpy array},shape (n_samples,) 15 | input class labels 16 | 17 | Output 18 | ------ 19 | F: {numpy array}, shape (n_features,) 20 | f-score for each feature 21 | """ 22 | 23 | F, pval = f_classif(X, y) 24 | return F 25 | 26 | 27 | def feature_ranking(F): 28 | """ 29 | Rank features in descending order according to f-score, the higher the f-score, the more important the feature is 30 | """ 31 | idx = np.argsort(F) 32 | return idx[::-1] -------------------------------------------------------------------------------- /skfeature/function/statistical_based/gini_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def gini_index(X, y): 5 | """ 6 | This function implements the gini index feature selection. 7 | 8 | Input 9 | ---------- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data 12 | y: {numpy array}, shape (n_samples,) 13 | input class labels 14 | 15 | Output 16 | ---------- 17 | gini: {numpy array}, shape (n_features, ) 18 | gini index value of each feature 19 | """ 20 | 21 | n_samples, n_features = X.shape 22 | 23 | # initialize gini_index for all features to be 0.5 24 | gini = np.ones(n_features) * 0.5 25 | 26 | # For i-th feature we define fi = x[:,i] ,v include all unique values in fi 27 | for i in range(n_features): 28 | v = np.unique(X[:, i]) 29 | for j in range(len(v)): 30 | # left_y contains labels of instances whose i-th feature value is less than or equal to v[j] 31 | left_y = y[X[:, i] <= v[j]] 32 | # right_y contains labels of instances whose i-th feature value is larger than v[j] 33 | right_y = y[X[:, i] > v[j]] 34 | 35 | # gini_left is sum of square of probability of occurrence of v[i] in left_y 36 | # gini_right is sum of square of probability of occurrence of v[i] in right_y 37 | gini_left = 0 38 | gini_right = 0 39 | 40 | for k in range(np.min(y), np.max(y)+1): 41 | if len(left_y) != 0: 42 | # t1_left is probability of occurrence of k in left_y 43 | t1_left = np.true_divide(len(left_y[left_y == k]), len(left_y)) 44 | t2_left = np.power(t1_left, 2) 45 | gini_left += t2_left 46 | 47 | if len(right_y) != 0: 48 | # t1_right is probability of occurrence of k in left_y 49 | t1_right = np.true_divide(len(right_y[right_y == k]), len(right_y)) 50 | t2_right = np.power(t1_right, 2) 51 | gini_right += t2_right 52 | 53 | gini_left = 1 - gini_left 54 | gini_right = 1 - gini_right 55 | 56 | # weighted average of len(left_y) and len(right_y) 57 | t1_gini = (len(left_y) * gini_left + len(right_y) * gini_right) 58 | 59 | # compute the gini_index for the i-th feature 60 | value = np.true_divide(t1_gini, len(y)) 61 | 62 | if value < gini[i]: 63 | gini[i] = value 64 | return gini 65 | 66 | 67 | def feature_ranking(W): 68 | """ 69 | Rank features in descending order according to their gini index values, the smaller the gini index, 70 | the more important the feature is 71 | """ 72 | idx = np.argsort(W) 73 | return idx 74 | 75 | 76 | 77 | 78 | 79 | 80 | 
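Usage sketch (illustrative; assumes integer class labels, which the range(np.min(y), np.max(y)+1) loop above requires):
import numpy as np
from skfeature.function.statistical_based.gini_index import gini_index, feature_ranking
X = np.random.rand(40, 6)
y = np.random.randint(0, 3, 40)        # integer labels 0, 1, 2
gini = gini_index(X, y)                # one gini index value per feature
idx = feature_ranking(gini)            # ascending order: smaller gini index = more important feature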
-------------------------------------------------------------------------------- /skfeature/function/statistical_based/low_variance.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_selection import VarianceThreshold 2 | 3 | 4 | def low_variance_feature_selection(X, threshold): 5 | """ 6 | This function implements the low_variance feature selection (existing method in scikit-learn) 7 | 8 | Input 9 | ----- 10 | X: {numpy array}, shape (n_samples, n_features) 11 | input data 12 | p:{float} 13 | parameter used to calculate the threshold(threshold = p*(1-p)) 14 | 15 | Output 16 | ------ 17 | X_new: {numpy array}, shape (n_samples, n_selected_features) 18 | data with selected features 19 | """ 20 | sel = VarianceThreshold(threshold) 21 | return sel.fit_transform(X) -------------------------------------------------------------------------------- /skfeature/function/statistical_based/t_score.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def t_score(X, y): 5 | """ 6 | This function calculates t_score for each feature, where t_score is only used for binary problem 7 | t_score = |mean1-mean2|/sqrt(((std1^2)/n1)+((std2^2)/n2))) 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | y: {numpy array}, shape (n_samples,) 14 | input class labels 15 | 16 | Output 17 | ------ 18 | F: {numpy array}, shape (n_features,) 19 | t-score for each feature 20 | """ 21 | 22 | n_samples, n_features = X.shape 23 | F = np.zeros(n_features) 24 | c = np.unique(y) 25 | if len(c) == 2: 26 | for i in range(n_features): 27 | f = X[:, i] 28 | # class0 contains instances belonging to the first class 29 | # class1 contains instances belonging to the second class 30 | class0 = f[y == c[0]] 31 | class1 = f[y == c[1]] 32 | mean0 = np.mean(class0) 33 | mean1 = np.mean(class1) 34 | std0 = np.std(class0) 35 | std1 = np.std(class1) 36 | n0 = len(class0) 37 | n1 = len(class1) 38 | t = mean0 - mean1 39 | t0 = np.true_divide(std0**2, n0) 40 | t1 = np.true_divide(std1**2, n1) 41 | F[i] = np.true_divide(t, (t0 + t1)**0.5) 42 | else: 43 | print('y should be guaranteed to a binary class vector') 44 | exit(0) 45 | return np.abs(F) 46 | 47 | 48 | def feature_ranking(F): 49 | """ 50 | Rank features in descending order according to t-score, the higher the t-score, the more important the feature is 51 | """ 52 | idx = np.argsort(F) 53 | return idx[::-1] 54 | 55 | -------------------------------------------------------------------------------- /skfeature/function/streaming/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jundongl' 2 | -------------------------------------------------------------------------------- /skfeature/function/streaming/alpha_investing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import linear_model 3 | 4 | 5 | def alpha_investing(X, y, w0, dw): 6 | """ 7 | This function implements streamwise feature selection (SFS) algorithm alpha_investing for binary regression or 8 | univariate regression 9 | 10 | Input 11 | ----- 12 | X: {numpy array}, shape (n_samples, n_features) 13 | input data, assume feature arrives one at each time step 14 | y: {numpy array}, shape (n_samples,) 15 | input class labels or regression target 16 | 17 | Output 18 | ------ 19 | F: {numpy array}, shape (n_selected_features,) 20 | index of 
selected features in a streamwise way 21 | 22 | Reference 23 | --------- 24 | Zhou, Jing et al. "Streaming Feature Selection using Alpha-investing." KDD 2006. 25 | """ 26 | 27 | n_samples, n_features = X.shape 28 | w = w0 29 | F = [] # selected features 30 | for i in range(n_features): 31 | x_can = X[:, i] # generate next feature 32 | alpha = w/2/(i+1) 33 | X_old = X[:, F] 34 | if i is 0: 35 | X_old = np.ones((n_samples, 1)) 36 | linreg_old = linear_model.LinearRegression() 37 | linreg_old.fit(X_old, y) 38 | error_old = 1 - linreg_old.score(X_old, y) 39 | if i is not 0: 40 | # model built with only X_old 41 | linreg_old = linear_model.LinearRegression() 42 | linreg_old.fit(X_old, y) 43 | error_old = 1 - linreg_old.score(X_old, y) 44 | 45 | # model built with X_old & {x_can} 46 | X_new = np.concatenate((X_old, x_can.reshape(n_samples, 1)), axis=1) 47 | logreg_new = linear_model.LinearRegression() 48 | logreg_new.fit(X_new, y) 49 | error_new = 1 - logreg_new.score(X_new, y) 50 | 51 | # calculate p-value 52 | pval = np.exp((error_new - error_old)/(2*error_old/n_samples)) 53 | if pval < alpha: 54 | F.append(i) 55 | w = w + dw - alpha 56 | else: 57 | w -= alpha 58 | return np.array(F) 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/structure/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /skfeature/function/structure/graph_fs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def soft_threshold(A,b): 5 | """ 6 | This function implement the soft-threshold operator 7 | Input: 8 | A: {numpy scalar, vector, or matrix} 9 | b: scalar} 10 | """ 11 | res = np.zeros(A.shape) 12 | res[A > b] = A[A > b] - b 13 | res[A < -b] = A[A < -b] + b 14 | return res 15 | 16 | 17 | def calculate_obj(X, y, w, lambda1, lambda2, T): 18 | return 1/2 * (np.linalg.norm(y- np.dot(X, w), 'fro'))**2 + lambda1*np.abs(w).sum() + lambda2*np.abs(np.dot(T, w)).sum() 19 | 20 | 21 | def graph_fs(X, y, **kwargs): 22 | """ 23 | This function implement the graph structural feature selection algorithm GOSCAR 24 | 25 | Objective Function 26 | min_{w} 1/2 ||X*w - y||_F^2 + lambda1 ||w||_1 + lambda2 \sum_{(i,j) \in E} max{|w_i|, |w|_j} 27 | 28 | Input: 29 | X: {numpy array}, shape (n_samples, n_features) 30 | Input data, guaranteed to be a numpy array 31 | y: {numpy array}, shape (n_samples, 1) 32 | Input data, the label matrix 33 | edge_list: {numpy array}, shape (n_edges, 2) 34 | Input data, each row is a pair of linked features, note feature index should start from 0 35 | lambda1: {float} 36 | Parameter lambda1 in objective function 37 | lambda2: {float} 38 | Parameter labmda2 in objective function 39 | rho: {flot} 40 | parameter used for optimization 41 | max_iter: {int} 42 | maximal iteration 43 | verbose: {boolean} True or False 44 | True if we want to print out the objective function value in each iteration, False if not 45 | 46 | Output: 47 | w: the weights of the features 48 | obj: the value of the objective function in each iteration 49 | """ 50 | 51 | if 'lambda1' not in kwargs: 52 | lambda1 = 0.8 53 | else: 54 | lambda1 = kwargs['lambda1'] 55 | if 'lambda2' not in kwargs: 56 | lambda2 = 0.8 57 | else: 58 | lambda2 = kwargs['lambda2'] 59 | if 'edge_list' not in kwargs: 60 | print('Error using function, the network structure E is required') 61 | raise() 62 | else : 
63 | edge_list = kwargs['edge_list'] 64 | if 'max_iter' not in kwargs: 65 | max_iter = 300 66 | else: 67 | max_iter = kwargs['max_iter'] 68 | if 'verbose' not in kwargs: 69 | verbose = 0 70 | else: 71 | verbose = kwargs['verbose'] 72 | if 'rho' not in kwargs: 73 | rho = 5 74 | else: 75 | rho = kwargs['rho'] 76 | 77 | n_samples, n_features = X.shape 78 | 79 | # construct T from E 80 | ind1 = edge_list[:, 0] 81 | ind2 = edge_list[:, 1] 82 | num_edge = ind1.shape[0] 83 | T = np.zeros((num_edge*2, n_features)) 84 | for i in range(num_edge): 85 | T[i, ind1[i]] = 0.5 86 | T[i, ind2[i]] = 0.5 87 | T[i+num_edge, ind1[i]] = 0.5 88 | T[i+num_edge, ind2[i]] = -0.5 89 | 90 | # calculate F = X^T X + rho(I + T^T * T) 91 | F = np.dot(X.T, X) + rho*(np.identity(n_features) + np.dot(T.T, T)) 92 | 93 | # Cholesky factorization of F = R^T R 94 | R = np.linalg.cholesky(F) # NOTE, this return F = R R^T 95 | R = R.T 96 | Rinv = np.linalg.inv(R) 97 | Rtinv = Rinv.T 98 | 99 | # initialize p, q, mu , v to be zero vectors 100 | p = np.zeros((2*num_edge, 1)) 101 | q = np.zeros((n_features, 1)) 102 | mu = np.zeros((n_features, 1)) 103 | v = np.zeros((2*num_edge, 1)) 104 | 105 | # start the main loop 106 | iter = 0 107 | obj = np.zeros((max_iter,1)) 108 | while iter < max_iter: 109 | print(iter) 110 | # update w 111 | b = np.dot(X.T, y) - mu - np.dot(T.T, v) + rho*np.dot(T.T,p) + rho*q 112 | w_hat = np.dot(Rtinv, b) 113 | w = np.dot(Rinv, w_hat) 114 | 115 | # update q 116 | q = soft_threshold(w + 1/rho*mu, lambda1/rho) 117 | # update p 118 | 119 | p = soft_threshold(np.dot(T, w)+1/rho*v, lambda2/rho) 120 | # update mu, v 121 | mu += rho*(w - q) 122 | v += rho*(np.dot(T, w) - p) 123 | 124 | # calculate objective function 125 | obj[iter] = calculate_obj(X, y, w, lambda1, lambda2, T) 126 | if verbose: 127 | print('obj at iter {0}: {1}'.format(iter, obj[iter])) 128 | iter += 1 129 | return w, obj, q 130 | 131 | def feature_ranking(w): 132 | T = w.abs() 133 | idx = np.argsort(T, 0) 134 | return idx[::-1] 135 | -------------------------------------------------------------------------------- /skfeature/function/structure/group_fs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from skfeature.utility.sparse_learning import tree_lasso_projection, tree_norm 4 | 5 | 6 | def group_fs(X, y, z1, z2, idx, **kwargs): 7 | """ 8 | This function implements supervised sparse group feature selection with least square loss, i.e., 9 | min_{w} ||Xw-y||_2^2 + z_1||w||_1 + z_2*sum_{i} h_{i}||w_{G_{i}}|| where h_i is the weight for the i-th group 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels or regression target 17 | z1: {float} 18 | regularization parameter of L1 norm for each element 19 | z2: {float} 20 | regularization parameter of L2 norm for the non-overlapping group 21 | idx: {numpy array}, shape (3, n_nodes) 22 | 3*nodes matrix, where nodes denotes the number of groups 23 | idx[1,:] contains the starting index of a group 24 | idx[2,: contains the ending index of a group 25 | idx[3,:] contains the corresponding weight (w_{j}) 26 | kwargs: {dictionary} 27 | verbose: {boolean} 28 | True if user want to print out the objective function value in each iteration, false if not 29 | 30 | Output 31 | ------ 32 | w: {numpy array}, shape (n_features, ) 33 | weight matrix 34 | obj: {numpy array}, shape (n_iterations, ) 35 | objective function value during 
iterations 36 | value_gamma: {numpy array}, shape (n_iterations, ) 37 | suitable step size during iterations 38 | 39 | Reference 40 | --------- 41 | Liu, Jun, et al. "Moreau-Yosida Regularization for Grouped Tree Structure Learning." NIPS. 2010. 42 | Liu, Jun, et al. "SLEP: Sparse Learning with Efficient Projections." http://www.public.asu.edu/~jye02/Software/SLEP, 2009. 43 | """ 44 | if 'verbose' not in kwargs: 45 | verbose = False 46 | else: 47 | verbose = kwargs['verbose'] 48 | 49 | # starting point initialization 50 | n_samples, n_features = X.shape 51 | 52 | # compute X'y 53 | Xty = np.dot(np.transpose(X), y) 54 | 55 | # initialize a starting point 56 | w = np.zeros(n_features) 57 | 58 | # compute Xw = X*w 59 | Xw = np.dot(X, w) 60 | 61 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 62 | # initialize step size gamma = 1 63 | gamma = 1 64 | 65 | # assign wp with w, and Xwp with Xw 66 | Xwp = Xw 67 | wwp = np.zeros(n_features) 68 | alphap = 0 69 | alpha = 1 70 | 71 | # indicates whether the gradient step only changes a little 72 | flag = False 73 | 74 | max_iter = 1000 75 | value_gamma = np.zeros(max_iter) 76 | obj = np.zeros(max_iter) 77 | for iter_step in range(max_iter): 78 | # step1: compute search point s based on wp and w (with beta) 79 | beta = (alphap-1)/alpha 80 | s = w + beta*wwp 81 | 82 | # step2: line search for gamma and compute the new approximation solution w 83 | Xs = Xw + beta*(Xw - Xwp) 84 | # compute X'* Xs 85 | XtXs = np.dot(np.transpose(X), Xs) 86 | # obtain the gradient g 87 | G = XtXs - Xty 88 | # copy w and Xw to wp and Xwp 89 | wp = w 90 | Xwp = Xw 91 | 92 | while True: 93 | # let s walk in a step in the antigradient of s to get v and then do the L1/L2-norm regularized projection 94 | v = s - G/gamma 95 | # tree overlapping group lasso projection 96 | n_nodes = int(idx.shape[1]) 97 | idx_tmp = np.zeros((3, n_nodes+1)) 98 | idx_tmp[0:2, :] = np.concatenate((np.array([[-1], [-1]]), idx[0:2, :]), axis=1) 99 | idx_tmp[2, :] = np.concatenate((np.array([z1/gamma]), z2/gamma*idx[2, :]), axis=1) 100 | w = tree_lasso_projection(v, n_features, idx_tmp, n_nodes+1) 101 | # the difference between the new approximate solution w and the search point s 102 | v = w - s 103 | # compute Xw = X*w 104 | Xw = np.dot(X, w) 105 | Xv = Xw - Xs 106 | r_sum = np.inner(v, v) 107 | l_sum = np.inner(Xv, Xv) 108 | # determine weather the gradient step makes little improvement 109 | if r_sum <= 1e-20: 110 | flag = True 111 | break 112 | 113 | # the condition is ||Xv||_2^2 <= gamma * ||v||_2^2 114 | if l_sum <= r_sum*gamma: 115 | break 116 | else: 117 | gamma = max(2*gamma, l_sum/r_sum) 118 | value_gamma[iter_step] = gamma 119 | 120 | # step3: update alpha and alphap, and check weather converge 121 | alphap = alpha 122 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 123 | 124 | wwp = w - wp 125 | Xwy = Xw -y 126 | 127 | # calculate the regularization part 128 | idx_tmp = np.zeros((3, n_nodes+1)) 129 | idx_tmp[0:2, :] = np.concatenate((np.array([[-1], [-1]]), idx[0:2, :]), axis=1) 130 | idx_tmp[2, :] = np.concatenate((np.array([z1]), z2*idx[2, :]), axis=1) 131 | tree_norm_val = tree_norm(w, n_features, idx_tmp, n_nodes+1) 132 | 133 | # function value = loss + regularization 134 | obj[iter_step] = np.inner(Xwy, Xwy)/2 + tree_norm_val 135 | 136 | if verbose: 137 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 138 | 139 | if flag is True: 140 | break 141 | 142 | # determine weather converge 143 | if iter_step >= 2 and 
math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 144 | break 145 | 146 | return w, obj, value_gamma 147 | 148 | 149 | -------------------------------------------------------------------------------- /skfeature/function/structure/tree_fs.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | from skfeature.utility.sparse_learning import tree_lasso_projection, tree_norm 4 | 5 | 6 | def tree_fs(X, y, z, idx, **kwargs): 7 | """ 8 | This function implements tree structured group lasso regularization with least square loss, i.e., 9 | min_{w} ||Xw-Y||_2^2 + z\sum_{i}\sum_{j} h_{j}^{i}|||w_{G_{j}^{i}}|| where h_{j}^{i} is the weight for the j-th group 10 | from the i-th level (the root node is in level 0) 11 | 12 | Input 13 | ----- 14 | X: {numpy array}, shape (n_samples, n_features) 15 | input data 16 | y: {numpy array}, shape (n_samples,) 17 | input class labels or regression target 18 | z: {float} 19 | regularization parameter of L2 norm for the non-overlapping group 20 | idx: {numpy array}, shape (3, n_nodes) 21 | 3*nodes matrix, where nodes denotes the number of nodes of the tree 22 | idx(1,:) contains the starting index 23 | idx(2,:) contains the ending index 24 | idx(3,:) contains the corresponding weight (w_{j}) 25 | kwargs: {dictionary} 26 | verbose: {boolean} 27 | True if user want to print out the objective function value in each iteration, false if not 28 | 29 | Output 30 | ------ 31 | w: {numpy array}, shape (n_features,) 32 | weight vector 33 | obj: {numpy array}, shape (n_iterations,) 34 | objective function value during iterations 35 | value_gamma: {numpy array}, shape (n_iterations,) 36 | suitable step size during iterations 37 | 38 | Note for input parameter idx: 39 | (1) For idx, if each entry in w is a leaf node of the tree and the weight for this leaf node are the same, then 40 | idx[0,0] = -1 and idx[1,0] = -1, idx[2,0] denotes the common weight 41 | (2) In idx, the features of the left tree is smaller than the right tree (idx[0,i] is always smaller than idx[1,i]) 42 | 43 | Reference: 44 | Liu, Jun, et al. "Moreau-Yosida Regularization for Grouped Tree Structure Learning." NIPS. 2010. 45 | Liu, Jun, et al. "SLEP: Sparse Learning with Efficient Projections." http://www.public.asu.edu/~jye02/Software/SLEP, 2009. 
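Illustrative sketch of the 3 x n_nodes idx layout described in the Note above (the concrete group boundaries and sqrt-of-group-size weights are assumptions, and whether group indices are 0- or 1-based is determined by tree_lasso_projection in skfeature.utility.sparse_learning, so check that helper before use):
import numpy as np
# one column per tree node: row 0 = start index, row 1 = end index, row 2 = weight h_j^i;
# the first column uses the (-1, -1, weight) convention for the level where every feature is its own leaf
idx = np.array([[-1.0, 1.0, 4.0],
                [-1.0, 3.0, 6.0],
                [ 1.0, np.sqrt(3.0), np.sqrt(3.0)]])   # two internal groups of 3 features each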
46 | """ 47 | 48 | if 'verbose' not in kwargs: 49 | verbose = False 50 | else: 51 | verbose = kwargs['verbose'] 52 | 53 | # starting point initialization 54 | n_samples, n_features = X.shape 55 | 56 | # compute X'y 57 | Xty = np.dot(np.transpose(X), y) 58 | 59 | # initialize a starting point 60 | w = np.zeros(n_features) 61 | 62 | # compute Xw = X*w 63 | Xw = np.dot(X, w) 64 | 65 | # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent 66 | # initialize step size gamma = 1 67 | gamma = 1 68 | 69 | # assign wp with w, and Xwp with Xw 70 | Xwp = Xw 71 | wwp = np.zeros(n_features) 72 | alphap = 0 73 | alpha = 1 74 | 75 | # indicates whether the gradient step only changes a little 76 | flag = False 77 | 78 | max_iter = 1000 79 | value_gamma = np.zeros(max_iter) 80 | obj = np.zeros(max_iter) 81 | for iter_step in range(max_iter): 82 | # step1: compute search point s based on wp and w (with beta) 83 | beta = (alphap-1)/alpha 84 | s = w + beta*wwp 85 | 86 | # step2: line search for gamma and compute the new approximation solution w 87 | Xs = Xw + beta*(Xw - Xwp) 88 | # compute X'* Xs 89 | XtXs = np.dot(np.transpose(X), Xs) 90 | 91 | # obtain the gradient g 92 | G = XtXs - Xty 93 | 94 | # copy w and Xw to wp and Xwp 95 | wp = w 96 | Xwp = Xw 97 | 98 | while True: 99 | # let s walk in a step in the antigradient of s to get v and then do the L1/L2-norm regularized projection 100 | v = s - G/gamma 101 | # tree overlapping group lasso projection 102 | n_nodes = int(idx.shape[1]) 103 | idx_tmp = idx.copy() 104 | idx_tmp[2, :] = idx[2, :] * z / gamma 105 | w = tree_lasso_projection(v, n_features, idx_tmp, n_nodes) 106 | # the difference between the new approximate solution w and the search point s 107 | v = w - s 108 | # compute Xw = X*w 109 | Xw = np.dot(X, w) 110 | Xv = Xw - Xs 111 | r_sum = np.inner(v, v) 112 | l_sum = np.inner(Xv, Xv) 113 | # determine weather the gradient step makes little improvement 114 | if r_sum <= 1e-20: 115 | flag = True 116 | break 117 | 118 | # the condition is ||Xv||_2^2 <= gamma * ||v||_2^2 119 | if l_sum <= r_sum*gamma: 120 | break 121 | else: 122 | gamma = max(2*gamma, l_sum/r_sum) 123 | value_gamma[iter_step] = gamma 124 | 125 | # step3: update alpha and alphap, and check weather converge 126 | alphap = alpha 127 | alpha = (1+math.sqrt(4*alpha*alpha+1))/2 128 | 129 | wwp = w - wp 130 | Xwy = Xw -y 131 | # calculate the regularization part 132 | tree_norm_val = tree_norm(w, n_features, idx, n_nodes) 133 | 134 | # function value = loss + regularization 135 | obj[iter_step] = np.inner(Xwy, Xwy)/2 + z*tree_norm_val 136 | 137 | if verbose: 138 | print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step])) 139 | 140 | if flag is True: 141 | break 142 | 143 | # determine whether converge 144 | if iter_step >= 2 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3: 145 | break 146 | 147 | return w, obj, value_gamma 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/function/wrapper/__init__.py -------------------------------------------------------------------------------- /skfeature/function/wrapper/decision_tree_backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from 
sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def decision_tree_backward(X, y, n_selected_features): 8 | """ 9 | This function implements the backward feature selection algorithm based on decision tree 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features : {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose decision tree as the classifier 30 | clf = DecisionTreeClassifier() 31 | 32 | # selected feature set, initialized to contain all features 33 | F = range(n_features) 34 | count = n_features 35 | 36 | while count > n_selected_features: 37 | max_acc = 0 38 | for i in range(n_features): 39 | if i in F: 40 | F.remove(i) 41 | X_tmp = X[:, F] 42 | acc = 0 43 | for train, test in cv: 44 | clf.fit(X_tmp[train], y[train]) 45 | y_predict = clf.predict(X_tmp[test]) 46 | acc_tmp = accuracy_score(y[test], y_predict) 47 | acc += acc_tmp 48 | acc = float(acc)/10 49 | F.append(i) 50 | # record the feature which results in the largest accuracy 51 | if acc > max_acc: 52 | max_acc = acc 53 | idx = i 54 | # delete the feature which results in the largest accuracy 55 | F.remove(idx) 56 | count -= 1 57 | return np.array(F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/decision_tree_forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.tree import DecisionTreeClassifier 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def decision_tree_forward(X, y, n_selected_features): 8 | """ 9 | This function implements the forward feature selection algorithm based on decision tree 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples, ) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features,) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose decision tree as the classifier 30 | clf = DecisionTreeClassifier() 31 | 32 | # selected feature set, initialized to be empty 33 | F = [] 34 | count = 0 35 | while count < n_selected_features: 36 | max_acc = 0 37 | for i in range(n_features): 38 | if i not in F: 39 | F.append(i) 40 | X_tmp = X[:, F] 41 | acc = 0 42 | for train, test in cv: 43 | clf.fit(X_tmp[train], y[train]) 44 | y_predict = clf.predict(X_tmp[test]) 45 | acc_tmp = accuracy_score(y[test], y_predict) 46 | acc += acc_tmp 47 | acc = float(acc)/10 48 | F.pop() 49 | # record the feature which results in the largest accuracy 50 | if acc > max_acc: 51 | max_acc = acc 52 | idx = i 53 | # add the feature which results in the largest accuracy 54 | F.append(idx) 55 | count += 1 56 | return np.array(F) 57 | 58 | -------------------------------------------------------------------------------- 
/skfeature/function/wrapper/svm_backward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.svm import SVC 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def svm_backward(X, y, n_selected_features): 8 | """ 9 | This function implements the backward feature selection algorithm based on SVM 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose SVM as the classifier 30 | clf = SVC() 31 | 32 | # selected feature set, initialized to contain all features 33 | F = range(n_features) 34 | count = n_features 35 | 36 | while count > n_selected_features: 37 | max_acc = 0 38 | for i in range(n_features): 39 | if i in F: 40 | F.remove(i) 41 | X_tmp = X[:, F] 42 | acc = 0 43 | for train, test in cv: 44 | clf.fit(X_tmp[train], y[train]) 45 | y_predict = clf.predict(X_tmp[test]) 46 | acc_tmp = accuracy_score(y[test], y_predict) 47 | acc += acc_tmp 48 | acc = float(acc)/10 49 | F.append(i) 50 | # record the feature which results in the largest accuracy 51 | if acc > max_acc: 52 | max_acc = acc 53 | idx = i 54 | # delete the feature which results in the largest accuracy 55 | F.remove(idx) 56 | count -= 1 57 | return np.array(F) 58 | 59 | 60 | -------------------------------------------------------------------------------- /skfeature/function/wrapper/svm_forward.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.svm import SVC 3 | from sklearn.model_selection import KFold 4 | from sklearn.metrics import accuracy_score 5 | 6 | 7 | def svm_forward(X, y, n_selected_features): 8 | """ 9 | This function implements the forward feature selection algorithm based on SVM 10 | 11 | Input 12 | ----- 13 | X: {numpy array}, shape (n_samples, n_features) 14 | input data 15 | y: {numpy array}, shape (n_samples,) 16 | input class labels 17 | n_selected_features: {int} 18 | number of selected features 19 | 20 | Output 21 | ------ 22 | F: {numpy array}, shape (n_features, ) 23 | index of selected features 24 | """ 25 | 26 | n_samples, n_features = X.shape 27 | # using 10 fold cross validation 28 | cv = KFold(n_samples, n_folds=10, shuffle=True) 29 | # choose SVM as the classifier 30 | clf = SVC() 31 | 32 | # selected feature set, initialized to be empty 33 | F = [] 34 | count = 0 35 | while count < n_selected_features: 36 | max_acc = 0 37 | for i in range(n_features): 38 | if i not in F: 39 | F.append(i) 40 | X_tmp = X[:, F] 41 | acc = 0 42 | for train, test in cv: 43 | clf.fit(X_tmp[train], y[train]) 44 | y_predict = clf.predict(X_tmp[test]) 45 | acc_tmp = accuracy_score(y[test], y_predict) 46 | acc += acc_tmp 47 | acc = float(acc)/10 48 | F.pop() 49 | # record the feature which results in the largest accuracy 50 | if acc > max_acc: 51 | max_acc = acc 52 | idx = i 53 | # add the feature which results in the largest accuracy 54 | F.append(idx) 55 | count += 1 56 | return np.array(F) -------------------------------------------------------------------------------- /skfeature/utility/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jundongl/scikit-feature/48cffad4e88ff4b9d2f1c7baffb314d1b3303792/skfeature/utility/__init__.py -------------------------------------------------------------------------------- /skfeature/utility/construct_W.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.sparse import * 3 | from sklearn.metrics.pairwise import pairwise_distances 4 | 5 | 6 | def construct_W(X, **kwargs): 7 | """ 8 | Construct the affinity matrix W through different ways 9 | 10 | Notes 11 | ----- 12 | if kwargs is null, use the default parameter settings; 13 | if kwargs is not null, construct the affinity matrix according to parameters in kwargs 14 | 15 | Input 16 | ----- 17 | X: {numpy array}, shape (n_samples, n_features) 18 | input data 19 | kwargs: {dictionary} 20 | parameters to construct different affinity matrix W: 21 | y: {numpy array}, shape (n_samples, 1) 22 | the true label information needed under the 'supervised' neighbor mode 23 | metric: {string} 24 | choices for different distance measures 25 | 'euclidean' - use euclidean distance 26 | 'cosine' - use cosine distance (default) 27 | neighbor_mode: {string} 28 | indicates how to construct the graph 29 | 'knn' - put an edge between two nodes if and only if they are among the 30 | k nearest neighbors of each other (default) 31 | 'supervised' - put an edge between two nodes if they belong to same class 32 | and they are among the k nearest neighbors of each other 33 | weight_mode: {string} 34 | indicates how to assign weights for each edge in the graph 35 | 'binary' - 0-1 weighting, every edge receives weight of 1 (default) 36 | 'heat_kernel' - if nodes i and j are connected, put weight W_ij = exp(-norm(x_i - x_j)/2t^2) 37 | this weight mode can only be used under 'euclidean' metric and you are required 38 | to provide the parameter t 39 | 'cosine' - if nodes i and j are connected, put weight cosine(x_i,x_j). 40 | this weight mode can only be used under 'cosine' metric 41 | k: {int} 42 | choices for the number of neighbors (default k = 5) 43 | t: {float} 44 | parameter for the 'heat_kernel' weight_mode 45 | fisher_score: {boolean} 46 | indicates whether to build the affinity matrix in a fisher score way, in which W_ij = 1/n_l if yi = yj = l; 47 | otherwise W_ij = 0 (default fisher_score = false) 48 | reliefF: {boolean} 49 | indicates whether to build the affinity matrix in a reliefF way, NH(x) and NM(x,y) denotes a set of 50 | k nearest points to x with the same class as x, and a different class (the class y), respectively. 
51 | W_ij = 1 if i = j; W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) (default reliefF = false) 52 | 53 | Output 54 | ------ 55 | W: {sparse matrix}, shape (n_samples, n_samples) 56 | output affinity matrix W 57 | """ 58 | 59 | # default metric is 'cosine' 60 | if 'metric' not in kwargs.keys(): 61 | kwargs['metric'] = 'cosine' 62 | 63 | # default neighbor mode is 'knn' and default neighbor size is 5 64 | if 'neighbor_mode' not in kwargs.keys(): 65 | kwargs['neighbor_mode'] = 'knn' 66 | if kwargs['neighbor_mode'] == 'knn' and 'k' not in kwargs.keys(): 67 | kwargs['k'] = 5 68 | if kwargs['neighbor_mode'] == 'supervised' and 'k' not in kwargs.keys(): 69 | kwargs['k'] = 5 70 | if kwargs['neighbor_mode'] == 'supervised' and 'y' not in kwargs.keys(): 71 | print ('Warning: label is required in the supervised neighborMode!!!') 72 | exit(0) 73 | 74 | # default weight mode is 'binary', default t in heat kernel mode is 1 75 | if 'weight_mode' not in kwargs.keys(): 76 | kwargs['weight_mode'] = 'binary' 77 | if kwargs['weight_mode'] == 'heat_kernel': 78 | if kwargs['metric'] != 'euclidean': 79 | kwargs['metric'] = 'euclidean' 80 | if 't' not in kwargs.keys(): 81 | kwargs['t'] = 1 82 | elif kwargs['weight_mode'] == 'cosine': 83 | if kwargs['metric'] != 'cosine': 84 | kwargs['metric'] = 'cosine' 85 | 86 | # default fisher_score and reliefF mode are 'false' 87 | if 'fisher_score' not in kwargs.keys(): 88 | kwargs['fisher_score'] = False 89 | if 'reliefF' not in kwargs.keys(): 90 | kwargs['reliefF'] = False 91 | 92 | n_samples, n_features = np.shape(X) 93 | 94 | # choose 'knn' neighbor mode 95 | if kwargs['neighbor_mode'] == 'knn': 96 | k = kwargs['k'] 97 | if kwargs['weight_mode'] == 'binary': 98 | if kwargs['metric'] == 'euclidean': 99 | # compute pairwise euclidean distances 100 | D = pairwise_distances(X) 101 | D **= 2 102 | # sort the distance matrix D in ascending order 103 | dump = np.sort(D, axis=1) 104 | idx = np.argsort(D, axis=1) 105 | # choose the k-nearest neighbors for each instance 106 | idx_new = idx[:, 0:k+1] 107 | G = np.zeros((n_samples*(k+1), 3)) 108 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 109 | G[:, 1] = np.ravel(idx_new, order='F') 110 | G[:, 2] = 1 111 | # build the sparse affinity matrix W 112 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 113 | bigger = np.transpose(W) > W 114 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 115 | return W 116 | 117 | elif kwargs['metric'] == 'cosine': 118 | # normalize the data first 119 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 120 | for i in range(n_samples): 121 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 122 | # compute pairwise cosine distances 123 | D_cosine = np.dot(X, np.transpose(X)) 124 | # sort the distance matrix D in descending order 125 | dump = np.sort(-D_cosine, axis=1) 126 | idx = np.argsort(-D_cosine, axis=1) 127 | idx_new = idx[:, 0:k+1] 128 | G = np.zeros((n_samples*(k+1), 3)) 129 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 130 | G[:, 1] = np.ravel(idx_new, order='F') 131 | G[:, 2] = 1 132 | # build the sparse affinity matrix W 133 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 134 | bigger = np.transpose(W) > W 135 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 136 | return W 137 | 138 | elif kwargs['weight_mode'] == 'heat_kernel': 139 | t = kwargs['t'] 140 | # compute pairwise euclidean distances 141 | D = pairwise_distances(X) 142 | D **= 2 143 | # 
sort the distance matrix D in ascending order 144 | dump = np.sort(D, axis=1) 145 | idx = np.argsort(D, axis=1) 146 | idx_new = idx[:, 0:k+1] 147 | dump_new = dump[:, 0:k+1] 148 | # compute the pairwise heat kernel distances 149 | dump_heat_kernel = np.exp(-dump_new/(2*t*t)) 150 | G = np.zeros((n_samples*(k+1), 3)) 151 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 152 | G[:, 1] = np.ravel(idx_new, order='F') 153 | G[:, 2] = np.ravel(dump_heat_kernel, order='F') 154 | # build the sparse affinity matrix W 155 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 156 | bigger = np.transpose(W) > W 157 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 158 | return W 159 | 160 | elif kwargs['weight_mode'] == 'cosine': 161 | # normalize the data first 162 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 163 | for i in range(n_samples): 164 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 165 | # compute pairwise cosine distances 166 | D_cosine = np.dot(X, np.transpose(X)) 167 | # sort the distance matrix D in ascending order 168 | dump = np.sort(-D_cosine, axis=1) 169 | idx = np.argsort(-D_cosine, axis=1) 170 | idx_new = idx[:, 0:k+1] 171 | dump_new = -dump[:, 0:k+1] 172 | G = np.zeros((n_samples*(k+1), 3)) 173 | G[:, 0] = np.tile(np.arange(n_samples), (k+1, 1)).reshape(-1) 174 | G[:, 1] = np.ravel(idx_new, order='F') 175 | G[:, 2] = np.ravel(dump_new, order='F') 176 | # build the sparse affinity matrix W 177 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 178 | bigger = np.transpose(W) > W 179 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 180 | return W 181 | 182 | # choose supervised neighborMode 183 | elif kwargs['neighbor_mode'] == 'supervised': 184 | k = kwargs['k'] 185 | # get true labels and the number of classes 186 | y = kwargs['y'] 187 | label = np.unique(y) 188 | n_classes = np.unique(y).size 189 | # construct the weight matrix W in a fisherScore way, W_ij = 1/n_l if yi = yj = l, otherwise W_ij = 0 190 | if kwargs['fisher_score'] is True: 191 | W = lil_matrix((n_samples, n_samples)) 192 | for i in range(n_classes): 193 | class_idx = (y == label[i]) 194 | class_idx_all = (class_idx[:, np.newaxis] & class_idx[np.newaxis, :]) 195 | W[class_idx_all] = 1.0/np.sum(np.sum(class_idx)) 196 | return W 197 | 198 | # construct the weight matrix W in a reliefF way, NH(x) and NM(x,y) denotes a set of k nearest 199 | # points to x with the same class as x, a different class (the class y), respectively. 
W_ij = 1 if i = j; 200 | # W_ij = 1/k if x_j \in NH(x_i); W_ij = -1/(c-1)k if x_j \in NM(x_i, y) 201 | if kwargs['reliefF'] is True: 202 | # when xj in NH(xi) 203 | G = np.zeros((n_samples*(k+1), 3)) 204 | id_now = 0 205 | for i in range(n_classes): 206 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 207 | D = pairwise_distances(X[class_idx, :]) 208 | D **= 2 209 | idx = np.argsort(D, axis=1) 210 | idx_new = idx[:, 0:k+1] 211 | n_smp_class = (class_idx[idx_new[:]]).size 212 | if len(class_idx) <= k: 213 | k = len(class_idx) - 1 214 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 215 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 216 | G[id_now:n_smp_class+id_now, 2] = 1.0/k 217 | id_now += n_smp_class 218 | W1 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 219 | # when i = j, W_ij = 1 220 | for i in range(n_samples): 221 | W1[i, i] = 1 222 | # when x_j in NM(x_i, y) 223 | G = np.zeros((n_samples*k*(n_classes - 1), 3)) 224 | id_now = 0 225 | for i in range(n_classes): 226 | class_idx1 = np.column_stack(np.where(y == label[i]))[:, 0] 227 | X1 = X[class_idx1, :] 228 | for j in range(n_classes): 229 | if label[j] != label[i]: 230 | class_idx2 = np.column_stack(np.where(y == label[j]))[:, 0] 231 | X2 = X[class_idx2, :] 232 | D = pairwise_distances(X1, X2) 233 | idx = np.argsort(D, axis=1) 234 | idx_new = idx[:, 0:k] 235 | n_smp_class = len(class_idx1)*k 236 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx1, (k, 1)).reshape(-1) 237 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx2[idx_new[:]], order='F') 238 | G[id_now:n_smp_class+id_now, 2] = -1.0/((n_classes-1)*k) 239 | id_now += n_smp_class 240 | W2 = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 241 | bigger = np.transpose(W2) > W2 242 | W2 = W2 - W2.multiply(bigger) + np.transpose(W2).multiply(bigger) 243 | W = W1 + W2 244 | return W 245 | 246 | if kwargs['weight_mode'] == 'binary': 247 | if kwargs['metric'] == 'euclidean': 248 | G = np.zeros((n_samples*(k+1), 3)) 249 | id_now = 0 250 | for i in range(n_classes): 251 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 252 | # compute pairwise euclidean distances for instances in class i 253 | D = pairwise_distances(X[class_idx, :]) 254 | D **= 2 255 | # sort the distance matrix D in ascending order for instances in class i 256 | idx = np.argsort(D, axis=1) 257 | idx_new = idx[:, 0:k+1] 258 | n_smp_class = len(class_idx)*(k+1) 259 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 260 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 261 | G[id_now:n_smp_class+id_now, 2] = 1 262 | id_now += n_smp_class 263 | # build the sparse affinity matrix W 264 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 265 | bigger = np.transpose(W) > W 266 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 267 | return W 268 | 269 | if kwargs['metric'] == 'cosine': 270 | # normalize the data first 271 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 272 | for i in range(n_samples): 273 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 274 | G = np.zeros((n_samples*(k+1), 3)) 275 | id_now = 0 276 | for i in range(n_classes): 277 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 278 | # compute pairwise cosine distances for instances in class i 279 | D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) 280 | # sort the distance matrix D in descending 
order for instances in class i 281 | idx = np.argsort(-D_cosine, axis=1) 282 | idx_new = idx[:, 0:k+1] 283 | n_smp_class = len(class_idx)*(k+1) 284 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 285 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 286 | G[id_now:n_smp_class+id_now, 2] = 1 287 | id_now += n_smp_class 288 | # build the sparse affinity matrix W 289 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 290 | bigger = np.transpose(W) > W 291 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 292 | return W 293 | 294 | elif kwargs['weight_mode'] == 'heat_kernel': 295 | G = np.zeros((n_samples*(k+1), 3)) 296 | id_now = 0 297 | for i in range(n_classes): 298 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 299 | # compute pairwise cosine distances for instances in class i 300 | D = pairwise_distances(X[class_idx, :]) 301 | D **= 2 302 | # sort the distance matrix D in ascending order for instances in class i 303 | dump = np.sort(D, axis=1) 304 | idx = np.argsort(D, axis=1) 305 | idx_new = idx[:, 0:k+1] 306 | dump_new = dump[:, 0:k+1] 307 | t = kwargs['t'] 308 | # compute pairwise heat kernel distances for instances in class i 309 | dump_heat_kernel = np.exp(-dump_new/(2*t*t)) 310 | n_smp_class = len(class_idx)*(k+1) 311 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 312 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 313 | G[id_now:n_smp_class+id_now, 2] = np.ravel(dump_heat_kernel, order='F') 314 | id_now += n_smp_class 315 | # build the sparse affinity matrix W 316 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 317 | bigger = np.transpose(W) > W 318 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 319 | return W 320 | 321 | elif kwargs['weight_mode'] == 'cosine': 322 | # normalize the data first 323 | X_normalized = np.power(np.sum(X*X, axis=1), 0.5) 324 | for i in range(n_samples): 325 | X[i, :] = X[i, :]/max(1e-12, X_normalized[i]) 326 | G = np.zeros((n_samples*(k+1), 3)) 327 | id_now = 0 328 | for i in range(n_classes): 329 | class_idx = np.column_stack(np.where(y == label[i]))[:, 0] 330 | # compute pairwise cosine distances for instances in class i 331 | D_cosine = np.dot(X[class_idx, :], np.transpose(X[class_idx, :])) 332 | # sort the distance matrix D in descending order for instances in class i 333 | dump = np.sort(-D_cosine, axis=1) 334 | idx = np.argsort(-D_cosine, axis=1) 335 | idx_new = idx[:, 0:k+1] 336 | dump_new = -dump[:, 0:k+1] 337 | n_smp_class = len(class_idx)*(k+1) 338 | G[id_now:n_smp_class+id_now, 0] = np.tile(class_idx, (k+1, 1)).reshape(-1) 339 | G[id_now:n_smp_class+id_now, 1] = np.ravel(class_idx[idx_new[:]], order='F') 340 | G[id_now:n_smp_class+id_now, 2] = np.ravel(dump_new, order='F') 341 | id_now += n_smp_class 342 | # build the sparse affinity matrix W 343 | W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples)) 344 | bigger = np.transpose(W) > W 345 | W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger) 346 | return W -------------------------------------------------------------------------------- /skfeature/utility/data_discretization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.preprocessing 3 | 4 | 5 | def data_discretization(X, n_bins): 6 | """ 7 | This function implements the data discretization function to discrete data 
into n_bins 8 | 9 | Input 10 | ----- 11 | X: {numpy array}, shape (n_samples, n_features) 12 | input data 13 | n_bins: {int} 14 | number of bins to be discretized 15 | 16 | Output 17 | ------ 18 | X_discretized: {numpy array}, shape (n_samples, n_features) 19 | output discretized data, where features are digitized to n_bins 20 | """ 21 | 22 | # normalize each feature 23 | min_max_scaler = sklearn.preprocessing.MinMaxScaler() 24 | X_normalized = min_max_scaler.fit_transform(X) 25 | 26 | # discretize X 27 | n_samples, n_features = X.shape 28 | X_discretized = np.zeros((n_samples, n_features)) 29 | bins = np.linspace(0, 1, n_bins) 30 | for i in range(n_features): 31 | X_discretized[:, i] = np.digitize(X_normalized[:, i], bins) 32 | 33 | return X_discretized 34 | -------------------------------------------------------------------------------- /skfeature/utility/entropy_estimators.py: -------------------------------------------------------------------------------- 1 | # Written by Greg Ver Steeg (http://www.isi.edu/~gregv/npeet.html) 2 | 3 | import scipy.spatial as ss 4 | from scipy.special import digamma 5 | from math import log 6 | import numpy.random as nr 7 | import numpy as np 8 | import random 9 | 10 | 11 | # continuous estimators 12 | 13 | def entropy(x, k=3, base=2): 14 | """ 15 | The classic K-L k-nearest neighbor continuous entropy estimator x should be a list of vectors, 16 | e.g. x = [[1.3],[3.7],[5.1],[2.4]] if x is a one-dimensional scalar and we have four samples 17 | """ 18 | 19 | assert k <= len(x)-1, "Set k smaller than num. samples - 1" 20 | d = len(x[0]) 21 | N = len(x) 22 | intens = 1e-10 # small noise to break degeneracy, see doc. 23 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 24 | tree = ss.cKDTree(x) 25 | nn = [tree.query(point, k+1, p=float('inf'))[0][k] for point in x] 26 | const = digamma(N)-digamma(k) + d*log(2) 27 | return (const + d*np.mean(map(log, nn)))/log(base) 28 | 29 | 30 | def mi(x, y, k=3, base=2): 31 | """ 32 | Mutual information of x and y; x, y should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 33 | if x is a one-dimensional scalar and we have four samples 34 | """ 35 | 36 | assert len(x) == len(y), "Lists should have same length" 37 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 38 | intens = 1e-10 # small noise to break degeneracy, see doc. 39 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 40 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 41 | points = zip2(x, y) 42 | # Find nearest neighbors in joint space, p=inf means max-norm 43 | tree = ss.cKDTree(points) 44 | dvec = [tree.query(point, k+1, p=float('inf'))[0][k] for point in points] 45 | a, b, c, d = avgdigamma(x, dvec), avgdigamma(y, dvec), digamma(k), digamma(len(x)) 46 | return (-a-b+c+d)/log(base) 47 | 48 | 49 | def cmi(x, y, z, k=3, base=2): 50 | """ 51 | Mutual information of x and y, conditioned on z; x, y, z should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 52 | if x is a one-dimensional scalar and we have four samples 53 | """ 54 | 55 | assert len(x) == len(y), "Lists should have same length" 56 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 57 | intens = 1e-10 # small noise to break degeneracy, see doc. 
58 | x = [list(p + intens * nr.rand(len(x[0]))) for p in x] 59 | y = [list(p + intens * nr.rand(len(y[0]))) for p in y] 60 | z = [list(p + intens * nr.rand(len(z[0]))) for p in z] 61 | points = zip2(x, y, z) 62 | # Find nearest neighbors in joint space, p=inf means max-norm 63 | tree = ss.cKDTree(points) 64 | dvec = [tree.query(point, k+1, p=float('inf'))[0][k] for point in points] 65 | a, b, c, d = avgdigamma(zip2(x, z), dvec), avgdigamma(zip2(y, z), dvec), avgdigamma(z, dvec), digamma(k) 66 | return (-a-b+c+d)/log(base) 67 | 68 | 69 | def kldiv(x, xp, k=3, base=2): 70 | """ 71 | KL Divergence between p and q for x~p(x), xp~q(x); x, xp should be a list of vectors, e.g. x = [[1.3],[3.7],[5.1],[2.4]] 72 | if x is a one-dimensional scalar and we have four samples 73 | """ 74 | 75 | assert k <= len(x) - 1, "Set k smaller than num. samples - 1" 76 | assert k <= len(xp) - 1, "Set k smaller than num. samples - 1" 77 | assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." 78 | d = len(x[0]) 79 | n = len(x) 80 | m = len(xp) 81 | const = log(m) - log(n-1) 82 | tree = ss.cKDTree(x) 83 | treep = ss.cKDTree(xp) 84 | nn = [tree.query(point, k+1, p=float('inf'))[0][k] for point in x] 85 | nnp = [treep.query(point, k, p=float('inf'))[0][k-1] for point in x] 86 | return (const + d*np.mean(map(log, nnp))-d*np.mean(map(log, nn)))/log(base) 87 | 88 | 89 | # Discrete estimators 90 | def entropyd(sx, base=2): 91 | """ 92 | Discrete entropy estimator given a list of samples which can be any hashable object 93 | """ 94 | 95 | return entropyfromprobs(hist(sx), base=base) 96 | 97 | 98 | def midd(x, y): 99 | """ 100 | Discrete mutual information estimator given a list of samples which can be any hashable object 101 | """ 102 | 103 | return -entropyd(list(zip(x, y)))+entropyd(x)+entropyd(y) 104 | 105 | 106 | def cmidd(x, y, z): 107 | """ 108 | Discrete mutual information estimator given a list of samples which can be any hashable object 109 | """ 110 | 111 | return entropyd(list(zip(y, z)))+entropyd(list(zip(x, z)))-entropyd(list(zip(x, y, z)))-entropyd(z) 112 | 113 | 114 | def hist(sx): 115 | # Histogram from list of samples 116 | d = dict() 117 | for s in sx: 118 | d[s] = d.get(s, 0) + 1 119 | return map(lambda z: float(z)/len(sx), d.values()) 120 | 121 | 122 | def entropyfromprobs(probs, base=2): 123 | # Turn a normalized list of probabilities of discrete outcomes into entropy (base 2) 124 | return -sum(map(elog, probs))/log(base) 125 | 126 | 127 | def elog(x): 128 | # for entropy, 0 log 0 = 0. but we get an error for putting log 0 129 | if x <= 0. or x >= 1.: 130 | return 0 131 | else: 132 | return x*log(x) 133 | 134 | 135 | # Mixed estimators 136 | def micd(x, y, k=3, base=2, warning=True): 137 | """ If x is continuous and y is discrete, compute mutual information 138 | """ 139 | 140 | overallentropy = entropy(x, k, base) 141 | n = len(y) 142 | word_dict = dict() 143 | for sample in y: 144 | word_dict[sample] = word_dict.get(sample, 0) + 1./n 145 | yvals = list(set(word_dict.keys())) 146 | 147 | mi = overallentropy 148 | for yval in yvals: 149 | xgiveny = [x[i] for i in range(n) if y[i] == yval] 150 | if k <= len(xgiveny) - 1: 151 | mi -= word_dict[yval]*entropy(xgiveny, k, base) 152 | else: 153 | if warning: 154 | print("Warning, after conditioning, on y={0} insufficient data. 
Assuming maximal entropy in this case.".format(yval)) 155 | mi -= word_dict[yval]*overallentropy 156 | return mi # units already applied 157 | 158 | 159 | # Utility functions 160 | def vectorize(scalarlist): 161 | """ 162 | Turn a list of scalars into a list of one-d vectors 163 | """ 164 | 165 | return [(x,) for x in scalarlist] 166 | 167 | 168 | def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): 169 | """ 170 | Shuffle test 171 | Repeatedly shuffle the x-values and then estimate measure(x,y,[z]). 172 | Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs, 'measure' could me mi,cmi, 173 | e.g. Keyword arguments can be passed. Mutual information and CMI should have a mean near zero. 174 | """ 175 | 176 | xp = x[:] # A copy that we can shuffle 177 | outputs = [] 178 | for i in range(ns): 179 | random.shuffle(xp) 180 | if z: 181 | outputs.append(measure(xp, y, z, **kwargs)) 182 | else: 183 | outputs.append(measure(xp, y, **kwargs)) 184 | outputs.sort() 185 | return np.mean(outputs), (outputs[int((1.-ci)/2*ns)], outputs[int((1.+ci)/2*ns)]) 186 | 187 | 188 | # Internal functions 189 | def avgdigamma(points, dvec): 190 | # This part finds number of neighbors in some radius in the marginal space 191 | # returns expectation value of 192 | N = len(points) 193 | tree = ss.cKDTree(points) 194 | avg = 0. 195 | for i in range(N): 196 | dist = dvec[i] 197 | # subtlety, we don't include the boundary point, 198 | # but we are implicitly adding 1 to kraskov def bc center point is included 199 | num_points = len(tree.query_ball_point(points[i], dist-1e-15, p=float('inf'))) 200 | avg += digamma(num_points)/N 201 | return avg 202 | 203 | 204 | def zip2(*args): 205 | # zip2(x,y) takes the lists of vectors and makes it a list of vectors in a joint space 206 | # E.g. 
zip2([[1],[2],[3]],[[4],[5],[6]]) = [[1,4],[2,5],[3,6]] 207 | return [sum(sublist, []) for sublist in zip(*args)] 208 | -------------------------------------------------------------------------------- /skfeature/utility/mutual_information.py: -------------------------------------------------------------------------------- 1 | import skfeature.utility.entropy_estimators as ee 2 | 3 | 4 | def information_gain(f1, f2): 5 | """ 6 | This function calculates the information gain, where ig(f1,f2) = H(f1) - H(f1|f2) 7 | 8 | Input 9 | ----- 10 | f1: {numpy array}, shape (n_samples,) 11 | f2: {numpy array}, shape (n_samples,) 12 | 13 | Output 14 | ------ 15 | ig: {float} 16 | """ 17 | 18 | ig = ee.entropyd(f1) - conditional_entropy(f1, f2) 19 | return ig 20 | 21 | 22 | def conditional_entropy(f1, f2): 23 | """ 24 | This function calculates the conditional entropy, where ce = H(f1) - I(f1;f2) 25 | 26 | Input 27 | ----- 28 | f1: {numpy array}, shape (n_samples,) 29 | f2: {numpy array}, shape (n_samples,) 30 | 31 | Output 32 | ------ 33 | ce: {float} 34 | ce is conditional entropy of f1 and f2 35 | """ 36 | 37 | ce = ee.entropyd(f1) - ee.midd(f1, f2) 38 | return ce 39 | 40 | 41 | def su_calculation(f1, f2): 42 | """ 43 | This function calculates the symmetrical uncertainty, where su(f1,f2) = 2*IG(f1,f2)/(H(f1)+H(f2)) 44 | 45 | Input 46 | ----- 47 | f1: {numpy array}, shape (n_samples,) 48 | f2: {numpy array}, shape (n_samples,) 49 | 50 | Output 51 | ------ 52 | su: {float} 53 | su is the symmetrical uncertainty of f1 and f2 54 | 55 | """ 56 | 57 | # calculate information gain of f1 and f2, t1 = ig(f1,f2) 58 | t1 = information_gain(f1, f2) 59 | # calculate entropy of f1, t2 = H(f1) 60 | t2 = ee.entropyd(f1) 61 | # calculate entropy of f2, t3 = H(f2) 62 | t3 = ee.entropyd(f2) 63 | # su(f1,f2) = 2*t1/(t2+t3) 64 | su = 2.0*t1/(t2+t3) 65 | 66 | return su 67 | -------------------------------------------------------------------------------- /skfeature/utility/sparse_learning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | 4 | 5 | def feature_ranking(W): 6 | """ 7 | This function ranks features according to the feature weights matrix W 8 | 9 | Input: 10 | ----- 11 | W: {numpy array}, shape (n_features, n_classes) 12 | feature weights matrix 13 | 14 | Output: 15 | ------ 16 | idx: {numpy array}, shape {n_features,} 17 | feature index ranked in descending order by feature importance 18 | """ 19 | T = (W*W).sum(1) 20 | idx = np.argsort(T, 0) 21 | return idx[::-1] 22 | 23 | 24 | def generate_diagonal_matrix(U): 25 | """ 26 | This function generates a diagonal matrix D from an input matrix U as D_ii = 0.5 / ||U[i,:]|| 27 | 28 | Input: 29 | ----- 30 | U: {numpy array}, shape (n_samples, n_features) 31 | 32 | Output: 33 | ------ 34 | D: {numpy array}, shape (n_samples, n_samples) 35 | """ 36 | temp = np.sqrt(np.multiply(U, U).sum(1)) 37 | temp[temp < 1e-16] = 1e-16 38 | temp = 0.5 / temp 39 | D = np.diag(temp) 40 | return D 41 | 42 | 43 | def calculate_l21_norm(X): 44 | """ 45 | This function calculates the l21 norm of a matrix X, i.e., \sum ||X[i,:]||_2 46 | 47 | Input: 48 | ----- 49 | X: {numpy array}, shape (n_samples, n_features) 50 | 51 | Output: 52 | ------ 53 | l21_norm: {float} 54 | """ 55 | return (np.sqrt(np.multiply(X, X).sum(1))).sum() 56 | 57 | 58 | def construct_label_matrix(label): 59 | """ 60 | This function converts a 1d numpy array to a 2d array, for each instance, the class label is 1 or 0 61 | 62 | 
Input: 63 | ----- 64 | label: {numpy array}, shape(n_samples,) 65 | 66 | Output: 67 | ------ 68 | label_matrix: {numpy array}, shape(n_samples, n_classes) 69 | """ 70 | 71 | n_samples = label.shape[0] 72 | unique_label = np.unique(label) 73 | n_classes = unique_label.shape[0] 74 | label_matrix = np.zeros((n_samples, n_classes)) 75 | for i in range(n_classes): 76 | label_matrix[label == unique_label[i], i] = 1 77 | 78 | return label_matrix.astype(int) 79 | 80 | 81 | def construct_label_matrix_pan(label): 82 | """ 83 | This function converts a 1d numpy array to a 2d array, for each instance, the class label is 1 or -1 84 | 85 | Input: 86 | ----- 87 | label: {numpy array}, shape(n_samples,) 88 | 89 | Output: 90 | ------ 91 | label_matrix: {numpy array}, shape(n_samples, n_classes) 92 | """ 93 | n_samples = label.shape[0] 94 | unique_label = np.unique(label) 95 | n_classes = unique_label.shape[0] 96 | label_matrix = np.zeros((n_samples, n_classes)) 97 | for i in range(n_classes): 98 | label_matrix[label == unique_label[i], i] = 1 99 | label_matrix[label_matrix == 0] = -1 100 | 101 | return label_matrix.astype(int) 102 | 103 | 104 | def euclidean_projection(V, n_features, n_classes, z, gamma): 105 | """ 106 | L2 Norm regularized euclidean projection min_W 1/2 ||W- V||_2^2 + z * ||W||_2 107 | """ 108 | W_projection = np.zeros((n_features, n_classes)) 109 | for i in range(n_features): 110 | if LA.norm(V[i, :]) > z/gamma: 111 | W_projection[i, :] = (1-z/(gamma*LA.norm(V[i, :])))*V[i, :] 112 | else: 113 | W_projection[i, :] = np.zeros(n_classes) 114 | return W_projection 115 | 116 | 117 | def tree_lasso_projection(v, n_features, idx, n_nodes): 118 | """ 119 | This functions solves the following optimization problem min_w 1/2 ||w-v||_2^2 + \sum z_i||w_{G_{i}}|| 120 | where w and v are of dimensions of n_features; z_i >=0, and G_{i} follows the tree structure 121 | """ 122 | # test whether the first node is special 123 | if idx[0, 0] == -1 and idx[1, 0] == -1: 124 | w_projection = np.zeros(n_features) 125 | z = idx[2, 0] 126 | for j in range(n_features): 127 | if v[j] > z: 128 | w_projection[j] = v[j] - z 129 | else: 130 | if v[j] < -z: 131 | w_projection[j] = v[j] + z 132 | else: 133 | w_projection[j] = 0 134 | i = 1 135 | 136 | else: 137 | w = v.copy() 138 | i = 0 139 | 140 | # sequentially process each node 141 | while i < n_nodes: 142 | # compute the L2 norm of this group 143 | two_norm = 0 144 | start_idx = int(idx[0, i] - 1) 145 | end_idx = int(idx[1, i]) 146 | for j in range(start_idx, end_idx): 147 | two_norm += w_projection[j] * w_projection[j] 148 | two_norm = np.sqrt(two_norm) 149 | z = idx[2, i] 150 | if two_norm > z: 151 | ratio = (two_norm - z) / two_norm 152 | # shrinkage this group by ratio 153 | for j in range(start_idx, end_idx): 154 | w_projection[j] *= ratio 155 | else: 156 | for j in range(start_idx, end_idx): 157 | w_projection[j] = 0 158 | i += 1 159 | return w_projection 160 | 161 | 162 | def tree_norm(w, n_features, idx, n_nodes): 163 | """ 164 | This function computes \sum z_i||w_{G_{i}}|| 165 | """ 166 | obj = 0 167 | # test whether the first node is special 168 | if idx[0, 0] == -1 and idx[1, 0] == -1: 169 | z = idx[2, 0] 170 | for j in range(n_features): 171 | obj += np.abs(w[j]) 172 | obj *= z 173 | i = 1 174 | else: 175 | i = 0 176 | 177 | # sequentially process each node 178 | while i < n_nodes: 179 | two_norm = 0 180 | start_idx = int(idx[0, i] - 1) 181 | end_idx = int(idx[1, i]) 182 | for j in range(start_idx, end_idx): 183 | two_norm += w[j] * w[j] 184 | two_norm = 
np.sqrt(two_norm) 185 | z = idx[2, i] 186 | obj += z*two_norm 187 | i += 1 188 | return obj 189 | 190 | -------------------------------------------------------------------------------- /skfeature/utility/unsupervised_evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.utils.linear_assignment_ as la 3 | from sklearn.metrics import accuracy_score 4 | from sklearn.metrics.cluster import normalized_mutual_info_score 5 | from sklearn.cluster import KMeans 6 | 7 | 8 | def best_map(l1, l2): 9 | """ 10 | Permute labels of l2 to match l1 as much as possible 11 | """ 12 | if len(l1) != len(l2): 13 | print("L1.shape must == L2.shape") 14 | exit(0) 15 | 16 | label1 = np.unique(l1) 17 | n_class1 = len(label1) 18 | 19 | label2 = np.unique(l2) 20 | n_class2 = len(label2) 21 | 22 | n_class = max(n_class1, n_class2) 23 | G = np.zeros((n_class, n_class)) 24 | 25 | for i in range(0, n_class1): 26 | for j in range(0, n_class2): 27 | ss = l1 == label1[i] 28 | tt = l2 == label2[j] 29 | G[i, j] = np.count_nonzero(ss & tt) 30 | 31 | A = la.linear_assignment(-G) 32 | 33 | new_l2 = np.zeros(l2.shape) 34 | for i in range(0, n_class2): 35 | new_l2[l2 == label2[A[i][1]]] = label1[A[i][0]] 36 | return new_l2.astype(int) 37 | 38 | 39 | def evaluation(X_selected, n_clusters, y): 40 | """ 41 | This function calculates ARI, ACC and NMI of clustering results 42 | 43 | Input 44 | ----- 45 | X_selected: {numpy array}, shape (n_samples, n_selected_features} 46 | input data on the selected features 47 | n_clusters: {int} 48 | number of clusters 49 | y: {numpy array}, shape (n_samples,) 50 | true labels 51 | 52 | Output 53 | ------ 54 | nmi: {float} 55 | Normalized Mutual Information 56 | acc: {float} 57 | Accuracy 58 | """ 59 | k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, 60 | tol=0.0001, precompute_distances=True, verbose=0, 61 | random_state=None, copy_x=True, n_jobs=1) 62 | 63 | k_means.fit(X_selected) 64 | y_predict = k_means.labels_ 65 | 66 | # calculate NMI 67 | nmi = normalized_mutual_info_score(y, y_predict) 68 | 69 | # calculate ACC 70 | y_permuted_predict = best_map(y, y_predict) 71 | acc = accuracy_score(y, y_permuted_predict) 72 | 73 | return nmi, acc --------------------------------------------------------------------------------
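A final compatibility note: best_map in unsupervised_evaluation.py depends on sklearn.utils.linear_assignment_, which has been removed from recent scikit-learn releases, and the precompute_distances and n_jobs arguments passed to KMeans have likewise been deprecated and dropped in newer versions. As a hedged illustration (not the shipped implementation; the name best_map_scipy is mine), the label-matching step can be reproduced with scipy.optimize.linear_sum_assignment, which solves the same assignment problem:

import numpy as np
from scipy.optimize import linear_sum_assignment


def best_map_scipy(l1, l2):
    # Permute the labels in l2 so that they agree with l1 as much as possible.
    l1, l2 = np.asarray(l1), np.asarray(l2)
    label1, label2 = np.unique(l1), np.unique(l2)
    n_class = max(len(label1), len(label2))
    # G[i, j] counts how often class label1[i] in l1 co-occurs with label2[j] in l2
    G = np.zeros((n_class, n_class))
    for i, a in enumerate(label1):
        for j, b in enumerate(label2):
            G[i, j] = np.count_nonzero((l1 == a) & (l2 == b))
    # maximizing the total overlap is the same as minimizing -G
    row_ind, col_ind = linear_sum_assignment(-G)
    new_l2 = np.zeros(l2.shape)
    for r, c in zip(row_ind, col_ind):
        if r < len(label1) and c < len(label2):
            new_l2[l2 == label2[c]] = label1[r]
    return new_l2.astype(int)

The NMI and accuracy computations in evaluation() are unaffected; only the assignment solver and the KMeans constructor arguments need updating on newer library versions.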