├── setup.cfg
├── infoselect
│   ├── __pycache__
│   │   ├── gmm.cpython-37.pyc
│   │   ├── mi.cpython-37.pyc
│   │   └── __init__.cpython-37.pyc
│   ├── gmm.py
│   ├── mi.py
│   └── __init__.py
├── setup.py
├── LICENSE.txt
├── README.md
└── .ipynb_checkpoints
    └── InfoSelect-checkpoint.ipynb
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/infoselect/__pycache__/gmm.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/gmm.cpython-37.pyc
--------------------------------------------------------------------------------
/infoselect/__pycache__/mi.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/mi.cpython-37.pyc
--------------------------------------------------------------------------------
/infoselect/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 |
2 | import setuptools
3 |
4 | with open("README.md", "r") as fh:
5 | long_description = fh.read()
6 |
7 | setuptools.setup(
8 | name="infoselect",
9 | version="1.0.1",
10 | author="Felipe Maia Polo & Felipe Leno da Silva",
11 | author_email="felipemaiapolo@gmail.com, f.leno@usp.br",
12 | description="Mutual Information Based Feature Selection in Python.",
13 | long_description=long_description,
14 | long_description_content_type="text/markdown",
15 | url='https://github.com/felipemaiapolo/infoselect',
16 | packages=setuptools.find_packages(),
17 | classifiers=[
18 | "Programming Language :: Python :: 3",
19 | "License :: OSI Approved :: MIT License",
20 | "Operating System :: OS Independent",
21 | ],
22 | install_requires=['scipy','numpy','pandas','scikit-learn','matplotlib'],
23 | )
24 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | MIT License
3 |
4 | Copyright (c) 2019 info_selection
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/infoselect/gmm.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import pandas as pd
4 | import random
5 | import copy
6 | import matplotlib.pyplot as plt
7 | from sklearn import mixture
8 | from sklearn.model_selection import train_test_split
9 | from scipy.stats import multivariate_normal
10 |
11 | def check_array(X, name="X", dim=2):
12 | if not isinstance(X, np.ndarray) or len(X.shape)!=dim:
13 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim))
14 |
15 | def gmm_scores(X_train, X_val, k=3, covariance_type='full', reg_covar=1e-06, random_state=42):
16 |
17 | '''
18 | This function trains a GMM and evaluates it on a holdout set using the mean log-likelihood of the samples.
19 |
20 | Inputs: - X_train: training set;
21 | - X_val: holdout set used for validation;
22 | - k: number of GMM components;
23 | - covariance_type: covariance type (scikit-learn implementation);
24 | - reg_covar: covariance regularization (scikit-learn implementation);
25 | - random_state: seed.
26 |
27 | Output: - mean log-likelihood of the holdout samples.
28 | '''
29 |
30 | assert covariance_type in ['full','tied','diag','spherical']
31 |
32 | clf = mixture.GaussianMixture(n_components=k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state, max_iter=1000)
33 | clf.fit(X_train)
34 | return clf.score(X_val)
35 |
36 | def edit_covariances(gmm, covariance_type, d):
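37 | # Expand the covariances stored by scikit-learn into an array of full (d x d)
38 | # matrices, one per component, so that the feature slicing in mi.py works
39 | # uniformly regardless of the covariance_type used at fit time.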
37 |
38 | n_comp = gmm.n_components
39 |
40 | if covariance_type=='spherical':
41 | covs = np.array([(var*np.eye(d)).tolist() for var in gmm.covariances_])
42 |
43 | elif covariance_type=='diag':
44 | covs = np.array([np.diag(var).tolist() for var in gmm.covariances_])
45 |
46 | elif covariance_type=='tied':
47 | covs = np.array(n_comp*[gmm.covariances_.tolist()])
48 |
49 | else:
50 | covs = gmm.covariances_
51 |
52 | gmm.covariances_ = covs
53 |
54 | return gmm
55 |
56 | def get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type='full', random_state=42):
57 |
58 | '''
59 | This function selects the best number of GMM components on a holdout set, using the mean log-likelihood of its samples, and then trains the final GMM(s) on the full data.
60 |
61 | Inputs: - X: numpy array of features;
62 | - y: numpy array of labels;
63 | - y_cat: if we should consider y as categorical;
64 | - num_comps: numbers of GMM components to be tested;
65 | - val_size: fraction of the data held out to validate the numbers of GMM components;
66 | - reg_covar: covariance regularization (scikit-learn implementation);
67 | - covariance_type: covariance type (scikit-learn implementation);
68 | - random_state: seed.
69 |
70 | Output: - a GMM (y continuous) or a dictionary of GMMs, one per class (y categorical).
71 | '''
72 |
73 | #Checking input format
74 | check_array(X, name="X", dim=2)
75 | check_array(y, name="y", dim=1)
76 | assert covariance_type in ['full','tied','diag','spherical']
77 |
78 | #Y categorical/or with few values
79 | if y_cat:
80 | classes=list(set(y))
81 | gmm={}
82 |
83 | for c in classes:
84 | #Selecting number of components
85 | X_gmm_train, X_gmm_val = train_test_split(X[y==c], test_size=val_size, random_state=random_state)
86 | scores=np.array([gmm_scores(X_gmm_train, X_gmm_val, k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) for k in num_comps])
87 | k_star=num_comps[np.argmax(scores)]
88 |
89 | #Training GMMs
90 | gmm[c] = mixture.GaussianMixture(n_components=k_star, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state)
91 | gmm[c].fit(X[y==c])
92 | gmm[c] = edit_covariances(gmm[c], covariance_type, X.shape[1])
93 |
94 | return gmm #it is a dictionary of GMMs
95 |
96 | #Y continuous/or with many values
97 | else:
98 | #Selecting number of components
99 | X_gmm_train, X_gmm_val, y_gmm_train, y_gmm_val = train_test_split(X, y, test_size=val_size, random_state=random_state)
100 | Z_gmm_train=np.hstack((y_gmm_train.reshape((-1,1)), X_gmm_train))
101 | Z_gmm_val=np.hstack((y_gmm_val.reshape((-1,1)), X_gmm_val))
102 | scores=np.array([gmm_scores(X_train=Z_gmm_train, X_val=Z_gmm_val, k=k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) for k in num_comps])
103 | k_star=num_comps[np.argmax(scores)]
104 |
105 | #Training GMM
106 | Z = np.hstack((y.reshape((-1,1)),X))
107 | gmm = mixture.GaussianMixture(n_components=k_star, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state)
108 | gmm.fit(Z)
109 | gmm = edit_covariances(gmm, covariance_type, Z.shape[1])
110 |
111 | return gmm #it is a GMM
--------------------------------------------------------------------------------
/infoselect/mi.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 | import pandas as pd
4 | import random
5 | import copy
6 | import matplotlib.pyplot as plt
7 | from sklearn import mixture
8 | from sklearn.model_selection import train_test_split
9 | from scipy.stats import multivariate_normal
10 |
11 | def check_array(X, name="X", dim=2):
12 | if not isinstance(X, np.ndarray) or len(X.shape)!=dim:
13 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim))
14 |
15 | def MI_gmm_reg(X, y, gmm, feat, eps=10**-50):
16 |
17 | '''
18 | This function calculates the mutual information between y and X in cases where we assume y continuous/with many values.
19 |
20 | Inputs: - X: numpy array of features;
21 | - y: numpy array of labels;
22 | - gmm: GMM trained model;
23 | - feat: list of feature indexes;
24 | - eps: small value so we can avoid taking log of zero in some cases
25 |
26 | Output: - dictionary containing the estimate for the mutual information between y and X,
27 | and the standard deviation of measurements calculated from the samples.
28 | '''
29 |
30 | n, d = X.shape
31 | components=gmm.n_components
32 | Z=np.hstack((y.reshape((-1,1)),X))
33 | feat2=[0]+[f+1 for f in feat] #feat2 includes y in addition to X[:,feat]. PS: we add 1 because, when the GMM is trained, the first variable is always y
34 |
35 | ### Calculating log-likelihood with samples (x_i,y_i)
36 | like=np.zeros(n)
37 | for c in range(components):
38 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,feat2], gmm.means_[c][feat2], gmm.covariances_[c][feat2][:,feat2])
39 |
40 | log_like_xy=np.log(like + eps)
41 |
42 |
43 | ### Calculating log-likelihood with samples (x_i)
44 | like=np.zeros(n)
45 | for c in range(components):
46 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,feat2[1:]], gmm.means_[c][feat2[1:]], gmm.covariances_[c][feat2[1:]][:,feat2[1:]])
47 |
48 | log_like_x=np.log(like + eps)
49 |
50 |
51 | ### Calculating log-likelihood with samples (y_i)
52 | like=np.zeros(n)
53 | for c in range(components):
54 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,0], gmm.means_[c][0], gmm.covariances_[c][0][0])
55 |
56 | log_like_y=np.log(like + eps)
57 |
58 |
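59 | # Plug-in MI estimate: Î = (1/n) * Σ_i [log p̂(x_i,y_i) - log p̂(x_i) - log p̂(y_i)].
60 | # The standard deviation of the summands is also returned, so that callers
61 | # can form a standard error as std/sqrt(n) (as done in SelectVars.get_info).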
59 | #Output
60 | m=np.mean(log_like_xy-log_like_x-log_like_y)
61 | s=np.std(log_like_xy-log_like_x-log_like_y)
62 |
63 | return {'mi':m, 'std':s}
64 |
65 |
66 |
67 | def MI_gmm_class(X, y, gmm, feat, eps=10**-50):
68 |
69 | '''
70 | This function calculates the mutual information between y and X in cases where we assume y categorical/with few values.
71 |
72 | Inputs: - X: numpy array of features;
73 | - y: numpy array of labels;
74 | - gmm: dict. of GMM trained models;
75 | - feat: list of feature indexes;
76 | - eps: small value so we can avoid taking log of zero in some cases
77 |
78 | Output: - dictionary containing the estimate for the mutual information between y and X,
79 | and the standard deviation of measurements calculated from the samples.
80 | '''
81 |
82 |
83 | n,d=X.shape
84 | classes=list(set(y))
85 | p={}
86 |
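87 | # For categorical y, p(y) is estimated by the empirical class frequencies and
88 | # p(x|y=c) by the per-class GMMs; p(x) is then the mixture sum_c p(c) p(x|c).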
87 | ### Calculating log-likelihood with samples (y_i)
88 | like=np.zeros(n)
89 | for c in classes:
90 | p[c]=np.mean(y==c)
91 | like[y==c]=p[c]
92 |
93 | log_like_y=np.log(like + eps)
94 |
95 |
96 | ### Calculating log-likelihood with samples (x_i,y_i)
97 | like=np.zeros(n)
98 | for c in classes:
99 | #X|Y
100 | like_aux=np.zeros(n)
101 | for comp in range(gmm[c].n_components):
102 | like_aux[y==c]+=gmm[c].weights_[comp]*multivariate_normal.pdf(X[y==c][:,feat], gmm[c].means_[comp][feat], gmm[c].covariances_[comp][feat][:,feat])
103 |
104 | #(X,Y)
105 | like[y==c]=p[c]*like_aux[y==c]
106 | log_like_xy=np.log(like + eps)
107 |
108 |
109 | ### Calculating log-likelihood with samples (x_i)
110 | like=np.zeros(n)
111 | for c in classes:
112 | #X|Y
113 | like_aux=np.zeros(n)
114 | for comp in range(gmm[c].n_components):
115 | like_aux+=gmm[c].weights_[comp]*multivariate_normal.pdf(X[:,feat], gmm[c].means_[comp][feat], gmm[c].covariances_[comp][feat][:,feat])
116 |
117 | #Marginalization of (X,Y)
118 | like+=p[c]*like_aux
119 |
120 | log_like_x=np.log(like + eps)
121 |
122 |
123 | #Output
124 | m=np.mean(log_like_xy-log_like_x-log_like_y)
125 | s=np.std(log_like_xy-log_like_x-log_like_y)
126 |
127 | return {'mi':m, 'std':s}
128 |
129 |
130 |
131 | def MI(cand, posic, r, X, y, gmm, include_cand = True, eps=10**-50):
132 |
133 | '''
134 | This function is an intermediary between the main class and the two functions that calculate the
135 | mutual information. It decides which of the two to use and whether to take a forward or backward step.
136 |
137 | Inputs: - cand: position of the candidate variable to be chosen;
138 | - posic: list with positions of the selected variables so far;
139 | - r: round;
140 | - X: numpy array of features;
141 | - y: numpy array of labels;
142 | - gmm: model or dict. of GMM(s);
143 | - include_cand: include or remove the candidate variable (forward/backward);
144 | - eps: small value so we can avoid taking log of zero in some cases
145 |
146 | Output: - cand: position of the candidate variable to be chosen;
147 | - dic: dictionary containing the estimate for the mutual information between y and X,
148 | and the standard deviation of measurements calculated from the samples.
149 | '''
150 |
151 | n,d=X.shape
152 | aux = copy.deepcopy(posic)
153 | if include_cand:
154 | aux[r] = cand
155 | else:
156 | aux.remove(cand)
157 |
158 | if type(gmm)==dict:
159 | dic=MI_gmm_class(X, y, gmm, aux, eps)
160 | else:
161 | dic=MI_gmm_reg(X, y, gmm, aux, eps)
162 |
163 | return cand, dic
--------------------------------------------------------------------------------
/infoselect/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | name="infoselect"
3 | __version__ = "1.0.0"
4 |
5 | ###
6 | import math
7 | import numpy as np
8 | import pandas as pd
9 | import random
10 | import copy
11 | import matplotlib.pyplot as plt
12 | from sklearn import mixture
13 | from sklearn.model_selection import train_test_split
14 | from scipy.stats import multivariate_normal
15 | from .gmm import *
16 | from .mi import *
17 |
18 | def check_array(X, name="X", dim=2):
12 | if not isinstance(X, np.ndarray) or len(X.shape)!=dim:
20 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim))
21 |
22 | class SelectVars:
23 |
24 | '''
25 | This is the main class of the package.
26 | '''
27 |
28 | selection_mode = None
29 | gmm = None
30 | n = None
31 |
32 | def __init__(self, gmm, selection_mode = 'forward'):
33 | """
34 | Inputs: - gmm: model or dict. of GMM(s);
35 | - selection_mode: forward/backward algorithms.
36 | """
37 |
38 | if selection_mode not in ['forward', 'backward']:
39 | raise ValueError("Selection mode should be either 'forward' or 'backward'.")
40 |
41 | self.selection_mode = selection_mode
42 | self.gmm=gmm
43 |
44 | def fit(self, X, y, verbose=True, eps=0):
45 |
46 | '''
47 | This function orders the features according to their importance - from
48 | most important to least important (forward) or from least important to most important (backward).
49 |
50 | Inputs: - X: numpy array of features;
51 | - y: numpy array of labels;
52 | - verbose: whether to print progress;
53 | - eps: small value added inside the logs to avoid taking the log of zero.
54 | '''
55 |
56 | #Checking input format
57 | check_array(X, name="X", dim=2)
58 | check_array(y, name="y", dim=1)
59 |
60 | '''Creating some important objects'''
61 | self.n, self.d = X.shape
62 | include_var = self.selection_mode == 'forward' #True if include or False if remove
63 |
64 | self.delta_list = [] # list with history of % changes of mutual info when adding/removing the best/worst variables
65 | self.mi_list = [] # list with mutual info history by adding/removing the best/worst variables
66 | self.stds_list = [] # list with stds history and that we will use to calculate the standard error of MIs
67 | self.feat_hist=[] # history of variables at each round
68 | lista = list(range(self.d)) # list with indexes of all variables
69 |
70 | '''Defining number of iterations and list of features we use in each iteration'''
71 | if verbose: print("Let's start...\n")
72 |
73 | #The 'posic' list starts empty if you include
74 | if include_var:
75 | posic = [] # list with the positions of the variables selected so far
76 | self.feat_hist.append(copy.deepcopy(posic))
77 | rounds = range(self.d)
78 |
79 | self.mi_list.append(0)
80 | self.stds_list.append(0)
81 | self.delta_list.append(0)
82 |
83 | #The 'posic' list starts full if we take it out
84 | else:
85 | posic = copy.deepcopy(lista)
86 | self.feat_hist.append(copy.deepcopy(posic))
87 | rounds = range(self.d-1)
88 |
89 | if type(self.gmm)==dict:
90 | dic=MI_gmm_class(X, y, self.gmm, posic, eps)
91 | else:
92 | dic=MI_gmm_reg(X, y, self.gmm, posic, eps)
93 |
94 | self.mi_list.append(dic['mi'])
95 | self.stds_list.append(dic['std'])
96 | self.delta_list.append(0)
97 |
98 | if verbose: print("Round = {:3d} | Î = {:5.2f} | Δ%Î = {:5.2f} | Features={}".format(0, self.mi_list[-1], 0, posic))
99 |
100 |
101 | '''Calculating the Mutual Information (forward or backward fashion)'''
102 | for r in rounds: # "r" of rounds/repetitions
103 |
104 | if include_var:
105 | posic.append(None)
106 |
107 | #Compute the MI between y and X[:,(posic, cand)] -> cand: candidate variable being evaluated
108 | outputs = [MI(cand, posic, r, X, y, self.gmm, include_var, eps) for cand in lista]
109 |
110 | #Choosing the variable that yields the largest gain
111 | MI_best=-math.inf
112 |
113 | for out in outputs:
114 |
115 | cand, dic = out
116 | MI_current = dic['mi']
117 |
118 | if MI_current > MI_best:
119 | MI_best = MI_current
120 | std_best = dic['std']
121 | best_index = cand
122 |
123 | #Δ%Î
124 | if r==0 and include_var:
125 | self.delta_list.append(0)
126 | else:
127 | self.delta_list.append(MI_best/self.mi_list[-1]-1)
128 |
129 | #Updating variable list
130 | lista.remove(best_index)
131 | if include_var:
132 | posic[r] = best_index
133 | else:
134 | posic.remove(best_index)
135 |
136 | #Updating lists
137 | self.mi_list.append(MI_best)
138 | self.stds_list.append(std_best)
139 | self.feat_hist.append(copy.deepcopy(posic))
140 |
141 | #Verbose
142 | if verbose: print("Round = {:3d} | Î = {:5.2f} | Δ%Î = {:5.2f} | Features={}".format(r+1, MI_best, self.delta_list[-1], posic))
143 |
144 |
145 |
146 | def get_info(self):
147 |
148 | '''
149 | This function creates and outputs a Pandas DataFrame with the history of feature importance.
150 | '''
151 |
152 | dic={'rounds': range(0,len(self.mi_list)),
153 | 'mi_mean': self.mi_list,
154 | 'mi_error': [s/np.sqrt(self.n) for s in self.stds_list],
155 | 'delta': self.delta_list,
156 | 'features':self.feat_hist,
157 | 'num_feat':[len(l) for l in self.feat_hist]}
158 | return pd.DataFrame(dic).loc[:,['rounds','mi_mean','mi_error','delta','num_feat','features']]
159 |
160 | def plot_delta(self):
161 |
162 | '''
163 | This function plots the history of percentage changes in the mutual information.
164 | '''
165 |
166 | l=self.delta_list
167 | plt.plot(list(range(1,len(l))),l[1:])
168 | plt.axhline(y=0, color='r', linestyle='--')
169 | plt.xlabel("Rounds")
170 | plt.ylabel("Δ% Mutual Information")
171 | plt.show()
172 |
173 | def plot_mi(self):
174 |
175 | '''
176 | This function plots the history of the mutual information.
177 | '''
178 |
179 | l,s=self.mi_list, self.stds_list
180 | plt.errorbar(list(range(len(l))), l, yerr=np.array(s)/np.sqrt(self.n)) #
181 | plt.axhline(y=0, color='g', linestyle='--')
182 | plt.xlabel("Rounds")
183 | plt.ylabel("Mutual Information")
184 | plt.show()
185 |
186 | def transform(self, X, rd):
187 |
188 | '''
189 | This function selects the columns of X kept at round 'rd'. Examine the history DataFrame and the plots before choosing 'rd'.
190 | '''
191 |
192 | return X[:,self.get_info().loc[rd,'features']]
193 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # ***InfoSelect*** - Mutual Information Based Feature Selection in Python
3 |
4 |
5 |
6 |
7 | ### *Felipe Maia Polo (felipemaiapolo), Felipe Leno da Silva (f-leno)*
8 |
9 | [PyPI](https://pypi.python.org/pypi/infoselect)
10 | [Open in Colab](https://colab.research.google.com/github/felipemaiapolo/infoselect/blob/master/InfoSelect.ipynb)
11 |
12 | In case you have any questions or suggestions, please get in touch by sending us an e-mail at *felipemaiapolo@gmail.com*.
13 |
14 | --------------
15 | ## Contents
16 | 1. [ Introduction ](#1)
17 | 2. [ Installing *InfoSelect* ](#2)
18 | 3. [ Main functionalities of *InfoSelect* ](#3)
19 | 4. [ Examples of *InfoSelect* use ](#4)
20 | 5. [ References ](#5)
21 |
22 | --------------
23 |
24 |
25 | ## 1\. Introduction
26 |
27 | In this package we implement the ideas proposed by [1, 2] in order to perform variable/feature selection prior to regression and classification tasks, using Gaussian Mixture Models (GMMs) to estimate the Mutual Information between labels and features. This is an efficient and well-performing alternative that was used in a recent work [3] by one of us. If you use our package in your research, you can cite it as follows:
28 |
29 | @article{maia2022effective,
30 | title={Effective sample size, dimensionality, and generalization in covariate shift adaptation},
31 | author={Maia Polo, Felipe and Vicente, Renato},
32 | journal={Neural Computing and Applications},
33 | pages={1--13},
34 | year={2022},
35 | publisher={Springer}
36 | }
37 |
38 |
39 | @misc{polo2020infoselect,
40 | title={InfoSelect - Mutual Information Based Feature Selection in Python},
41 | author={Polo, Felipe Maia and Da Silva, Felipe Leno},
42 | journal={GitHub: github.com/felipemaiapolo/infoselect},
43 | year={2020}
44 | }
45 |
46 |
47 |
48 |
49 | --------------
50 |
51 |
52 | ## 2\. Installing *InfoSelect*
53 |
54 | You can install the package from
55 | [PyPI](https://pypi.org/project/infoselect/)
56 |
57 | ```sh
58 | $ pip install infoselect
59 | ```
60 |
61 | Also, you can install the package from
62 | [GitHub](https://github.com/felipemaiapolo/infoselect).
63 |
64 | ```sh
65 | $ pip install git+https://github.com/felipemaiapolo/infoselect.git#egg=infoselect
66 | ```
67 |
68 | --------------------
69 |
70 |
71 | ## 3\. Main functionalities of *InfoSelect*
72 |
73 |
74 | ### 3.1\. Main Class `SelectVars`
75 |
76 | This class is used to order the features/variables according to their importance and to make the selection itself. Next we detail its methods:
77 |
78 | 1. `__init__(self, gmm, selection_mode = 'forward')`
79 | - **gmm**:
80 | - If $Y$ is *non*-categorical: a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on (y,X) - y should always be in the first column;
81 | - If $Y$ is categorical: a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`;
82 | - Please use the auxiliary function `get_gmm` below, especially if you want to use a `covariance_type` other than 'full'.
83 | - **selection_mode**: `forward`/`backward` algorithms.
84 | - `forward` selection: we start with an empty set of features and first select the feature that has the largest estimated mutual information with the target variable. At each subsequent step, we select the feature that marginally maximizes the estimated mutual information between the target and all the features chosen so far. We stop when all the features have been selected/ordered;
85 | - `backward` elimination: we start with the full set of features and, at each step, eliminate the feature whose removal maximizes the estimated mutual information between the target and the remaining features. We stop when there are no more features to eliminate;
86 |
87 | 2. `fit(self, X, y, verbose=True, eps=0)`
88 | - **X**: numpy array of features;
89 | - **y**: numpy array of labels;
90 | - **verbose**: whether to print progress;
91 | - **eps**: small value so we can avoid taking log of zero in some cases.
92 |
93 | 3. `get_info(self)`:
94 | - This function creates and outputs a Pandas DataFrame with the history of feature selection/elimination. The `mi_mean` column gives the estimated Mutual Information, while `mi_error` gives the standard error of that estimate. The `delta` column gives the percentage information loss/gain in that round, relative to the previous round;
95 |
96 | 4. `plot_delta(self)`:
97 | - This function plots the history of percentage changes in the mutual information.
98 |
99 | 5. `plot_mi(self)`:
100 | - This function plots the history of the mutual information.
101 |
102 | 6. `transform(self, X, rd)`:
103 | - This function takes **X** and transforms it into **X_new**, keeping only the features of Round `rd` (a minimal end-to-end sketch follows this list);
104 |
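105 | Below is a minimal, illustrative sketch of the typical workflow, using the auxiliary function `get_gmm` described in the next subsection. The data is a random placeholder, the component grid is kept small for speed, and the chosen round `rd=2` is arbitrary; Section 4 walks through a complete, worked example:
106 |
107 | ```python
108 | import numpy as np
109 | import infoselect as inf
110 |
111 | X = np.random.uniform(0, 1, (1000, 7))          # placeholder features
112 | y = X[:, 3] + np.random.normal(0, 1, 1000)      # placeholder target
113 |
114 | gmm = inf.get_gmm(X, y, num_comps=[2, 5, 10])   # fit/validate a GMM on (y, X)
115 | select = inf.SelectVars(gmm, selection_mode='forward')
116 | select.fit(X, y, verbose=False)                 # order the features
117 |
118 | print(select.get_info())                        # inspect the MI history
119 | X_new = select.transform(X, rd=2)               # keep the features of round 2
120 | ```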
105 |
106 | ### 3.2\. Auxiliary Function `get_gmm`
107 |
108 | 1. `get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type="full", random_state=42)`:
109 |
110 | - Firstly, this function validates the number of GMM components for each model it will train, using the mean log-likelihood of the samples in a holdout set. If Y is non-categorical, it returns a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on (y,X) (in this order). On the other hand, if Y is categorical, it returns a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`.
111 |
112 | - **X**: numpy array of features;
113 | - **y**: numpy array of labels;
114 | - **y_cat**: if we should consider Y as categorical;
115 | - **num_comps**: numbers of GMM components to be validated;
116 | - **val_size**: fraction of the data held out to validate the numbers of GMM components;
117 | - **reg_covar**: non-negative regularization added to the diagonal of the covariance matrices, ensuring they are non-singular;
118 | - **covariance_type**: one of the following options: 'full', 'tied', 'diag', 'spherical'. See [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html). Thanks to Pritha Gupta for her suggestion on this point.
119 | - **random_state**: seed.
120 |
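121 | As an illustration, here is a hedged sketch of fitting class-conditional GMMs with diagonal covariances (the data is a random placeholder, so the estimated mutual information will be close to zero):
122 |
123 | ```python
124 | import numpy as np
125 | import infoselect as inf
126 |
127 | X = np.random.uniform(0, 1, (1000, 5))   # placeholder features
128 | y = np.random.randint(0, 3, 1000)        # placeholder labels in {0, 1, 2}
129 |
130 | # one GMM per class, validated over a small grid of component counts
131 | gmms = inf.get_gmm(X, y, y_cat=True, num_comps=[2, 5, 10], covariance_type='diag')
132 | select = inf.SelectVars(gmms, selection_mode='forward')
133 | select.fit(X, y, verbose=False)
134 | ```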
121 | --------------------
122 |
123 |
124 | ## 4\. Examples of *InfoSelect* use
125 |
126 | Loading Packages:
127 |
128 |
129 | ```python
130 | import infoselect as inf
131 | import numpy as np
132 | import pandas as pd
133 | import matplotlib.pyplot as plt
134 | ```
135 |
136 |
137 | ### 4.1\. Dataset
138 |
139 | We generate a dataset $\mathcal{D}=\{(X_{0,i},...,X_{6,i},Y_i)\}_{i=1}^{n}$ similar to the one described [here](https://www.cs.toronto.edu/~delve/data/add10/desc.html), in which $Y_i$ is given by
140 |
141 | $$
142 | Y_i = 10 \sin(\pi X_{0,i} X_{1,i}) + 20 (X_{2,i} - 0.5)^2 + 10 X_{3,i} + 5 X_{4,i} + \varepsilon_i
143 | $$
144 |
145 | where $X_{0,i},...,X_{6,i} \overset{iid}{\sim} U[0,1]$ and $\varepsilon_i \sim N(0,1)$ is independent of all the other random variables, for all $i\in [n]$. Note that our target variable does not depend on the last two features. In the following, we set `n=10000`:
146 |
147 |
148 | ```python
149 | def f(X,e): return 10*np.sin(np.pi*X[:,0]*X[:,1]) + 20*(X[:,2]-.5)**2 + 10*X[:,3] + 5*X[:,4] + e
150 |
151 | n=10000
152 | d=7
151 |
152 | X = np.random.uniform(0,1,d*n).reshape((n,d))
153 | e = np.random.normal(0,1,n)
154 | y = f(X,e)
155 |
156 | X.shape, y.shape
157 | ```
158 |
159 |
160 |
161 |
162 | ((10000, 7), (10000,))
163 |
164 |
165 |
166 | ### 4.2\. Selecting Features for a Regression Task
167 |
168 | Training (and validating) GMM:
169 |
170 |
171 | ```python
172 | %%time
173 |
174 | gmm = inf.get_gmm(X, y)
175 | ```
176 |
177 | Wall time: 8.43 s
178 |
179 |
180 | Ordering features by their importances using the *Backward Elimination* algorithm:
181 |
182 |
183 | ```python
184 | select = inf.SelectVars(gmm, selection_mode = 'backward')
185 | select.fit(X, y, verbose=True)
186 | ```
187 |
188 | Let's start...
189 |
190 | Round = 0 | Î = 1.36 | Δ%Î = 0.00 | Features=[0, 1, 2, 3, 4, 5, 6]
191 | Round = 1 | Î = 1.36 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4, 5]
192 | Round = 2 | Î = 1.36 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4]
193 | Round = 3 | Î = 0.97 | Δ%Î = -0.29 | Features=[0, 1, 3, 4]
194 | Round = 4 | Î = 0.73 | Δ%Î = -0.24 | Features=[0, 1, 3]
195 | Round = 5 | Î = 0.40 | Δ%Î = -0.46 | Features=[0, 3]
196 | Round = 6 | Î = 0.21 | Δ%Î = -0.48 | Features=[3]
197 |
198 |
199 | Checking history:
200 |
201 |
202 | ```python
203 | select.get_info()
204 | ```
205 |
206 | | rounds | mi_mean  | mi_error | delta     | num_feat | features              |
207 | |--------|----------|----------|-----------|----------|-----------------------|
208 | | 0      | 1.358832 | 0.008771 | 0.000000  | 7        | [0, 1, 2, 3, 4, 5, 6] |
209 | | 1      | 1.358090 | 0.008757 | -0.000546 | 6        | [0, 1, 2, 3, 4, 5]    |
210 | | 2      | 1.356661 | 0.008753 | -0.001053 | 5        | [0, 1, 2, 3, 4]       |
211 | | 3      | 0.969897 | 0.007843 | -0.285085 | 4        | [0, 1, 3, 4]          |
212 | | 4      | 0.734578 | 0.007396 | -0.242622 | 3        | [0, 1, 3]             |
213 | | 5      | 0.400070 | 0.007192 | -0.455375 | 2        | [0, 3]                |
214 | | 6      | 0.209808 | 0.005429 | -0.475571 | 1        | [3]                   |
215 |
292 | It is possible to see that the estimated mutual information is essentially unchanged until Round 2, and then drops by about 29% in Round 3.
293 |
294 | Since there is a 'break' after Round 2, we should choose to stop the algorithm at that round. This will be clear in the Mutual Information history plot that follows:
295 |
296 |
297 | ```python
298 | select.plot_mi()
299 | ```
300 |
301 |
302 |
303 |
304 | Plotting the percentage variations of the mutual information between rounds:
305 |
306 |
307 | ```python
308 | select.plot_delta()
309 | ```
310 |
311 |
312 | 
313 |
314 |
315 | Making the selection, choosing to stop at Round 2:
316 |
317 |
318 | ```python
319 | X_new = select.transform(X, rd=2)
320 |
321 | X_new.shape
322 | ```
323 |
324 |
325 |
326 |
327 | (10000, 5)
328 |
329 |
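330 | If you prefer to pick the stopping round programmatically, one possibility - an illustrative heuristic, not part of the package - is to stop right before the first round whose relative change in mutual information exceeds a tolerance:
331 |
332 | ```python
333 | info = select.get_info()
334 |
335 | # stop right before the first round whose |Δ%Î| reaches 5% (the tolerance is arbitrary)
336 | big = info['delta'].abs() >= 0.05
337 | rd = int(info.loc[big, 'rounds'].min() - 1) if big.any() else int(info['rounds'].max())
338 |
339 | X_new = select.transform(X, rd=rd)   # here rd == 2, matching the manual choice above
340 | ```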
330 |
331 | ### 4.3\. Selecting Features for a Classification Task
332 |
333 | Categorizing Y:
334 |
335 |
336 | ```python
337 | ind0 = (y
403 | ```
404 |
405 | | rounds | mi_mean  | mi_error | delta     | num_feat | features              |
406 | |--------|----------|----------|-----------|----------|-----------------------|
407 | | 0      | 0.000000 | 0.000000 | 0.000000  | 0        | []                    |
408 | | 1      | 0.139542 | 0.005217 | 0.000000  | 1        | [3]                   |
409 | | 2      | 0.280835 | 0.006377 | 1.012542  | 2        | [3, 0]                |
410 | | 3      | 0.503872 | 0.006499 | 0.794196  | 3        | [3, 0, 1]             |
411 | | 4      | 0.617048 | 0.006322 | 0.224612  | 4        | [3, 0, 1, 4]          |
412 | | 5      | 0.745933 | 0.005135 | 0.208874  | 5        | [3, 0, 1, 4, 2]       |
413 | | 6      | 0.745549 | 0.005202 | -0.000515 | 6        | [3, 0, 1, 4, 2, 5]    |
414 | | 7      | 0.740968 | 0.005457 | -0.006144 | 7        | [3, 0, 1, 4, 2, 5, 6] |
415 |
494 | It is possible to see that the estimated mutual information stops increasing from Round 5 onwards.
495 |
496 | Since there is a 'break' at Round 5, we should choose to stop the algorithm at that round. This will be clear in the Mutual Information history plot that follows:
497 |
498 |
499 | ```python
500 | select.plot_mi()
501 | ```
502 |
503 |
504 | 
505 |
506 |
507 | Plotting the percentage variations of the mutual information between rounds:
508 |
509 |
510 | ```python
511 | select.plot_delta()
512 | ```
513 |
514 | 
515 |
516 |
517 | Making the selection, choosing to stop at Round 5:
518 |
519 |
520 | ```python
521 | X_new = select.transform(X, rd=5)
522 |
523 | X_new.shape
524 | ```
525 |
526 |
527 |
528 |
529 | (10000, 5)
530 |
531 | --------------
532 |
533 |
534 | ## 5\. References
535 |
536 | [1] Eirola, E., Lendasse, A., & Karhunen, J. (2014, July). Variable selection for regression problems using Gaussian mixture models to estimate mutual information. In 2014 International Joint Conference on Neural Networks (IJCNN) (pp. 1606-1613). IEEE.
537 |
538 | [2] Lan, T., Erdogmus, D., Ozertem, U., & Huang, Y. (2006, July). Estimating mutual information using gaussian mixture model for feature ranking and selection. In The 2006 IEEE International Joint Conference on Neural Network Proceedings (pp. 5034-5039). IEEE.
539 |
540 | [3] Maia Polo, F., & Vicente, R. (2022). Effective sample size, dimensionality, and generalization in covariate shift adaptation. Neural Computing and Applications, 1-13.
541 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/InfoSelect-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "view-in-github"
8 | },
9 | "source": [
10 | ""
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {
16 | "id": "veqo_rjtmTKv"
17 | },
18 | "source": [
19 | "# ***InfoSelect*** - Mutual Information Based Feature Selection in Python\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "\n",
24 | "### *Felipe Maia Polo (felipemaiapolo), Felipe Leno da Silva (f-leno)*\n"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "id": "08mvCZGw1pvk"
31 | },
32 | "source": [
33 | "## Contents\n",
34 | "1. [ Introduction ](#1)\n",
35 | "2. [ Installing *InfoSelect* ](#2)\n",
36 | "3. [ Main functionalities of *InfoSelect* ](#3)\n",
37 | "4. [ Examples of *InfoSelect* use ](#4)\n",
38 | "5. [ References ](#5)\n",
39 | "\n",
40 | "--------------\n",
41 | "\n",
42 | "\n",
43 | "## 1\\. Introduction \n",
44 | "\n",
45 | "In this package we implement the ideas proposed by [1, 2] in order to perform variable/feature selection prior to regression and classification tasks, using Gaussian Mixture Models (GMMs) to estimate the Mutual Information between labels and features. This is an efficient and well-performing alternative that was used in a recent work [3] by one of us.\n",
46 | "\n",
47 | "If you use our package in your research, you can cite it as follows:\n",
48 | "\n",
49 | " @article{maia2022effective,\n",
50 | " title={Effective sample size, dimensionality, and generalization in covariate shift adaptation},\n",
51 | " author={Maia Polo, Felipe and Vicente, Renato},\n",
52 | " journal={Neural Computing and Applications},\n",
53 | " pages={1--13},\n",
54 | " year={2022},\n",
55 | " publisher={Springer}\n",
56 | " }\n",
57 | "\n",
58 | "\n",
59 | " @misc{polo2020infoselect,\n",
60 | " title={InfoSelect - Mutual Information Based Feature Selection in Python},\n",
61 | " author={Polo, Felipe Maia and Da Silva, Felipe Leno},\n",
62 | " journal={GitHub: github.com/felipemaiapolo/infoselect},\n",
63 | " year={2020}\n",
64 | " }\n",
65 | "\n",
66 | "\n",
67 | "--------------\n",
68 | "\n",
69 | "\n",
70 | "## 2\\. Installing *InfoSelect* \n",
71 | "\n",
72 | "You can install the package from\n",
73 | "[PyPI](https://pypi.org/project/infoselect/)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 1,
79 | "metadata": {
80 | "colab": {
81 | "base_uri": "https://localhost:8080/"
82 | },
83 | "id": "MFT2CLZomTK1",
84 | "outputId": "876f4022-a7e9-492b-cbc0-1b50fee497d0"
85 | },
86 | "outputs": [
87 | {
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "Requirement already satisfied: infoselect in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (1.0.2)\n",
92 | "Requirement already satisfied: sklearn in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (0.0)\n",
93 | "Requirement already satisfied: numpy in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.21.6)\n",
94 | "Requirement already satisfied: pandas in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.3.5)\n",
95 | "Requirement already satisfied: scipy in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.7.3)\n",
96 | "Requirement already satisfied: matplotlib in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (3.5.1)\n",
97 | "Requirement already satisfied: packaging>=20.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (21.3)\n",
98 | "Requirement already satisfied: cycler>=0.10 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (0.11.0)\n",
99 | "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (2.8.2)\n",
100 | "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (3.0.4)\n",
101 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (4.33.3)\n",
102 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (1.4.2)\n",
103 | "Requirement already satisfied: pillow>=6.2.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (9.1.0)\n",
104 | "Requirement already satisfied: typing-extensions in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib->infoselect) (4.1.1)\n",
105 | "Requirement already satisfied: six>=1.5 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib->infoselect) (1.16.0)\n",
106 | "Requirement already satisfied: pytz>=2017.3 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from pandas->infoselect) (2022.1)\n",
107 | "Requirement already satisfied: scikit-learn in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from sklearn->infoselect) (1.0.2)\n",
108 | "Requirement already satisfied: joblib>=0.11 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from scikit-learn->sklearn->infoselect) (1.1.0)\n",
109 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from scikit-learn->sklearn->infoselect) (3.1.0)\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "!pip install infoselect"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {
120 | "id": "R0IgxOvs2VLp"
121 | },
122 | "source": [
123 | "\n",
124 | "## 3\\. Main functionalities of *InfoSelect* \n",
125 | "\n",
126 | "\n",
127 | "### 3.1\\. Main Class `SelectVars`\n",
128 | "\n",
129 | "This class is used to order features/variables according to their importance and making the selection itself. Next we detail its methods:\n",
130 | "\n",
131 | "1. `__init__(self, gmm, selection_mode = 'forward')`\n",
132 | " - **gmm**: \n",
133 | "            - If y is *non*-categorical: a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on (y,X) - y should always be in the first column;\n",
134 | "            - If y is categorical: a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`;\n",
135 | "            - Please use the auxiliary function `get_gmm` below, especially if you want to use a `covariance_type` other than 'full'.\n",
136 | " - **selection_mode**: `forward`/`backward` algorithms.\n",
137 | "    - `forward` selection: we start with an empty set of features and first select the feature that has the largest estimated mutual information with the target variable. At each subsequent step, we select the feature that marginally maximizes the estimated mutual information between the target and all the features chosen so far. We stop when all the features have been selected/ordered;\n",
138 | "    - `backward` elimination: we start with the full set of features and, at each step, eliminate the feature whose removal maximizes the estimated mutual information between the target and the remaining features. We stop when there are no more features to eliminate;\n",
139 | "\n",
140 | "2. `fit(self, X, y, verbose=True, eps=0)`\n",
141 | " - **X**: numpy array of features; \n",
142 | " - **y**: numpy array of labels;\n",
143 | "    - **verbose**: whether to print progress;\n",
144 | "    - **eps**: small value so we can avoid taking the log of zero in some cases.\n",
145 | "\n",
146 | "3. `get_info(self)`: \n",
147 | "    - This function creates and outputs a Pandas DataFrame with the history of feature selection/elimination. The `mi_mean` column gives the estimated Mutual Information, while `mi_error` gives the standard error of that estimate. The `delta` column gives the percentage information loss/gain in that round, relative to the previous round;\n",
148 | " \n",
149 | "4. `plot_delta(self)`: \n",
150 | "    - This function plots the history of percentage changes in the mutual information.\n",
151 | " \n",
152 | "5. `plot_mi(self)`: \n",
153 | " - This function plots the history of the mutual information.\n",
154 | " \n",
155 | "6. `transform(self, X, rd)`: \n",
156 | " - This function takes **X** and transforms it in **X_new**, maintaining the features of Round `rd`; \n",
157 | " \n",
158 | "\n",
159 | "### 3.2\\. Auxiliary Function `get_gmm`\n",
160 | "\n",
161 | "1. `get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type=\"full\", random_state=42)`: \n",
162 | "\n",
163 | "    - Firstly, this function validates the number of GMM components for each model it will train, using the mean log-likelihood of the samples in a holdout set. If Y is non-categorical, it returns a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on (y,X) (in this order). On the other hand, if Y is categorical, it returns a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted on X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`.\n",
164 | "\n",
165 | " - **X**: numpy array of features; \n",
166 | " - **y**: numpy array of labels;\n",
167 | " - **y_cat**: if we should consider Y as categorical;\n",
168 | " - **num_comps**: numbers of GMM components to be validated;\n",
169 | " - **val_size**: size of holdout set used to validate the GMMs numbers of components;\n",
170 | " - **reg_covar**: non-negative regularization added to the diagonal of covariance. Ensures the covariance matrices are non-singular.\n",
171 | " - **covariance_type**: one of the following options:'full','tied','diag','spherical'. See [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html)\n",
172 | " - **random_state**: seed.\n",
173 | "\n",
174 | "--------------------\n",
175 | "\n",
176 | "\n",
177 | "## 4\\. Examples of *InfoSelect* use\n",
178 | "\n",
179 | "Loading Packages:"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 2,
185 | "metadata": {
186 | "id": "EdhDRC3SmTLS"
187 | },
188 | "outputs": [],
189 | "source": [
190 | "import infoselect as inf\n",
191 | "import numpy as np \n",
192 | "import pandas as pd\n",
193 | "import matplotlib.pyplot as plt"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {
199 | "id": "YdeqfsuymTLX"
200 | },
201 | "source": [
202 | "### 4.1\\. Dataset\n",
203 | "\n",
204 | "We generate a dataset $\mathcal{D}=\{(X_{0,i},...,X_{6,i},Y_i)\}_{i=1}^{n}$ similar to the one described [here](https://www.cs.toronto.edu/~delve/data/add10/desc.html), in which $Y_i$ is given by\n",
205 | "\n",
206 | " \n",
207 | "\\begin{align}\n",
208 | "Y_i &= f(X_{0,i},...,X_{6,i}) + \\epsilon_i \\\\[.5em]\n",
209 | "&=10\\cdot \\sin(\\pi X_{0,i} X_{1,i}) + 20 (X_{2,i}-0.5)^2 + 10 X_{3,i} + 5 X_{4,i} + \\epsilon_i\n",
210 | "\\end{align}\n",
211 | " \n",
212 | "\n",
213 | "where $X_{0,i},...,X_{6,i} \overset{iid}{\sim} U[0,1]$ and $\epsilon_i \sim N(0,1)$ is independent of all the other random variables, for all $i\in [n]$. In the following we set $n=20000$:"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 3,
219 | "metadata": {
220 | "id": "8Y4_HMDDmTLZ"
221 | },
222 | "outputs": [],
223 | "source": [
224 | "def f(X,e): return 10*np.sin(np.pi*X[:,0]*X[:,1]) + 20*(X[:,2]-.5)**2 + 10*X[:,3] + 5*X[:,4] + e"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 4,
230 | "metadata": {
231 | "colab": {
232 | "base_uri": "https://localhost:8080/"
233 | },
234 | "id": "isQqEnhDmTLl",
235 | "outputId": "cd966641-eadb-40cc-ec64-67cb54ae8f31"
236 | },
237 | "outputs": [
238 | {
239 | "data": {
240 | "text/plain": [
241 | "((20000, 7), (20000,))"
242 | ]
243 | },
244 | "execution_count": 4,
245 | "metadata": {},
246 | "output_type": "execute_result"
247 | }
248 | ],
249 | "source": [
250 | "n=20000\n",
251 | "d=7\n",
252 | "\n",
253 | "X = np.random.uniform(0,1,d*n).reshape((n,d))\n",
254 | "e = np.random.normal(0,1,n)\n",
255 | "y = f(X,e)\n",
256 | "\n",
257 | "X.shape, y.shape"
258 | ]
259 | },
260 | {
261 | "cell_type": "markdown",
262 | "metadata": {
263 | "id": "Sgl9V_enmTL2"
264 | },
265 | "source": [
266 | "### 4.2\\. Selecting Features for a Regression Task"
267 | ]
268 | },
269 | {
270 | "cell_type": "markdown",
271 | "metadata": {
272 | "id": "J-KknWzDmTL4"
273 | },
274 | "source": [
275 | "Training (and validating) GMM:"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 228,
281 | "metadata": {
282 | "colab": {
283 | "base_uri": "https://localhost:8080/"
284 | },
285 | "id": "KKg_aU1hmTL8",
286 | "outputId": "e0071c65-ee69-4627-b4f1-519111c87d5c"
287 | },
288 | "outputs": [
289 | {
290 | "ename": "TypeError",
291 | "evalue": "get_gmm() got an unexpected keyword argument 'covariance_type'",
292 | "output_type": "error",
293 | "traceback": [
294 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
295 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
296 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n",
297 | "\u001b[0;31mTypeError\u001b[0m: get_gmm() got an unexpected keyword argument 'covariance_type'"
298 | ]
299 | }
300 | ],
301 | "source": [
302 | "%%time\n",
303 | "\n",
304 | "gmm = inf.get_gmm(X, y)"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": null,
310 | "metadata": {
311 | "colab": {
312 | "base_uri": "https://localhost:8080/"
313 | },
314 | "id": "FPtgps_emTMF",
315 | "outputId": "7854aeca-190e-45cb-8665-8572f38ac2ac"
316 | },
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/plain": [
321 | "GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,\n",
322 | " means_init=None, n_components=20, n_init=1,\n",
323 | " precisions_init=None, random_state=42, reg_covar=1e-06,\n",
324 | " tol=0.001, verbose=0, verbose_interval=10, warm_start=False,\n",
325 | " weights_init=None)"
326 | ]
327 | },
328 | "execution_count": 6,
329 | "metadata": {
330 | "tags": []
331 | },
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "gmm"
337 | ]
338 | },
339 | {
340 | "cell_type": "markdown",
341 | "metadata": {
342 | "id": "K0gL-DJ-mTMO"
343 | },
344 | "source": [
345 | "Ordering features by their importances using the *Backward Elimination* algorithm:"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": null,
351 | "metadata": {
352 | "colab": {
353 | "base_uri": "https://localhost:8080/"
354 | },
355 | "id": "F7Gy-vgWmTMR",
356 | "outputId": "a4d2f504-ebf5-4b47-c647-5b61f6cdd09c"
357 | },
358 | "outputs": [
359 | {
360 | "name": "stdout",
361 | "output_type": "stream",
362 | "text": [
363 | "Let's start...\n",
364 | "\n",
365 | "Round = 0 | Î = 1.48 | Δ%Î = 0.00 | Features=[0, 1, 2, 3, 4, 5, 6]\n",
366 | "Round = 1 | Î = 1.48 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4, 5]\n",
367 | "Round = 2 | Î = 1.48 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4]\n",
368 | "Round = 3 | Î = 1.00 | Δ%Î = -0.32 | Features=[0, 1, 3, 4]\n",
369 | "Round = 4 | Î = 0.75 | Δ%Î = -0.25 | Features=[0, 1, 3]\n",
370 | "Round = 5 | Î = 0.39 | Δ%Î = -0.48 | Features=[1, 3]\n",
371 | "Round = 6 | Î = 0.21 | Δ%Î = -0.46 | Features=[3]\n"
372 | ]
373 | }
374 | ],
375 | "source": [
376 | "select = inf.SelectVars(gmm, selection_mode = 'backward')\n",
377 | "select.fit(X, y, verbose=True) "
378 | ]
379 | },
380 | {
381 | "cell_type": "markdown",
382 | "metadata": {
383 | "id": "px7utZtNmTMW"
384 | },
385 | "source": [
386 | "Checking history:"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {
393 | "colab": {
394 | "base_uri": "https://localhost:8080/",
395 | "height": 258
396 | },
397 | "id": "UgTyazIDmTMX",
398 | "outputId": "1ad808f5-36bd-4142-d114-af353fd90a9c"
399 | },
400 | "outputs": [
401 | {
402 | "data": {
403 | "text/html": [
404 | "