├── setup.cfg ├── infoselect ├── __pycache__ │ ├── gmm.cpython-37.pyc │ ├── mi.cpython-37.pyc │ └── __init__.cpython-37.pyc ├── gmm.py ├── mi.py └── __init__.py ├── setup.py ├── LICENSE.txt ├── README.md └── .ipynb_checkpoints └── InfoSelect-checkpoint.ipynb /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /infoselect/__pycache__/gmm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/gmm.cpython-37.pyc -------------------------------------------------------------------------------- /infoselect/__pycache__/mi.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/mi.cpython-37.pyc -------------------------------------------------------------------------------- /infoselect/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipemaiapolo/infoselect/HEAD/infoselect/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | import setuptools 3 | 4 | with open("README.md", "r") as fh: 5 | long_description = fh.read() 6 | 7 | setuptools.setup( 8 | name="infoselect", 9 | version="1.0.1", 10 | author="Felipe Maia Polo & Felipe Leno da Silva", 11 | author_email="felipemaiapolo@gmail.com, f.leno@usp.br", 12 | description="Mutual Information Based Feature Selection in Python.", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | 
url='https://github.com/felipemaiapolo/infoselect', 16 | packages=setuptools.find_packages(), 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | install_requires=['scipy','numpy','pandas','sklearn','matplotlib'], 23 | ) 24 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) [2019] [info_selection] 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /infoselect/gmm.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | import random 5 | import copy 6 | import matplotlib.pyplot as plt 7 | from sklearn import mixture 8 | from sklearn.model_selection import train_test_split 9 | from scipy.stats import multivariate_normal 10 | 11 | def check_array(X, name="X", dim=2): 12 | if not (type(X)==np.ndarray) or len(X.shape)!=dim: 13 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim)) 14 | 15 | def gmm_scores(X_train, X_val, k=3, covariance_type='full', reg_covar=1e-06, random_state=42): 16 | 17 | ''' 18 | This function trains a GMM and evaluate it in a holdout set using the mean log_likelihood of samples 19 | 20 | Inputs: - X_train: training set; 21 | - k: number of GMM components; 22 | - covariance_type: covariance type (scikit-learn implementation); 23 | 24 | - X_val: holdout set (used if criterion=="loglike"); 25 | - random_state: seed. 
26 | 27 | Output: - score 28 | ''' 29 | 30 | assert covariance_type in ['full','tied','diag','spherical'] 31 | 32 | clf = mixture.GaussianMixture(n_components=k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state, max_iter=1000) 33 | clf.fit(X_train) 34 | return clf.score(X_val) 35 | 36 | def edit_covariances(gmm, covariance_type, d): 37 | 38 | n_comp = gmm.n_components 39 | 40 | if covariance_type=='spherical': 41 | covs = np.array([(var*np.eye(d)).tolist() for var in gmm.covariances_]) 42 | 43 | elif covariance_type=='diag': 44 | covs = np.array([np.diag(var).tolist() for var in gmm.covariances_]) 45 | 46 | elif covariance_type=='tied': 47 | covs = np.array(n_comp*[gmm.covariances_.tolist()]) 48 | 49 | else: 50 | covs = gmm.covariances_ 51 | 52 | gmm.covariances_ = covs 53 | 54 | return gmm 55 | 56 | def get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type='full', random_state=42): 57 | 58 | ''' 59 | This function trains a GMM and evaluate it in a holdout set using the mean log_likelihood of samples 60 | 61 | Inputs: - X: numpy array of features; 62 | - y: numpy array of labels; 63 | - y_cat: if we should consider y as categorical; 64 | - num_comps: numbers of GMM components to be tested; 65 | - val_size: size of holdout set used to validate the GMMs numbers of components 66 | - reg_covar: covariance regularization (scikit-learn implementation); 67 | - covariance_type: covariance type (scikit-learn implementation); 68 | - random_state: seed. 
69 | 70 | Output: - GMM ou dictionary of GMMs 71 | ''' 72 | 73 | #Checking input format 74 | check_array(X, name="X", dim=2) 75 | check_array(y, name="y", dim=1) 76 | assert covariance_type in ['full','tied','diag','spherical'] 77 | 78 | #Y categorical/or with few values 79 | if y_cat: 80 | classes=list(set(y)) 81 | gmm={} 82 | 83 | for c in classes: 84 | #Selecting number of components 85 | X_gmm_train, X_gmm_val, _, _=train_test_split(X[y==c], X[y==c], test_size=val_size, random_state=random_state) 86 | scores=np.array([gmm_scores(X_gmm_train, X_gmm_val, k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) for k in num_comps]) 87 | k_star=num_comps[np.argmax(scores)] 88 | 89 | #Training GMMs 90 | gmm[c] = mixture.GaussianMixture(n_components=k_star, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) 91 | gmm[c].fit(X[y==c]) 92 | gmm[c] = edit_covariances(gmm[c], covariance_type, X.shape[1]) 93 | 94 | return gmm #it is a dictionary of GMMs 95 | 96 | #Y continuous/or with many values 97 | else: 98 | #Selecting number of components 99 | X_gmm_train, X_gmm_val, y_gmm_train, y_gmm_val = train_test_split(X, y, test_size=val_size, random_state=random_state) 100 | Z_gmm_train=np.hstack((y_gmm_train.reshape((-1,1)), X_gmm_train)) 101 | Z_gmm_val=np.hstack((y_gmm_val.reshape((-1,1)), X_gmm_val)) 102 | scores=np.array([gmm_scores(X_train=Z_gmm_train, X_val=Z_gmm_val, k=k, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) for k in num_comps]) 103 | k_star=num_comps[np.argmax(scores)] 104 | 105 | #Training GMM 106 | Z = np.hstack((y.reshape((-1,1)),X)) 107 | gmm = mixture.GaussianMixture(n_components=k_star, covariance_type=covariance_type, reg_covar=reg_covar, random_state=random_state) 108 | gmm.fit(Z) 109 | gmm = edit_covariances(gmm, covariance_type, Z.shape[1]) 110 | 111 | return gmm #it is a GMM -------------------------------------------------------------------------------- 
/infoselect/mi.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import pandas as pd 4 | import random 5 | import copy 6 | import matplotlib.pyplot as plt 7 | from sklearn import mixture 8 | from sklearn.model_selection import train_test_split 9 | from scipy.stats import multivariate_normal 10 | 11 | def check_array(X, name="X", dim=2): 12 | if not (type(X)==np.ndarray) or len(X.shape)!=dim: 13 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim)) 14 | 15 | def MI_gmm_reg(X, y, gmm, feat, eps=10**-50): 16 | 17 | ''' 18 | This function calculates the mutual information between y and X in cases where we assume y continuous/with many values. 19 | 20 | Inputs: - X: numpy array of features; 21 | - y: numpy array of labels; 22 | - gmm: GMM trained model; 23 | - feat: features indexes (feat); 24 | - eps: small value so we can avoid taking log of zero in some cases 25 | 26 | Output: - dictionary containing the estimate for the mutual information between y and X, 27 | and the standard deviation of measurements calculated from the samples. 28 | ''' 29 | 30 | n, d = X.shape 31 | components=gmm.n_components 32 | Z=np.hstack((y.reshape((-1,1)),X)) 33 | feat2=[0]+[f+1 for f in feat] #feat2 includes y in addition de X[:,feat]. 
PS: we sum 1 because when we train GMM, the first variable is always y 34 | 35 | ### Calculating log-likelihood with samples (x_i,y_i) 36 | like=np.zeros(n) 37 | for c in range(components): 38 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,feat2], gmm.means_[c][feat2], gmm.covariances_[c][feat2][:,feat2]) 39 | 40 | log_like_xy=np.log(like + eps) 41 | 42 | 43 | ### Calculating log-likelihood with samples (x_i) 44 | like=np.zeros(n) 45 | for c in range(components): 46 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,feat2[1:]], gmm.means_[c][feat2[1:]], gmm.covariances_[c][feat2[1:]][:,feat2[1:]]) 47 | 48 | log_like_x=np.log(like + eps) 49 | 50 | 51 | ### Calculating log-likelihood with samples (y_i) 52 | like=np.zeros(n) 53 | for c in range(components): 54 | like+=gmm.weights_[c]*multivariate_normal.pdf(Z[:,0], gmm.means_[c][0], gmm.covariances_[c][0][0]) 55 | 56 | log_like_y=np.log(like + eps) 57 | 58 | 59 | #Output 60 | m=np.mean(log_like_xy-log_like_x-log_like_y) 61 | s=np.std(log_like_xy-log_like_x-log_like_y) 62 | 63 | return {'mi':m, 'std':s} 64 | 65 | 66 | 67 | def MI_gmm_class(X, y, gmm, feat, eps=10**-50): 68 | 69 | ''' 70 | This function calculates the mutual information between y and X in cases where we assume y categorical/with few values. 71 | 72 | Inputs: - X: numpy array of features; 73 | - y: numpy array of labels; 74 | - gmm: dict. of GMM trained models; 75 | - feat: features indexes (feat); 76 | - eps: small value so we can avoid taking log of zero in some cases 77 | 78 | Output: - dictionary containing the estimate for the mutual information between y and X, 79 | and the standard deviation of measurements calculated from the samples. 
80 | ''' 81 | 82 | 83 | n,d=X.shape 84 | classes=list(set(y)) 85 | p={} 86 | 87 | ### Calculating log-likelihood with samples (y_i) 88 | like=np.zeros(n) 89 | for c in classes: 90 | p[c]=np.mean(y==c) 91 | like[y==c]=p[c] 92 | 93 | log_like_y=np.log(like + eps) 94 | 95 | 96 | ### Calculating log-likelihood with samples (x_i,y_i) 97 | like=np.zeros(n) 98 | for c in classes: 99 | #X|Y 100 | like_aux=np.zeros(n) 101 | for comp in range(gmm[c].n_components): 102 | like_aux[y==c]+=gmm[c].weights_[comp]*multivariate_normal.pdf(X[y==c][:,feat], gmm[c].means_[comp][feat], gmm[c].covariances_[comp][feat][:,feat]) 103 | 104 | #(X,Y) 105 | like[y==c]=p[c]*like_aux[y==c] 106 | log_like_xy=np.log(like + eps) 107 | 108 | 109 | ### Calculating log-likelihood with samples (x_i) 110 | like=np.zeros(n) 111 | for c in classes: 112 | #X|Y 113 | like_aux=np.zeros(n) 114 | for comp in range(gmm[c].n_components): 115 | like_aux+=gmm[c].weights_[comp]*multivariate_normal.pdf(X[:,feat], gmm[c].means_[comp][feat], gmm[c].covariances_[comp][feat][:,feat]) 116 | 117 | #Marginalization of (X,Y) 118 | like+=p[c]*like_aux 119 | 120 | log_like_x=np.log(like + eps) 121 | 122 | 123 | #Output 124 | m=np.mean(log_like_xy-log_like_x-log_like_y) 125 | s=np.std(log_like_xy-log_like_x-log_like_y) 126 | 127 | return {'mi':m, 'std':s} 128 | 129 | 130 | 131 | def MI(cand, posic, r, X, y, gmm, include_cand = True, eps=10**-50): 132 | 133 | ''' 134 | This function is an intermediary function between the main class and the two functions that make the calculation of the 135 | mutual information. It basically decides which of the two functions to use and if we should do the forward or backward step. 136 | 137 | Inputs: - cand: position of the candidate variable to be chosen; 138 | - posic: list with positions of the selected variables so far; 139 | - r: round; 140 | - X: numpy array of features; 141 | - y: numpy array of labels; 142 | - gmm: model or dict. 
of GMM(s); 143 | - include_cand: include or remove variables (forwar/backward)d; 144 | - eps: small value so we can avoid taking log of zero in some cases 145 | 146 | Output: - cand: position of the candidate variable to be chosen; 147 | - dic: dictionary containing the estimate for the mutual information between y and X, 148 | and the standard deviation of measurements calculated from the samples. 149 | ''' 150 | 151 | n,d=X.shape 152 | aux = copy.deepcopy(posic) 153 | if include_cand: 154 | aux[r] = cand 155 | else: 156 | aux.remove(cand) 157 | 158 | if type(gmm)==dict: 159 | dic=MI_gmm_class(X, y, gmm, aux, eps) 160 | else: 161 | dic=MI_gmm_reg(X, y, gmm, aux, eps) 162 | 163 | return cand, dic -------------------------------------------------------------------------------- /infoselect/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | name="info_selection/info_selection" 3 | __version__ = "1.0.0" 4 | 5 | ### 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | import random 10 | import copy 11 | import matplotlib.pyplot as plt 12 | from sklearn import mixture 13 | from sklearn.model_selection import train_test_split 14 | from scipy.stats import multivariate_normal 15 | from .gmm import * 16 | from .mi import * 17 | 18 | def check_array(X, name="X", dim=2): 19 | if not (type(X)==np.ndarray) or len(X.shape)!=dim: 20 | raise ValueError(name+" should be a {:}-dimensional Numpy array.".format(dim)) 21 | 22 | class SelectVars: 23 | 24 | ''' 25 | This is the main class of the package. 26 | ''' 27 | 28 | selection_mode = None 29 | gmm = None 30 | n = None 31 | 32 | def __init__(self, gmm, selection_mode = 'forward'): 33 | """ 34 | Inputs: - gmm: model or dict. of GMM(s); 35 | - selection_mode: forward/backward algorithms. 
36 | """ 37 | 38 | if not selection_mode in ['forward', 'backward']: 39 | raise ValueError("Selection model should be either 'forward' or 'backward'.") 40 | 41 | self.selection_mode = selection_mode 42 | self.gmm=gmm 43 | 44 | def fit(self, X, y, verbose=True, eps=0): 45 | 46 | ''' 47 | This function order the features according to their importance - from 48 | most important to least important (forward) or from least important to most importante (backward). 49 | 50 | Inputs: - X: numpy array of features; 51 | - y: numpy array of labels; 52 | - verbose: print or not to print!? 53 | - eps: small value so we can avoid taking log of zero in some cases 54 | ''' 55 | 56 | #Checking input format 57 | check_array(X, name="X", dim=2) 58 | check_array(y, name="y", dim=1) 59 | 60 | '''Creating some important objects''' 61 | self.n, self.d = X.shape 62 | include_var = self.selection_mode == 'forward' #True if include or False if remove 63 | 64 | self.delta_list = [] # list with history of % changes of mutual info when adding/removing the best/worst variables 65 | self.mi_list = [] # list with mutual info history by adding/removing the best/worst variables 66 | self.stds_list = [] # list with stds history and that we will use to calculate the standard error of MIs 67 | self.feat_hist=[] # history of variables at each round 68 | lista = list(range(self.d)) # list with indexes of all variables 69 | 70 | '''Defining number of iterations and list of features we use in each iteration''' 71 | if verbose: print("Let's start...\n") 72 | 73 | #The 'posic' list starts empty if you include 74 | if include_var: 75 | posic = [] # lista com posições das variáveis selecionadas até o momento 76 | self.feat_hist.append(copy.deepcopy(posic)) 77 | rounds = range(self.d) 78 | 79 | self.mi_list.append(0) 80 | self.stds_list.append(0) 81 | self.delta_list.append(0) 82 | 83 | #The 'posic' list starts full if we take it out 84 | else: 85 | posic = copy.deepcopy(lista) 86 | 
self.feat_hist.append(copy.deepcopy(posic)) 87 | rounds = range(self.d-1) 88 | 89 | if type(self.gmm)==dict: 90 | dic=MI_gmm_class(X, y, self.gmm, posic, eps) 91 | else: 92 | dic=MI_gmm_reg(X, y, self.gmm, posic, eps) 93 | 94 | self.mi_list.append(dic['mi']) 95 | self.stds_list.append(dic['std']) 96 | self.delta_list.append(0) 97 | 98 | if verbose: print("Round = {:3d} | Î = {:5.2f} | Δ%Î = {:5.2f} | Features={}".format(0, self.mi_list[-1], 0, posic)) 99 | 100 | 101 | '''Calculating the Mutual Information (forward or backward fashion)''' 102 | for r in rounds: # "r" of rounds/repetitions 103 | 104 | if include_var: 105 | posic.append(None) 106 | 107 | #Calcula MI entre y e X[:,(posic, cand)] -> cand: variável candidata a ser selecionada 108 | outputs = [MI(cand, posic, r, X, y, self.gmm, include_var, eps) for cand in lista] 109 | 110 | #Escolhendo variável que traz maior retorno 111 | MI_best=-math.inf 112 | 113 | for out in outputs: 114 | 115 | cand, dic = out 116 | MI_current = dic['mi'] 117 | 118 | if MI_current > MI_best: 119 | MI_best = MI_current 120 | std_best = dic['std'] 121 | best_index = cand 122 | 123 | #Δ%Î 124 | if r==0 and include_var: 125 | self.delta_list.append(0) 126 | else: 127 | self.delta_list.append(MI_best/self.mi_list[-1]-1) 128 | 129 | #Updating variable list 130 | lista.remove(best_index) 131 | if include_var: 132 | posic[r] = best_index 133 | else: 134 | posic.remove(best_index) 135 | 136 | #Updating lists 137 | self.mi_list.append(MI_best) 138 | self.stds_list.append(std_best) 139 | self.feat_hist.append(copy.deepcopy(posic)) 140 | 141 | #Verbose 142 | if verbose: print("Round = {:3d} | Î = {:5.2f} | Δ%Î = {:5.2f} | Features={}".format(r+1, MI_best, self.delta_list[-1], posic)) 143 | 144 | 145 | 146 | def get_info(self): 147 | 148 | ''' 149 | This function creates and outputs a Pandas DataFrame with the history of feature importance. 
150 | ''' 151 | 152 | dic={'rounds': range(0,len(self.mi_list)), 153 | 'mi_mean': self.mi_list, 154 | 'mi_error': [s/np.sqrt(self.n) for s in self.stds_list], 155 | 'delta': self.delta_list, 156 | 'features':self.feat_hist, 157 | 'num_feat':[len(l) for l in self.feat_hist]} 158 | return pd.DataFrame(dic).loc[:,['rounds','mi_mean','mi_error','delta','num_feat','features']] 159 | 160 | def plot_delta(self): 161 | 162 | ''' 163 | This function plots the history of percentual changes in the mutual information. 164 | ''' 165 | 166 | l=self.delta_list 167 | plt.plot(list(range(1,len(l))),l[1:]) 168 | plt.axhline(y=0, color='r', linestyle='--') 169 | plt.xlabel("Rounds") 170 | plt.ylabel("Δ% Mutual Information") 171 | plt.show() 172 | 173 | def plot_mi(self): 174 | 175 | ''' 176 | This function plots the history of the mutual information. 177 | ''' 178 | 179 | l,s=self.mi_list, self.stds_list 180 | plt.errorbar(list(range(len(l))), l, yerr=np.array(s)/np.sqrt(self.n)) # 181 | plt.axhline(y=0, color='g', linestyle='--') 182 | plt.xlabel("Rounds") 183 | plt.ylabel("Mutual Information") 184 | plt.show() 185 | 186 | def transform(self, X, rd): 187 | 188 | ''' 189 | This transforms X using the round 'rd'. Examine the history dataframe and plot before choosing 'rd'. 
190 | ''' 191 | 192 | return X[:,self.get_info().loc[rd,'features']] 193 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # ***InfoSelect*** - Mutual Information Based Feature Selection in Python 3 | 4 | 5 | 6 | 7 | ### *Felipe Maia Polo (felipemaiapolo), Felipe Leno da Silva (f-leno)* 8 | 9 | [![PyPI](https://img.shields.io/pypi/v/infoselect.svg)](https://pypi.python.org/pypi/infoselect) 10 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/felipemaiapolo/infoselect/blob/master/InfoSelect.ipynb) 11 | 12 | In case you have any question or suggestion, please get in touch sending us an e-mail in *felipemaiapolo@gmail.com*. 13 | 14 | -------------- 15 | ## Contents 16 | 1. [ Introduction ](#1) 17 | 2. [ Installing *InfoSelect* ](#2) 18 | 3. [ Main functionalities of *InfoSelect* ](#3) 19 | 4. [ Examples of *InfoSelect* use ](#4) 20 | 5. [ References ](#5) 21 | 22 | -------------- 23 | 24 | 25 | ## 1\. Introduction 26 | 27 | In this package we implement the ideas proposed by [1, 2] in order to make variable/feature selection prior to regression and classification tasks using Gaussian Mixture Models (GMMs) to estimate the Mutual Information between labels and features. 
This is an efficient and well-performing alternative and was used in a recent work [3] by one of us: 28 | 29 | @article{maia2022effective, 30 | title={Effective sample size, dimensionality, and generalization in covariate shift adaptation}, 31 | author={Maia Polo, Felipe and Vicente, Renato}, 32 | journal={Neural Computing and Applications}, 33 | pages={1--13}, 34 | year={2022}, 35 | publisher={Springer} 36 | } 37 | 38 | 39 | @misc{polo2020infoselect, 40 | title={InfoSelect - Mutual Information Based Feature Selection in Python}, 41 | author={Polo, Felipe Maia and Da Silva, Felipe Leno}, 42 | journal={GitHub: github.com/felipemaiapolo/infoselect}, 43 | year={2020} 44 | } 45 | 46 | 47 | 48 | 49 | -------------- 50 | 51 | 52 | ## 2\. Installing *InfoSelect* 53 | 54 | You can install the package from 55 | [PyPI](https://pypi.org/project/infoselect/) 56 | 57 | ``` :sh 58 | $ pip install infoselect 59 | ``` 60 | 61 | Also, you can install the package from 62 | [GitHub](https://github.com/felipemaiapolo/infosel). 63 | 64 | ``` :sh 65 | $ pip install git+https://github.com/felipemaiapolo/infoselect.git#egg=infoselect 66 | ``` 67 | 68 | -------------------- 69 | 70 | 71 | ## 3\. Main functionalities of *InfoSelect* 72 | 73 | 74 | ### 3.1\. Main Class `SelectVars` 75 | 76 | This class is used to order features/variables according to their importance and making the selection itself. Next we detail its methods: 77 | 78 | 1. `__init__(self, gmm, selection_mode = 'forward')` 79 | - **gmm**: 80 | - If $Y$ is *non*-categorical: a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in (y,X) - y should always be in the first column; 81 | - If $Y$ is categorical: a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in X conditional on each category - something like X[y==c,:]. 
Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`; 82 | - Please use auxiliary function `get_gmm` below, especially if you want to use `covariance_type` other than 'full'. 83 | - **selection_mode**: `forward`/`backward` algorithms. 84 | - `forward` selection: we start with an empty set of features and then select the feature that has the largest estimated mutual information with the target variable. At each subsequent step, we select the feature that marginally maximizes the estimated mutual information of the target and all the chosen features so far. We stop when we have selected/ordered all the features; 85 | - `backward` elimination: we start with the full set of features and then, at each step, we eliminate the feature whose removal maximizes the estimated mutual information between the target and the remaining features. We stop when we have no more features to eliminate; 86 | 87 | 2. `fit(self, X, y, verbose=True, eps=0)` 88 | - **X**: numpy array of features; 89 | - **y**: numpy array of labels; 90 | - **verbose**: print or not to print!? 91 | - **eps**: small value so we can avoid taking log of zero in some cases. 92 | 93 | 3. `get_info(self)`: 94 | - This function creates and outputs a Pandas DataFrame with the history of feature selection/elimination. The `mi_mean` column gives the estimated Mutual Information while `mi_error` gives the standard error of that estimate. On the other hand, the `delta` column gives us the percentual information loss/gain in that round, relative to the previous round; 95 | 96 | 4. `plot_delta(self)`: 97 | - This function plots the history of percentual changes in the mutual information. 98 | 99 | 5. `plot_mi(self)`: 100 | - This function plots the history of the mutual information. 101 | 102 | 6. `transform(self, X, rd)`: 103 | - This function takes **X** and transforms it into **X_new**, keeping only the features of Round `rd`; 104 | 105 | 106 | ### 3.2\. Auxiliary Function `get_gmm` 107 | 108 | 1. 
`get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type="full", random_state=42)`: 109 | 110 | - First, this function validates the number of GMM components, for each model it will train, on a holdout set using the mean log-likelihood of the samples in that set. If Y is non-categorical, it returns a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) model fitted on (y,X) (in this order). On the other hand, if Y is categorical it returns a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`. 111 | 112 | - **X**: numpy array of features; 113 | - **y**: numpy array of labels; 114 | - **y_cat**: if we should consider Y as categorical; 115 | - **num_comps**: numbers of GMM components to be validated; 116 | - **val_size**: size of holdout set used to validate the GMMs numbers of components; 117 | - **reg_covar**: non-negative regularization added to the diagonal of covariance. Ensures the covariance matrices are non-singular. 118 | - **covariance_type**: one of the following options:'full','tied','diag','spherical'. See [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html). Thanks to Pritha Gupta for her suggestion on this point. 119 | - **random_state**: seed. 120 | 121 | -------------------- 122 | 123 | 124 | ## 4\. Examples of *InfoSelect* use 125 | 126 | Loading Packages: 127 | 128 | 129 | ```python 130 | import infoselect as inf 131 | import numpy as np 132 | import pandas as pd 133 | import matplotlib.pyplot as plt 134 | ``` 135 | 136 | 137 | ### 4.1\. 
Dataset 138 | 139 | We generate a dataset $\mathcal{D}$ sampled from $\mathcal{D}=\{(X_{0,i},...,X_{6,i},Y_i)\}_{i=1}^{n}$ similar to the one in [here](https://www.cs.toronto.edu/~delve/data/add10/desc.html), in which $Y_i$ is given by 140 | 141 | $$ 142 | Y_i = 10 \sin(\pi X_{0,i} X_{1,i}) + 20 (X_{2,i} - 0.5)^2 + 10 X_{3,i} + 5 X_{4,i} + \varepsilon_i 143 | $$ 144 | 145 | Where $X_{0,i},...,X_{6,i} \overset{iid}{\sim} U[0,1]$ and $\varepsilon_i \sim N(0,1)$ independent from all the other random variables for all $i\in [n]$. See that our target variable does not depend on the last two features. In the following, we set `n=10000`: 146 | 147 | 148 | ```python 149 | n=10000 150 | d=7 151 | 152 | X = np.random.uniform(0,1,d*n).reshape((n,d)) 153 | e = np.random.normal(0,1,n) 154 | y = f(X,e) 155 | 156 | X.shape, y.shape 157 | ``` 158 | 159 | 160 | 161 | 162 | ((10000, 7), (10000,)) 163 | 164 | 165 | 166 | ### 4.2\. Selecting Features for a Regression Task 167 | 168 | Training (and validating) GMM: 169 | 170 | 171 | ```python 172 | %%time 173 | 174 | gmm = inf.get_gmm(X, y) 175 | ``` 176 | 177 | Wall time: 8.43 s 178 | 179 | 180 | Ordering features by their importances using the *Backward Elimination* algorithm: 181 | 182 | 183 | ```python 184 | select = inf.SelectVars(gmm, selection_mode = 'backward') 185 | select.fit(X, y, verbose=True) 186 | ``` 187 | 188 | Let's start... 189 | 190 | Round = 0 | Î = 1.36 | Δ%Î = 0.00 | Features=[0, 1, 2, 3, 4, 5, 6] 191 | Round = 1 | Î = 1.36 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4, 5] 192 | Round = 2 | Î = 1.36 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4] 193 | Round = 3 | Î = 0.97 | Δ%Î = -0.29 | Features=[0, 1, 3, 4] 194 | Round = 4 | Î = 0.73 | Δ%Î = -0.24 | Features=[0, 1, 3] 195 | Round = 5 | Î = 0.40 | Δ%Î = -0.46 | Features=[0, 3] 196 | Round = 6 | Î = 0.21 | Δ%Î = -0.48 | Features=[3] 197 | 198 | 199 | Checking history: 200 | 201 | 202 | ```python 203 | select.get_info() 204 | ``` 205 | 206 | 207 | 208 | 209 |
210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 |
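|   | rounds | mi_mean | mi_error | delta | num_feat | features |
|---|--------|---------|----------|-------|----------|----------|
| 0 | 0 | 1.358832 | 0.008771 | 0.000000 | 7 | [0, 1, 2, 3, 4, 5, 6] |
| 1 | 1 | 1.358090 | 0.008757 | -0.000546 | 6 | [0, 1, 2, 3, 4, 5] |
| 2 | 2 | 1.356661 | 0.008753 | -0.001053 | 5 | [0, 1, 2, 3, 4] |
| 3 | 3 | 0.969897 | 0.007843 | -0.285085 | 4 | [0, 1, 3, 4] |
| 4 | 4 | 0.734578 | 0.007396 | -0.242622 | 3 | [0, 1, 3] |
| 5 | 5 | 0.400070 | 0.007192 | -0.455375 | 2 | [0, 3] |
| 6 | 6 | 0.209808 | 0.005429 | -0.475571 | 1 | [3] |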
288 |
289 | 290 | 291 | 292 | It is possible to see that the estimated mutual information is untouched until Round 2, when it varies around -30%. 293 | 294 | Since there is a 'break' in Round 2, we should choose to stop the algorithm at that round. This will be clear in the Mutual Information history plot that follows: 295 | 296 | 297 | ```python 298 | select.plot_mi() 299 | ``` 300 | 301 | 302 | 303 | 304 | Plotting the percentual variations of the mutual information between rounds: 305 | 306 | 307 | ```python 308 | select.plot_delta() 309 | ``` 310 | 311 | 312 | ![png](https://raw.githubusercontent.com/felipemaiapolo/imgs_infoselect/main/output_19_0.png) 313 | 314 | 315 | Making the selection choosing to stop at Round 2: 316 | 317 | 318 | ```python 319 | X_new = select.transform(X, rd=2) 320 | 321 | X_new.shape 322 | ``` 323 | 324 | 325 | 326 | 327 | (10000, 5) 328 | 329 | 330 | 331 | ### 4.3\. Selecting Features for a Classification Task 332 | 333 | Categorizing Y: 334 | 335 | 336 | ```python 337 | ind0 = (y 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 |
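|   | rounds | mi_mean | mi_error | delta | num_feat | features |
|---|--------|---------|----------|-------|----------|----------|
| 0 | 0 | 0.000000 | 0.000000 | 0.000000 | 0 | [] |
| 1 | 1 | 0.139542 | 0.005217 | 0.000000 | 1 | [3] |
| 2 | 2 | 0.280835 | 0.006377 | 1.012542 | 2 | [3, 0] |
| 3 | 3 | 0.503872 | 0.006499 | 0.794196 | 3 | [3, 0, 1] |
| 4 | 4 | 0.617048 | 0.006322 | 0.224612 | 4 | [3, 0, 1, 4] |
| 5 | 5 | 0.745933 | 0.005135 | 0.208874 | 5 | [3, 0, 1, 4, 2] |
| 6 | 6 | 0.745549 | 0.005202 | -0.000515 | 6 | [3, 0, 1, 4, 2, 5] |
| 7 | 7 | 0.740968 | 0.005457 | -0.006144 | 7 | [3, 0, 1, 4, 2, 5, 6] |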
490 | 491 | 492 | 493 | 494 | It is possible to see that the estimated mutual information no longer increases from Round 6 onwards. 495 | 496 | Since there is a 'break' in Round 5, we should choose to stop the algorithm at that round. This will be clear in the Mutual Information history plot that follows: 497 | 498 | 499 | ```python 500 | select.plot_mi() 501 | ``` 502 | 503 | 504 | ![png](https://raw.githubusercontent.com/felipemaiapolo/imgs_infoselect/main/output_33_0.png) 505 | 506 | 507 | Plotting the percentage variations of the mutual information between rounds: 508 | 509 | 510 | ```python 511 | select.plot_delta() 512 | ``` 513 | 514 | ![png](https://raw.githubusercontent.com/felipemaiapolo/imgs_infoselect/main/output_35_0.png) 515 | 516 | 517 | Making the selection choosing to stop at Round 5: 518 | 519 | 520 | ```python 521 | X_new = select.transform(X, rd=5) 522 | 523 | X_new.shape 524 | ``` 525 | 526 | 527 | 528 | 529 | (10000, 5) 530 | 531 | -------------- 532 | 533 | 534 | ## 5\. References 535 | 536 | [1] Eirola, E., Lendasse, A., & Karhunen, J. (2014, July). Variable selection for regression problems using Gaussian mixture models to estimate mutual information. In 2014 International Joint Conference on Neural Networks (IJCNN) (pp. 1606-1613). IEEE. 537 | 538 | [2] Lan, T., Erdogmus, D., Ozertem, U., & Huang, Y. (2006, July). Estimating mutual information using gaussian mixture model for feature ranking and selection. In The 2006 IEEE International Joint Conference on Neural Network Proceedings (pp. 5034-5039). IEEE. 539 | 540 | [3] Maia Polo, F., & Vicente, R. (2022). Effective sample size, dimensionality, and generalization in covariate shift adaptation. Neural Computing and Applications, 1-13.
541 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/InfoSelect-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "view-in-github" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "veqo_rjtmTKv" 17 | }, 18 | "source": [ 19 | "# ***InfoSelect*** - Mutual Information Based Feature Selection in Python\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "\n", 24 | "### *Felipe Maia Polo (felipemaiapolo), Felipe Leno da Silva (f-leno)*\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "08mvCZGw1pvk" 31 | }, 32 | "source": [ 33 | "## Contents\n", 34 | "1. [ Introduction ](#1)\n", 35 | "2. [ Installing *InfoSelect* ](#2)\n", 36 | "3. [ Main functionalities of *InfoSelect* ](#3)\n", 37 | "4. [ Examples of *InfoSelect* use ](#4)\n", 38 | "5. [ References ](#5)\n", 39 | "\n", 40 | "--------------\n", 41 | "\n", 42 | "\n", 43 | "## 1\\. Introduction \n", 44 | "\n", 45 | "In this package we implement the ideas proposed by [1, 2] in order to make variable/feature selection prior to regression and classification tasks using Gaussian Mixture Models (GMMs) to estimate the Mutual Information between labels and features. 
This is an efficient and well-performing alternative and was used in a recent work [3] by one of us.\n", 46 | "\n", 47 | "If you use our package in your research, you can cite it as follows:\n", 48 | "\n", 49 | " @article{maia2022effective,\n", 50 | " title={Effective sample size, dimensionality, and generalization in covariate shift adaptation},\n", 51 | " author={Maia Polo, Felipe and Vicente, Renato},\n", 52 | " journal={Neural Computing and Applications},\n", 53 | " pages={1--13},\n", 54 | " year={2022},\n", 55 | " publisher={Springer}\n", 56 | " }\n", 57 | "\n", 58 | "\n", 59 | " @misc{polo2020infoselect,\n", 60 | " title={InfoSelect - Mutual Information Based Feature Selection in Python},\n", 61 | " author={Polo, Felipe Maia and Da Silva, Felipe Leno},\n", 62 | " journal={GitHub: github.com/felipemaiapolo/infoselect},\n", 63 | " year={2020}\n", 64 | " }\n", 65 | "\n", 66 | "\n", 67 | "--------------\n", 68 | "\n", 69 | "\n", 70 | "## 2\\. Installing *InfoSelect* \n", 71 | "\n", 72 | "You can install the package from\n", 73 | "[PyPI](https://pypi.org/project/infoselect/)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 1, 79 | "metadata": { 80 | "colab": { 81 | "base_uri": "https://localhost:8080/" 82 | }, 83 | "id": "MFT2CLZomTK1", 84 | "outputId": "876f4022-a7e9-492b-cbc0-1b50fee497d0" 85 | }, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Requirement already satisfied: infoselect in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (1.0.2)\n", 92 | "Requirement already satisfied: sklearn in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (0.0)\n", 93 | "Requirement already satisfied: numpy in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.21.6)\n", 94 | "Requirement already satisfied: pandas in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.3.5)\n", 95 | "Requirement already satisfied: 
scipy in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (1.7.3)\n", 96 | "Requirement already satisfied: matplotlib in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from infoselect) (3.5.1)\n", 97 | "Requirement already satisfied: packaging>=20.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (21.3)\n", 98 | "Requirement already satisfied: cycler>=0.10 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (0.11.0)\n", 99 | "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (2.8.2)\n", 100 | "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (3.0.4)\n", 101 | "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (4.33.3)\n", 102 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (1.4.2)\n", 103 | "Requirement already satisfied: pillow>=6.2.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from matplotlib->infoselect) (9.1.0)\n", 104 | "Requirement already satisfied: typing-extensions in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from kiwisolver>=1.0.1->matplotlib->infoselect) (4.1.1)\n", 105 | "Requirement already satisfied: six>=1.5 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib->infoselect) (1.16.0)\n", 106 | "Requirement already satisfied: pytz>=2017.3 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from pandas->infoselect) (2022.1)\n", 107 | "Requirement already satisfied: scikit-learn in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from sklearn->infoselect) (1.0.2)\n", 108 | "Requirement 
already satisfied: joblib>=0.11 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from scikit-learn->sklearn->infoselect) (1.1.0)\n", 109 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/anaconda/envs/cit/lib/python3.7/site-packages (from scikit-learn->sklearn->infoselect) (3.1.0)\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "!pip install infoselect" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "R0IgxOvs2VLp" 121 | }, 122 | "source": [ 123 | "\n", 124 | "## 3\\. Main functionalities of *InfoSelect* \n", 125 | "\n", 126 | "\n", 127 | "### 3.1\\. Main Class `SelectVars`\n", 128 | "\n", 129 | "This class is used to order features/variables according to their importance and to make the selection itself. Next we detail its methods:\n", 130 | "\n", 131 | "1. `__init__(self, gmm, selection_mode = 'forward')`\n", 132 | " - **gmm**: \n", 133 | " - If Y is *non*-categorical: a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in (y,X) - y should always be in the first column;\n", 134 | " - If Y is categorical: a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`;\n", 135 | " - PS: the GMMs must be `covariance_type='full'` at the current *InfoSelect* version.\n", 136 | " - **selection_mode**: `forward`/`backward` algorithms.\n", 137 | " - `forward` selection: we start with an empty set of features and then select the feature that has the largest estimated mutual information with the target variable. At each subsequent step, we select the feature that marginally maximizes the estimated mutual information of the target and all the chosen features so far.
We stop when we have selected/ordered all the features;\n", 138 | " - `backward` elimination: we start with the full set of features and then at each step, we eliminate the feature that marginally maximizes the estimated mutual information of the target and all the remaining features. We stop when we have no more features to eliminate;\n", 139 | "\n", 140 | "2. `fit(self, X, y, verbose=True, eps=0)`\n", 141 | " - **X**: numpy array of features; \n", 142 | " - **y**: numpy array of labels;\n", 143 | " - **verbose**: whether to print the progress of each round;\n", 144 | " - **eps**: small value so we can avoid taking log of zero in some cases.\n", 145 | "\n", 146 | "3. `get_info(self)`: \n", 147 | " - This function creates and outputs a Pandas DataFrame with the history of feature selection/elimination. The `mi_mean` column gives the estimated Mutual Information while `mi_error` gives the standard error of that estimate. On the other hand, the `delta` column gives us the percentage information loss/gain in that round, relative to the previous round;\n", 148 | " \n", 149 | "4. `plot_delta(self)`: \n", 150 | " - This function plots the history of percentage changes in the mutual information.\n", 151 | " \n", 152 | "5. `plot_mi(self)`: \n", 153 | " - This function plots the history of the mutual information.\n", 154 | " \n", 155 | "6. `transform(self, X, rd)`: \n", 156 | " - This function takes **X** and transforms it into **X_new**, maintaining the features of Round `rd`; \n", 157 | " \n", 158 | "\n", 159 | "### 3.2\\. Auxiliary Function `get_gmm`\n", 160 | "\n", 161 | "1. `get_gmm(X, y, y_cat=False, num_comps=[2,5,10,15,20,30,40,50], val_size=0.33, reg_covar=1e-06, covariance_type=\"full\", random_state=42)`: \n", 162 | "\n", 163 | " - Firstly, this function validates the number of GMM components, for each model it will train, in a holdout set using the mean log likelihood of samples in that set.
If Y is non-categorical, it returns a [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in (y,X) model (in this order). On the other hand, if Y is categorical it returns a Python dictionary containing one [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) fitted in X conditional on each category - something like X[y==c,:]. Format `{0:gmm0, 1:gmm1, ..., C:gmmC}`.\n", 164 | "\n", 165 | " - **X**: numpy array of features; \n", 166 | " - **y**: numpy array of labels;\n", 167 | " - **y_cat**: if we should consider Y as categorical;\n", 168 | " - **num_comps**: numbers of GMM components to be validated;\n", 169 | " - **val_size**: size of holdout set used to validate the GMMs numbers of components;\n", 170 | " - **reg_covar**: non-negative regularization added to the diagonal of covariance. Ensures the covariance matrices are non-singular.\n", 171 | " - **covariance_type**: one of the following options:'full','tied','diag','spherical'. See [Scikit-Learn GMM](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html)\n", 172 | " - **random_state**: seed.\n", 173 | "\n", 174 | "--------------------\n", 175 | "\n", 176 | "\n", 177 | "## 4\\. Examples of *InfoSelect* use\n", 178 | "\n", 179 | "Loading Packages:" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 2, 185 | "metadata": { 186 | "id": "EdhDRC3SmTLS" 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "import infoselect as inf\n", 191 | "import numpy as np \n", 192 | "import pandas as pd\n", 193 | "import matplotlib.pyplot as plt" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "id": "YdeqfsuymTLX" 200 | }, 201 | "source": [ 202 | "### 1.1\\. 
Dataset\n", 203 | "\n", 204 | "We generate a dataset $D$ sampled from $\\mathcal{D}=\\{(X_{0,i},...,X_{6,i},Y_i)\\}_{i=1}^{n}$ similar to the one in [here](https://www.cs.toronto.edu/~delve/data/add10/desc.html), in which $Y_i$ is given by\n", 205 | "\n", 206 | "
\n", 207 | "\\begin{align}\n", 208 | "Y_i &= f(X_{0,i},...,X_{6,i}) + \\epsilon_i \\\\[.5em]\n", 209 | "&=10\\cdot \\sin(\\pi X_{0,i} X_{1,i}) + 20 (X_{2,i}-0.5)^2 + 10 X_{3,i} + 5 X_{4,i} + \\epsilon_i\n", 210 | "\\end{align}\n", 211 | "
\n", 212 | "\n", 213 | "Where $X_{0,i},...,X_{6,i} \\overset{iid}{\\sim} U[0,1]$ and $\\epsilon_i \\sim N(0,1)$ independent from all the other random variables for all $i\\in [n]$. In the following we set $n=20000$:" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 3, 219 | "metadata": { 220 | "id": "8Y4_HMDDmTLZ" 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "def f(X,e): return 10*np.sin(np.pi*X[:,0]*X[:,1]) + 20*(X[:,2]-.5)**2 + 10*X[:,3] + 5*X[:,4] + e" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 4, 230 | "metadata": { 231 | "colab": { 232 | "base_uri": "https://localhost:8080/" 233 | }, 234 | "id": "isQqEnhDmTLl", 235 | "outputId": "cd966641-eadb-40cc-ec64-67cb54ae8f31" 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "((20000, 7), (20000,))" 242 | ] 243 | }, 244 | "execution_count": 4, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "n=20000\n", 251 | "d=7\n", 252 | "\n", 253 | "X = np.random.uniform(0,1,d*n).reshape((n,d))\n", 254 | "e = np.random.normal(0,1,n)\n", 255 | "y = f(X,e)\n", 256 | "\n", 257 | "X.shape, y.shape" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "id": "Sgl9V_enmTL2" 264 | }, 265 | "source": [ 266 | "### 1.2\\. 
Selecting Features for a Regression Task" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": { 272 | "id": "J-KknWzDmTL4" 273 | }, 274 | "source": [ 275 | "Training (and validating) GMM:" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 228, 281 | "metadata": { 282 | "colab": { 283 | "base_uri": "https://localhost:8080/" 284 | }, 285 | "id": "KKg_aU1hmTL8", 286 | "outputId": "e0071c65-ee69-4627-b4f1-519111c87d5c" 287 | }, 288 | "outputs": [ 289 | { 290 | "ename": "TypeError", 291 | "evalue": "get_gmm() got an unexpected keyword argument 'covariance_type'", 292 | "output_type": "error", 293 | "traceback": [ 294 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 295 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 296 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 297 | "\u001b[0;31mTypeError\u001b[0m: get_gmm() got an unexpected keyword argument 'covariance_type'" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "%%time\n", 303 | "\n", 304 | "gmm = inf.get_gmm(X, y)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": { 311 | "colab": { 312 | "base_uri": "https://localhost:8080/" 313 | }, 314 | "id": "FPtgps_emTMF", 315 | "outputId": "7854aeca-190e-45cb-8665-8572f38ac2ac" 316 | }, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,\n", 322 | " means_init=None, n_components=20, n_init=1,\n", 323 | " precisions_init=None, random_state=42, reg_covar=1e-06,\n", 324 | " tol=0.001, verbose=0, verbose_interval=10, warm_start=False,\n", 325 | " weights_init=None)" 326 | ] 327 | }, 328 | "execution_count": 6, 329 | "metadata": { 330 | "tags": [] 331 | }, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "gmm" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 
341 | "metadata": { 342 | "id": "K0gL-DJ-mTMO" 343 | }, 344 | "source": [ 345 | "Ordering features by their importances using the *Backward Elimination* algorithm:" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": { 352 | "colab": { 353 | "base_uri": "https://localhost:8080/" 354 | }, 355 | "id": "F7Gy-vgWmTMR", 356 | "outputId": "a4d2f504-ebf5-4b47-c647-5b61f6cdd09c" 357 | }, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "Let's start...\n", 364 | "\n", 365 | "Round = 0 | Î = 1.48 | Δ%Î = 0.00 | Features=[0, 1, 2, 3, 4, 5, 6]\n", 366 | "Round = 1 | Î = 1.48 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4, 5]\n", 367 | "Round = 2 | Î = 1.48 | Δ%Î = -0.00 | Features=[0, 1, 2, 3, 4]\n", 368 | "Round = 3 | Î = 1.00 | Δ%Î = -0.32 | Features=[0, 1, 3, 4]\n", 369 | "Round = 4 | Î = 0.75 | Δ%Î = -0.25 | Features=[0, 1, 3]\n", 370 | "Round = 5 | Î = 0.39 | Δ%Î = -0.48 | Features=[1, 3]\n", 371 | "Round = 6 | Î = 0.21 | Δ%Î = -0.46 | Features=[3]\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "select = inf.SelectVars(gmm, selection_mode = 'backward')\n", 377 | "select.fit(X, y, verbose=True) " 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "id": "px7utZtNmTMW" 384 | }, 385 | "source": [ 386 | "Checking history:" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "colab": { 394 | "base_uri": "https://localhost:8080/", 395 | "height": 258 396 | }, 397 | "id": "UgTyazIDmTMX", 398 | "outputId": "1ad808f5-36bd-4142-d114-af353fd90a9c" 399 | }, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/html": [ 404 | "
\n", 405 | "\n", 418 | "\n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | "
roundsmi_meanmi_errordeltanum_featfeatures
001.4839690.0061470.0000007[0, 1, 2, 3, 4, 5, 6]
111.4831010.006139-0.0005856[0, 1, 2, 3, 4, 5]
221.4816670.006135-0.0009675[0, 1, 2, 3, 4]
331.0012920.005613-0.3242124[0, 1, 3, 4]
440.7519520.005343-0.2490183[0, 1, 3]
550.3901380.005015-0.4811662[1, 3]
660.2122570.003765-0.4559461[3]
\n", 496 | "
" 497 | ], 498 | "text/plain": [ 499 | " rounds mi_mean mi_error delta num_feat features\n", 500 | "0 0 1.483969 0.006147 0.000000 7 [0, 1, 2, 3, 4, 5, 6]\n", 501 | "1 1 1.483101 0.006139 -0.000585 6 [0, 1, 2, 3, 4, 5]\n", 502 | "2 2 1.481667 0.006135 -0.000967 5 [0, 1, 2, 3, 4]\n", 503 | "3 3 1.001292 0.005613 -0.324212 4 [0, 1, 3, 4]\n", 504 | "4 4 0.751952 0.005343 -0.249018 3 [0, 1, 3]\n", 505 | "5 5 0.390138 0.005015 -0.481166 2 [1, 3]\n", 506 | "6 6 0.212257 0.003765 -0.455946 1 [3]" 507 | ] 508 | }, 509 | "execution_count": 8, 510 | "metadata": { 511 | "tags": [] 512 | }, 513 | "output_type": "execute_result" 514 | } 515 | ], 516 | "source": [ 517 | "select.get_info()" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": { 523 | "id": "CtsYNrEsmTMc" 524 | }, 525 | "source": [ 526 | "It is possible to see that the estimated mutual information is untouched until Round 2, when it varies around -$30\\%$.\n", 527 | "\n", 528 | "Since there is a 'break' in Round 2, we should choose to stop the algorithm at theta round. This will be clear in the Mutual Information history plot that follows:" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "colab": { 536 | "base_uri": "https://localhost:8080/", 537 | "height": 279 538 | }, 539 | "id": "5G8kBvEvmTMd", 540 | "outputId": "f9ed0da4-762e-48dd-d9ce-3f94e8db2c81" 541 | }, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "image/png": "\n", 546 | "text/plain": [ 547 | "
" 548 | ] 549 | }, 550 | "metadata": { 551 | "needs_background": "light", 552 | "tags": [] 553 | }, 554 | "output_type": "display_data" 555 | } 556 | ], 557 | "source": [ 558 | "select.plot_mi()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": { 564 | "id": "uYZoCp1vmTMh" 565 | }, 566 | "source": [ 567 | "Plotting the percentual variations of the mutual information between rounds:" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": { 574 | "colab": { 575 | "base_uri": "https://localhost:8080/", 576 | "height": 279 577 | }, 578 | "id": "BBV2ciMkmTMh", 579 | "outputId": "d8ee110e-0245-42e1-ebb6-5a14f21b4b5d" 580 | }, 581 | "outputs": [ 582 | { 583 | "data": { 584 | "image/png": "\n", 585 | "text/plain": [ 586 | "
" 587 | ] 588 | }, 589 | "metadata": { 590 | "needs_background": "light", 591 | "tags": [] 592 | }, 593 | "output_type": "display_data" 594 | } 595 | ], 596 | "source": [ 597 | "select.plot_delta()" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": { 603 | "id": "5QAhk35EmTMo" 604 | }, 605 | "source": [ 606 | "Making the selection choosing to stop at Round 2:" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": null, 612 | "metadata": { 613 | "colab": { 614 | "base_uri": "https://localhost:8080/" 615 | }, 616 | "id": "u6qtvG8vmTMp", 617 | "outputId": "04946e63-1d8b-4a6b-c233-bb85e9cb54a2" 618 | }, 619 | "outputs": [ 620 | { 621 | "data": { 622 | "text/plain": [ 623 | "(20000, 5)" 624 | ] 625 | }, 626 | "execution_count": 11, 627 | "metadata": { 628 | "tags": [] 629 | }, 630 | "output_type": "execute_result" 631 | } 632 | ], 633 | "source": [ 634 | "X_new = select.transform(X, rd=2)\n", 635 | "\n", 636 | "X_new.shape" 637 | ] 638 | }, 639 | { 640 | "cell_type": "markdown", 641 | "metadata": { 642 | "id": "OzXhfro9mTMx" 643 | }, 644 | "source": [ 645 | "### 1.3\\. 
Selecting Features for a Classification Task" 646 | ] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": { 651 | "id": "Upuhc2UqmTMy" 652 | }, 653 | "source": [ 654 | "Categorizing $Y$:" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": null, 660 | "metadata": { 661 | "id": "wDOrI5yRmTMz" 662 | }, 663 | "outputs": [], 664 | "source": [ 665 | "ind0 = (y\n", 848 | "\n", 861 | "\n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | "
roundsmi_meanmi_errordeltanum_featfeatures
000.0000000.0000000.0000000[]
110.1449040.0037170.0000001[3]
220.2808400.0044930.9381102[3, 1]
330.5245220.0045590.8676913[3, 1, 0]
440.6362690.0043150.2130454[3, 1, 0, 4]
550.7951230.0034000.2496675[3, 1, 0, 4, 2]
660.7926730.003550-0.0030826[3, 1, 0, 4, 2, 6]
770.7913150.003708-0.0017127[3, 1, 0, 4, 2, 6, 5]
\n", 948 | "" 949 | ], 950 | "text/plain": [ 951 | " rounds mi_mean mi_error delta num_feat features\n", 952 | "0 0 0.000000 0.000000 0.000000 0 []\n", 953 | "1 1 0.144904 0.003717 0.000000 1 [3]\n", 954 | "2 2 0.280840 0.004493 0.938110 2 [3, 1]\n", 955 | "3 3 0.524522 0.004559 0.867691 3 [3, 1, 0]\n", 956 | "4 4 0.636269 0.004315 0.213045 4 [3, 1, 0, 4]\n", 957 | "5 5 0.795123 0.003400 0.249667 5 [3, 1, 0, 4, 2]\n", 958 | "6 6 0.792673 0.003550 -0.003082 6 [3, 1, 0, 4, 2, 6]\n", 959 | "7 7 0.791315 0.003708 -0.001712 7 [3, 1, 0, 4, 2, 6, 5]" 960 | ] 961 | }, 962 | "execution_count": 16, 963 | "metadata": { 964 | "tags": [] 965 | }, 966 | "output_type": "execute_result" 967 | } 968 | ], 969 | "source": [ 970 | "select.get_info()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "markdown", 975 | "metadata": { 976 | "id": "wFTKMAtJmTN4" 977 | }, 978 | "source": [ 979 | "It is possible to see that the estimated mutual information is untouched from Round 6 onwards.\n", 980 | "\n", 981 | "Since there is a 'break' in Round 5, we should choose to stop the algorithm at theta round. This will be clear in the Mutual Information history plot that follows:" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": { 988 | "colab": { 989 | "base_uri": "https://localhost:8080/", 990 | "height": 279 991 | }, 992 | "id": "SX0Le6YMmTN6", 993 | "outputId": "54053b01-dcd5-46ce-c1b6-4357536aa11b" 994 | }, 995 | "outputs": [ 996 | { 997 | "data": { 998 | "image/png": "\n", 999 | "text/plain": [ 1000 | "
" 1001 | ] 1002 | }, 1003 | "metadata": { 1004 | "needs_background": "light", 1005 | "tags": [] 1006 | }, 1007 | "output_type": "display_data" 1008 | } 1009 | ], 1010 | "source": [ 1011 | "select.plot_mi()" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "metadata": { 1017 | "id": "NbCXcBoCmTN_" 1018 | }, 1019 | "source": [ 1020 | "Plotting the percentual variations of the mutual information between rounds:" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": { 1027 | "colab": { 1028 | "base_uri": "https://localhost:8080/", 1029 | "height": 279 1030 | }, 1031 | "id": "k145NZeOmTOA", 1032 | "outputId": "07354713-4f34-4c28-ac69-084d2fe2d1ff" 1033 | }, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXyU5bn/8c812VdCNpaEJWEVBCEgm1bFahUXVLBV616t/dljjx49v7Zaa1trT9tTa8/vnNpatS5VW7WCFvdaRU/bRATCIrtMQEjYkgkJhOzJ9ftjJhoCJEPI5JmZ53q/Xs/LmWeemfkO4Fzz3Pf93LeoKsYYY9zL43QAY4wxzrJCYIwxLmeFwBhjXM4KgTHGuJwVAmOMcblYpwMcr+zsbB05cqTTMYwxJqKsXLmySlVzjvZYxBWCkSNHsmLFCqdjGGNMRBGRT4/1mDUNGWOMy1khMMYYl7NCYIwxLmeFwBhjXM4KgTHGuJwVAmOMcTkrBMYY43JWCCLQqh37ef6jHeyqaXA6ijEmCkTcBWUGvrvoYzbvPQjA+MFpzB2fy9njc5k6LIPYGKvtxpjjY4UgwlQebGLz3oNcN3sE+QOTeG/TPh773zJ++76XAUlxnDE2h7PH53Dm2FwyU+KdjmuMiQBWCCLMh2U+AC6bmsfU4QO55YxRHGhs4R+fVPHepn28v3kfr67ZhQhMGZbB2eNymTs+l4lD0xERh9MbY8KRFYIIU+z1kZoQy6S8AZ/tS0+M44JJQ7hg0hDa25V1u2p5b9M+lm7axy/f2cIv39lCbloCc8flMnd8DqePySE1wf7qjTF+9m0QYUq8VcwsyDxmX4DHI0zOz2ByfgZ3nDOWyoNNfLClkqWb9vHGut28sGIncTHCjILMQGHIpTA7xc4WjHExKwQRZFdNA9t99Vwza0TQz8lJS+DyaflcPi2flrZ2Vn66n6Wb/WcLD7y+kQde38iIrOTPisLMgkwS42JC+CmMMeHGCkEEKfH6+wfmjMru1fPjYjzMKsxiVmEWd887iZ3V9bwfOFt4fvkOnireTlJcDKeNzmLu+FzmjstlaEZSX34EY0wYskIQQYq9PgYmxzF+cFqfvN6wzGSunTWCa2eNoLGljZIyH0s37eO9Tfv428Z9gH946lnj/MNTi4bb8FRjopEVggihqpR4q5g9K
guPp+/b8xPjYvzNQ+Ny+dF8Zeu+OpZu9heFx/9exiMfeElPjA0MT83lzLE5ZKUm9HkOY0z/s0IQIT711bOrtpFbe9ksdDxEhDGD0hgzKO0ow1MreW3tbkTglPwMzg5czDZhSHpICpQxJvSsEESI4s/6B7L6/b2POTx1cyW/+tsWHnpnCzlpCcwd5z9bOG10NmmJcf2e0xjTO1YIIkSxt4pB6QkUZqc4mqPr8NSquibe31zJ0s37eHPdHl5cUU5cjHDqyM+Hp47KseGpxoQzKwQRwN8/4OOMsTlh94WanXrs4ak/eWMjP3ljI8Mzk5k7Loe543OZVZhlw1ONCTNWCCLAlr11+A41M9uBZqHj0XV4avn+epZu9g9PfWHFTp4u+ZTEOA+njcr2D08dn0ueDU81xnFWCCJAsbcKgNmF4V0IusofeOzhqe9u8g9PHTcojZ9fPpkpwzIcTmuMe1khiADFXh/DMpMYlpnsdJRe6zo81VtZx9JNlTzygZeH3tnCH742w+mIxriWFYIw19auLCvzMe/kIU5H6TMiwujcNEbnpnGgsYWHl25l74FGBqUnOh3NGFeyy0TD3IZdBzjQ2Mqc0ZHVLBSsy6bm0a7wyqoKp6MY41pWCMJcpPYPBKswJ5Wi4RksKi1HVZ2OY4wrWSEIc8VeH6NzU8mN4maTBUX5bNlbx/pdB5yOYowrWSEIY82t7SzfXu3I1cT96aLJQ4iP8fDSynKnoxjjSlYIwtja8hrqm9uivhBkJMdzzoRclqzZRUtbu9NxjHEdKwRhrNjrQwRmFkR3IQBYMDWf6kPNvL+50ukoxriOFYIwVuytYsKQdAamxDsdJeTOHJdDVko8i0uteciY/maFIEw1trRR+mlN1DcLdYiL8TB/ylDe3biPmvpmp+MY4ypWCMLUyk/309zWHvbzC/WlhUX5NLe18+ra3U5HMcZVQloIROR8EdksIltF5LtHeXy4iCwVkVUislZELghlnkhS7K0ixuOfztktJg5NZ9ygNGseMqafhawQiEgM8DAwD5gAXCUiE7ocdi/woqpOBa4EfhOqPJGmxOtjcv4AVy3wIiIsnJbHqh01eCvrnI5jjGuE8oxgBrBVVctUtRl4HrikyzEKpAduDwB2hTBPxKhramVNea1r+gc6u3RKHh6Bl0ttyglj+ksoC0EesLPT/fLAvs5+CFwjIuXAG8C3jvZCInKLiKwQkRWVldE/vHD5tmra2pU5/bA+cbjJTU/kC2NyeHlVBe3tNuWEMf3B6c7iq4CnVDUfuAB4RkSOyKSqj6rqdFWdnpOT0+8h+1uxt4r4GA/TRgx0OoojFhTlUVHTwIfbfE5HMcYVQlkIKoBhne7nB/Z1dhPwIoCqlgCJgPt+BndR7PVRNCLDtUs6fmnCYFITYlm00pqHjOkPoSwEy4ExIlIgIvH4O4OXdDlmB/BFABE5CX8hiP62n27sP9TMht0HXNks1CEpPoYLJw3hzXW7qW9udTqOMVEvZIVAVVuB24C3gY34RwetF5H7RWR+4LC7gK+LyBrgT8AN6vK5iJdt86GKKzuKO1tQlEd9cxtvr9/jdBRjol5IVyhT1TfwdwJ33ndfp9sbgNNCmSHSFHt9JMfHMDnf3Wv4njoyk2GZSSxaWcFlU/OdjmNMVHO6s9h0Uez1cerITOJj3f1X4/EIl03N55/eKnbXNjgdx5io5u5vmzCz70AjW/fVub5ZqMPCojxU4WVbxtKYkLJCEEZKyvzDJd00v1B3RmSlMH3EQBaXVtgylsaEkBWCMFLi9ZGWGMvEoQOcjhI2Fk7LZ+u+OtaW1zodxZioZYUgjBR7fcwqzCLGI05HCRsXTBpCfKzHJqIzJoSsEISJndX17Kiut/6BLgYkxXHuhEEsWbOL5lZbxtKYULBCECY6+gfcfCHZsVxelM/++haWbt7ndBRjopIVgjBR4vWRlRLP2EGpTkcJO18Yk012agKLVlrzkDGhYIUgDKgqxd4qZo/KQsT6B
7qKjfFw6ZShLN28j+pDtoylMX2tx0IgIjkico+IPCoiT3Rs/RHOLcqqDrH3QJM1C3VjQVE+LW3Ka2ttyQpj+lowZwR/wb9ozN+A1zttpo8Uezv6B6yj+FgmDE3npCHp1jxkTAgEM9dQsqp+J+RJXKzEW8XQAYmMyEp2OkpYW1iUxwOvb2TrvoOMzk1zOo4xUSOYM4LXbFH50GlvV0q8PmaPyrb+gR7MnzKUGI+wyJaxNKZPBVMIbsdfDBpF5GBgOxDqYG6xac9B9te32LQSQchNS+SMMdm8sqqCNlvG0pg+02MhUNU0VfWoamLgdpqqpvf0PBMcm1/o+Cwoymd3bSMlXlvG0pi+EtTwURGZLyIPBraLQh3KTUq8VYzMSiYvI8npKBHh3AmDSEuMtSknjOlDwQwf/Rn+5qENge12EflpqIO5QWtbO8vKqpltw0aDlhgXw0WTh/Dmuj3UNdkylsb0hWDOCC4AzlXVJ1T1CeB84MLQxnKHdbsOcLCp1YaNHqeFRfk0tLTx1jpbxtKYvhDslcWd1020OZL7SLG3CoBZhVYIjse0EQMZkZVszUPG9JFgCsFPgVUi8pSIPA2sBH4S2ljuUOL1MW5QGjlpCU5HiSgiwoKp+ZSU+aiosWUsjTlRwYwa+hMwC1gMLAJmq+oLoQ4W7Zpa21i+vdpGC/XSgsAylq/YMpbGnLBjFgIRGR/4bxEwBCgPbEMD+8wJWL2jhsaWdusf6KVhmcnMKMhk0cpyW8bSmBPU3RQTdwK3AL88ymMKnB2SRC5R7PXhEZhp/QO9trAoj+8s+pjVO2uYOnyg03GMiVjHPCNQ1VsCN+ep6tzOG/6RROYElHh9nJw3gAFJcU5HiVjzJg0hIdbDIus0NuaEBNNZXBzkPhOk+uZWVu3cz2w7Gzgh6YlxnDdxMK+u2U1Ta5vTcYyJWN31EQwWkWlAkohMFZGiwHYWYNNknoAV2/fT0qbWUdwHFhTlUdvQwnsbbRlLY3qruz6C84AbgHzgoU77DwL3hDBT1Csp8xHrEU4dmel0lIh3+uhsctMSWFRawbxJQ5yOY0xEOmYhUNWngadFZKGqLurHTFGv2OtjyrAMUhKCWQ7CdCc2xsOlU/N44h/b8NU1kZVq12QYc7yCuY5gkYhcKCLfFpH7Orb+CBeNDjS28HF5jQ0b7UMLi/JpbVeWrLFlLI3pjWAmnXsEuAL4FiDAl4ERIc4VtT4qq6ZdsYnm+tC4wWlMHJrOYluwxpheCWbU0BxVvQ7Yr6o/AmYDY0MbK3oVe30kxHqYOjyj54NN0BYW5fNxRS1b9h50OooxESeYQtAxmUu9iAwFWvBfaWx6odhbxfSRA0mMi3E6SlT5fBlLu6bAmOMV7JrFGcAvgFJgO/CnUIaKVr66JjbtOcgcaxbqc9mpCZw1NseWsTSmF4LpLP6xqtYERg6NAMar6veDeXEROV9ENovIVhH57jGO+YqIbBCR9SLyx+OLH1k+LKsGbFnKUFk4LZ+9B5r459Yqp6MYE1F6HL8oIjH4F6IZ2XG8iKCqDwXxvIeBc/FPVrdcRJao6oZOx4wB7gZOU9X9IpLb2w8SCYq9VaQmxDI5z5Z0CIWzx+eSnhjLotJyzhib43QcYyJGMAPZXwUagY+B9uN47RnAVlUtAxCR54FL8C932eHrwMOquh9AVaP68tASr48ZBZnExgS7HpA5HolxMVx8ylAWlZZzsLGFtESbx8mYYARTCPJVdXIvXjsP2Nnpfjkws8sxYwFE5J9ADPBDVX2r6wuJyC34Z0Jl+PDhvYjivD21jZRVHeKqGZGZP1IsKMrnuWU7ePPjPXzl1GFOxzEmIgTz0/RNEflSiN4/FhgDnAVcBTwW6Jg+jKo+qqrTVXV6Tk5knvKXlPnbra1/ILSKhmdQkJ1io4eMOQ7BFIIPgZdFpEFEDojIQRE5EMTzKoDOP8nyA/s6KweWqGqLqm4DtuAvDFGneKuPAUlxTBiS7nSUqOZfxjKPZduq2
Vld73QcYyJCMIXgIfwXkSWrarqqpqlqMN9my4ExIlIgIvHAlcCSLse8gv9sABHJxt9UVBZs+EihqhR7fcwuzMLjEafjRL3LivIAeNmWsTQmKMEUgp3AOj3O9QBVtRW4DXgb2Ai8qKrrReR+EZkfOOxtwCciG4ClwP9VVd/xvE8k2FndQEVNA3NGW7NQf8gfmMyswkwWl9oylsYEI5jO4jLgfRF5E2jq2NnT8NHAMW8Ab3TZd1+n24p/Scw7gw0ciYq9/v4Bm2iu/ywoyufbL62ldMd+po2w6b6N6U4wZwTbgHeBeCCt02aCVOz1kZOWwKicVKejuMYFk4aQGOdhkU1EZ0yPuj0jCFwUNlZVr+6nPFGno3/gtNFZiFj/QH9JTYjl/ImDeW3NLu67aILN7WRMN7o9I1DVNmBEoLPX9MLWfXVU1TVZs5ADFk7L50BjK+/aMpbGdCvYPoJ/isgS4FDHzmD6CIy/WQiwieYcMGdUNoPTE1lUWs6Fk23CXGOOJZg+Ai/wWuBY6yM4TsXeKvIHJjEsM9npKK4T4xEunZrHB1sqqTzY1PMTjHGpHs8IAovRICKpgft1oQ4VLdrblQ/LqvnShEFOR3GthUV5PPKBl7+sruDmLxQ6HceYsBTMUpUni8gqYD2wXkRWisjE0EeLfBt2H6C2ocWuH3DQmEFpTM4fYMtYGtONYJqGHgXuVNURqjoCuAt4LLSxokNJoH9gdqH1DzhpwdQ8Nuw+wMbdwcyMYoz7BFMIUlR1accdVX0fSAlZoihS7K2iMCeFwQMSnY7iahefMpRYj7DYJqIz5qiCKQRlIvJ9ERkZ2O4lCucD6mstbe18tK3aho2GgazUBOaOz+WV1btobTueJTWMcYdgCsHXgBxgMbAIyA7sM91YW17LoeY2GzYaJhYW5VF5sIm/2zKWxhzhmKOGROQZVb0WuE5V/7UfM0WFksD8QrMK7YwgHMwdn0tGchyLSyuYOy6qV0Q15rh1d0YwTUSGAl8TkYEiktl566+AkarY6+OkIelkpthF2eEgITaGiycP5a/r93CgscXpOMaEle4KwSP4J5sbD6zssq0IfbTI1djSxopP91v/QJhZOC2fptZ23li72+koxoSVYxYCVf1vVT0JeEJVC1W1oNNmV+Z0o3THfppb260QhJlT8gdQmGPLWBrTVY+dxap6q4jEiMhQERnesfVHuEhV4vUR4xFmFFgLWjgRERYW5bN8+34+9R3q+QnGuEQwVxbfBuwF3gFeD2yvhThXRCv2+piUN4C0xDino5guLp2ahwh2pbExnQQzfPQOYJyqTlTVSYFtcqiDRapDTa2s2VnDbGsWCkt5GUnMLsxi8SpbxtKYDsGuWVwb6iDRYvn2alrb1foHwtjConx2Vjew4tP9TkcxJiwEdWUx/jWL7xaROzu2UAeLVCVeH3ExwnRbJzdsnX/yYJLjY1i00jqNjYHgCsEO/P0DtmZxEIq9PqYOH0hSvC2NGK5SEmI5/+TBvL52N40tbU7HMcZxQa9HYHpWW9/Cul213P7FMU5HMT1YWJTP4tIK/rphL/NPGep0HGMc1d0UE68Cx+xNU9X5IUkUwT7c5kPVlqWMBLMLsxg6IJHFpeVWCIzrdXdG8GC/pYgSJV4fiXEepgzLcDqK6YEnsIzlIx942Xegkdx0myrcuNcxC4GqftCfQaJBsbeKU0dmEh8bTNeLcdqConx+876Xv6zexdfPsIvljXvZN1YfqTzYxJa9ddYsFEFG56ZyyrAMFpXaNQXG3awQ9JGSMv+ylHb9QGRZWJTHpj0H2WDLWBoXs0LQR0q8VaQlxjJxaLrTUcxxuHjyUOJixKacMK5mo4b6SInXx8yCTGJjrLZGkoEp8Zw9Ppe/rK7gu/PGE2d/f8aFbNRQH6ioaWC7r55rZ490OorphYVF+by9fi9//6SSs8cPcjqOMf3ORg31gRKv9Q9EsrPG5TIwOY5FKyusEBhXCmYa6jEi8pKIbBCRso6tP
8JFimJvFZkp8YwbZDNvRKL4WA+XTMnjnY17qa23ZSyN+wTTIPok8FugFZgL/AF4NpShIomqUuL1MbswC49HnI5jemlBUR7Nre289vEup6MY0++CKQRJqvouIKr6qar+ELgwmBcXkfNFZLOIbBWR73Zz3EIRURGZHlzs8LHdV8/u2kZbfyDCTcobwJjcVBs9ZFwpmELQJCIe4BMRuU1ELgNSe3qSiMQADwPzgAnAVSIy4SjHpQG3A8uOK3mYKPZWAdY/EOlEhAVF+az8dD/bqmwZS+MuwRSC24Fk4F+BacC1wPVBPG8GsFVVy1S1GXgeuOQox/0Y+DnQGFTiMFPs9TE4PZGC7BSno5gTdFlgGcuXbXF74zLBLF6/XFXrVLVcVW9U1QWq+mEQr52Hf3WzDuWBfZ8RkSJgmKq+3t0LicgtIrJCRFZUVlYG8db9o71d+dDrY86oLESsfyDSDR6QyOmjs1lUWkF7u005YdwjmFFDS0Xkva7bib5xoLnpIeCuno5V1UdVdbqqTs/JyTnRt+4zW/YdxHeo2foHosiCojwqahr4aHu101GM6Tc9LkwD/Hun24nAQvwjiHpSAQzrdD8/sK9DGnAy/mUwAQYDS0RkvqquCOL1HVe81X/9gBWC6HHexMGkxK9jcWk5swrt79W4QzBNQys7bf9U1TuBs4J47eXAGBEpEJF44EpgSafXrVXVbFUdqaojgQ+BiCkC4J9obnhmMvkDk52OYvpIcnws8yYN4Y2P99DQbMtYGncIpmkos9OWLSLnAQN6ep6qtgK3AW8DG4EXVXW9iNwvIhE/T1Fbu/Jhmc9GC0WhhUX51DW18tcNe5yOYky/CKZpaCX+yecEf5PQNuCmYF5cVd8A3uiy775jHHtWMK8ZLtbvquVgY6s1C0WhmQWZ5GUk8dLKci6ZktfzE4yJcMEUgpNU9bChnSKSEKI8EaPYa/0D0crjERYU5fHw0q3sqW1k8ABbxtJEt2CuIyg+yr6Svg4SaYq9PsbkppKbZl8S0eiyqXm0K7yy2q40NtHvmIVARAaLyDQgSUSmikhRYDsL/wVmrtXc2s7ybdXWPxDFCnNSKRqewaKVtoyliX7dNQ2dB9yAf9jnQ532HwTuCWGmsLemvIaGljZm2/rEUW1BUT73vrKOdRUHmJTf4/gIYyLWMc8IVPVpVZ0L3KCqcztt81V1cT9mDDvFW32IwKzCTKejmBC6aPIQ4mM8LLIpJ0yUC6az+GQRmdh1p6reH4I8EaHYW8XEoelkJMc7HcWEUEZyPOdMyGXJml1878KTbBlLE7WC+ZddBxwKbG34ZxMdGcJMYa2huY1VO2qYY81CrrBgaj7Vh5p5f3P4zHFlTF/r8YxAVX/Z+b6IPIj/IjFXWvnpfprb2m3YqEucOS6HrJR4FpeWc+4EW8bSRKfenOsm4+9AdqWSsipiPcKpI61/wA3iYjzMnzKUdzfuo6a+2ek4xoREMFNMfCwiawPbemAz8F+hjxaeir0+JucPIDUhmO4VEw0WFuXT3NbOq2t3Ox3FmJAI5tvsok63W4G9gXmEXOdgYwtry2u59cxRTkcx/Wji0HTGDUpj0cpyrp01wuk4xvS57i4oyxSRTPzXDXRsDUB6YL/rLN9eTVu72oVkLiMiLJyWx+qdNXgr65yOY0yf665pqApYDawIbCs7bREzVXRfKt7qIz7WQ9GIgU5HMf3skil5eAQW2zUFJgp1Vwj+G9gPvIV/jeJCVS0IbIX9ki7MFHt9TBs+kMS4GKejmH42KD2R08fk8LItY2miUHdXFt8BTAH+jH/B+lUi8p8iUtBf4cLJ/kPNbNh9wJqFXGxhUR67ahv5sMzndBRj+lS3o4bUbynwbeAR4EbgnP4IFm46/uefM9oKgVt9acJgUhNiWVRqM5Ka6NJdZ3GKiHxVRP6Cf3GZVGCaqj7Wb+nCSLHXR3J8DJPzM5yOYhySFB/DhZOG8Oa63dQ3u3LgnIlS3Z0R7MN/JlAC/BIoA
6aLyAIRWdAf4cJJsbeKGQWZNt+Myy0oyqO+uY231tkyliZ6dHcdwZ/xL1E5LrB1poBrZiDde6ARb+Uhrjh1mNNRjMNOHZnJsMwkFpdWsKDItRfYmyhzzEKgqjf0Y46wVhJYltImmjMej3DZ1Hz+571P2FXTwNCMJKcjGXPCrJ0jCCVeH+mJsZw0JN3pKCYMLCzKQ20ZSxNFrBAEobisilmFWcR4xOkoJgyMyEph+oiBtoyliRpWCHqws7qendUNdv2AOcyCony8lYdYW17rdBRjTljQhUBERovIsyKySERmhzJUOPmsf2C09Q+Yz104eQjxsbaMpYkO3V1HkNhl14+Bu4E7gN+GMlQ4KfZWkZ0az5jcVKejmDAyICmOcycMYsmaXTS3tjsdx5gT0t3w0VdF5BlV/UPgfgv+JSoV/5KVUU9VKfb6mD0qGxHrHzCHu7won9fX7mbp5n2cN3Gw03F6pba+BW9VHWWVh9gW+G9Z5SEGpsTx668WkZ2a4HRE0w+6KwTnA7eKyFvAfwD/DvwrkARc3Q/ZHOetPMS+g03WP2CO6gtjsslOTWDRyvKwLgTNre3sqK6nrLKOsqpDlFXWsa3K/4XvO/T5qmsxHmFEZjIjs1Mo9lZx7e8/4vmvz2JAcpyD6U1/6O46gjbg1yLyDPB94FbgXlX19lc4p5V4qwCsEJijio3xcOmUoTxdsp3qQ81kpsQ7lkVVqaxr+uwXfceX/raqQ+yorqet04yp2anxFGancu6EQRRkp1CYk0phTgrDM5M/u3L+f7dUcvPTK7j+yY949uaZtiJflDvm366IzAT+L9CM/4ygAfiJiFQAP1bVmv6J6Jxir4+8jCSGZyY7HcWEqQVF+Tz+j228umYX188ZGfL3a2hu8/+a/6wZJ/CFX3mIg02fz3+UEOuhIDuFk4akceGkIRTmpHz2pT8gqedf+GeMzeHXX53Krc+V8rWnlvP0jTNIirfp16NVd2X+d8AF+Cebe1JVTwOuFJEzgReA8/ohn2Pa25WSMh/nnDTI+gfMMU0Yms5JQ9JZXFreZ4WgvV2pqGkINN90NOf4b++qbTzs2KEDEinMSeWyojwKA1/0Bdkp5GUk4TnB616+NHEwv7piCrc/v4pvPLuSx66bRkKsFYNo1F0haMXfOZyC/6wAAFX9APggtLGct3HPAWrqW6xZyPRoYVEeD7y+ka37DjI6Ny3o59U2tBzWXl/2WaftIZo6jURKTYilMCeFGQWZnzXjFGb7v/BD/St9/ilDaWxu49uL1vKtP67i4auLbOLFKNRdIfgq8A38ReC6/okTPjquH5hthcD0YP6Uofz0zU0sKq3gO+ePP+yxlraOjtrDR+WUVdVRVXd4R+3wzGQKs1M4fXT251/4OSnkpCY4elb6lVOH0dDSxg+WrOeuF9fwqyum2FX2Uaa7zuItwF1d94vI6cBVqvovoQzmtBKvj4LsFIYMsEnFTPdy0xI5Y0w2L5dWMDwz+fMmnUp/R21rp47arJR4CnNS+OL4QYe12w/PTCY+Nnx/aV8/ZyT1zW38/K1NJMXF8NMFk0646cmEj6CGAojIVPxnCF8B9gDjgR4LgYicD/w/IAZ4XFV/1uXxO4Gb8TdDVQJfU9VPj+cDhEJrWzvLtlUzf8pQp6OYCPHl6cP45nOl3L34Y+JjPRRkpTBucBrzJg32N+PkpDAqOzWih2LeetYo6ptb+Z/3tpIUH8MPLp5g/WdRortRQ2OBq/AXgIP41yc4S1W3ici2nl5YRGKAh4FzgXJguYgsUdUNnQ5bBUxX1XoRuRX4T+CKXn+aPvJxRS11Ta3WP2CCNu/kwbz8zTlkpyYwNCMpaptO7jx3LPXNbfz+H9tIjo/h212awkxk6u6MYBOwHLhcVT/u8lgwUy7OALaqahmAiDwPXAJ8VkwUETkAABBNSURBVAgC6yF3+BC4JpjQoVYc6B+YVWiFwARHRJg6fKDTMUJORLj3wpOob27jN+97SUmI5V/mj
nY6ljlB3TVKLgC2AX8VkWdE5GIROZ7z2jxgZ6f75YF9x3IT8ObRHhCRW0RkhYisqKysPI4IvVPi9TF+cJpdXm/MUYgIP7n0ZC6bmscv3t7M7//RYwOBCXPHLASq+oqqXgmMxv8FfQtQLiJPAn26QouIXANMB35xjCyPqup0VZ2ek5PTl299hKbWNpZvr7bRQsZ0w+MRfnH5ZM6fOJgfv7aBP320w+lI5gT0OExBVQ+p6h9V9WL8ncQlwNogXrsC6LzIb35g32FE5Bzge8B8VW0KKnUIrdpRQ1Nruy1LaUwPYmM8/PdVUzlrXA73vPwxr6yyFdsi1XGNV1PV/YFf52cHcfhyYIyIFIhIPHAlsKTzAYHRSL/DXwT2HU+WUCn2+vAIzCjIdDqKMWEvPtbDI9dMY1ZBFnf9eQ1vrdvjdCTTCyEbuKyqrcBtwNvARuBFVV0vIveLyPzAYb/AP4XFn0VktYgsOcbL9ZsSbxWT8gYENR+LMQYS42J4/PrpnJI/gG/9qZT3N4fFbzpzHEJ6BYuqvqGqY1V1lKr+JLDvPlVdErh9jqoOUtUpgW1+968YWvXNrazeWcNsaxYy5rikJMTy5I0zGDsojW88s/KzK/NNZAjfSxkdsGL7flra1K4fMKYXBiTF8cxNMxmemcxNTy+ndMd+pyOZIFkh6KTY6yMuRpg+MvrHgxsTCpkp8Tx380xy0hK4/omPWFdR63QkEwQrBJ2UeKuYMiyD5HhbhMOY3spNT+S5m2eSlhDLdU98xCd7DzodyfTACkFAbUMLH1fUWv+AMX0gf2Ayz319FjEe4erHl7G96pDTkUw3rBAEfLStmna1ZSmN6SsF2Sk8d/NMWtraufrxZVTUNDgdyRyDFYKAYm8VCbEepg7PcDqKMVFj7KA0nrlpJgcaW7jm8WXsO9jY85NMv7NCEFDi9XHqyExbis+YPnZy3gCeuvFU9h5o5JrHl1F9qLnnJ5l+ZYUAqKprYtOegza/kDEhMm1EJo9fN53tvnque2IZBxpbnI5kOrFCAHxY5r/4xfoHjAmdOaOz+d0109i85yA3PrmcQ02tTkcyAVYI8F8/kJoQy6S8AU5HMSaqzR2fy39fOZVVO/bz9T+soLGlzelIBisEgL9/YGZBJrEx9sdhTKjNmzSEB798CiVlPr75XCnNre1OR3I913/z7a5tYFvVIesfMKYfLSjK54FLT+a9Tfu444VVtLZZMXCS6y+h7Zgcy9YfMKZ/XT1zBA3NbTzw+kYS49by4OWn4InStZ7DnesLQbHXx8DkOMYPTnM6ijGuc/MXCqlvbuOhd7aQFBfDA5eejIgVg/7m6kKgqpR4fcwqzLJfIsY45Ftnj6a+uY1HPvCSHB/DPRecZMWgn7m6EOyorqeipoH/c2ah01GMcS0R4Tvnj6OhuZXH/r6N5PhY/u3csU7HchVXF4LiQP+ATTRnjLNEhB9cPJH65jb+37ufkBwfwzfOHOV0LNdwfSHITUtgVE6K01GMcT2PR/jZwsk0tLTx0zc3kRQfw3WzRzodyxVcWwj8/QNVnD4629ojjQkTMR7hV1dMobGlnfv+sp6kuBi+PH2Y07GinmuvI/hkXx1Vdc02bNSYMBMX4+HXX53KF8Zk851Fa3l1zS6nI0U91xaC4q1VAHYhmTFhKDEuhkevnc70EZn82wur+duGvU5HimruLQReH8MykxiWmex0FGPMUSTFx/D7G6YzcWg633yulH98UuV0pKjlykLQ1q4s21bNnEJrFjImnKUlxvH012ZQmJPC1/+wguXbq52OFJVcWQg27j5AbUMLc0Zbs5Ax4S4jOZ5nbprJkIxEbnxyOWt21jgdKeq4shAUewP9A4VWCIyJBDlpCTx380wGpsRx3RMfsXH3AacjRRWXFgIfo3JSyE1PdDqKMSZIQwYk8cebZ5EUF8O1v1+Gt7LO6UhRw3WFoKWtnY+2VduwUWMi0LDMZJ69eSaqcPVjy9hZXe90pKjgukKwtryG+
uY2W5bSmAg1OjeVZ2+eSUNLG199/EP21DY6HSniua4QFG/1zy80y/oHjIlYJw1J5w9fm8H+Qy1c/fiHVNU1OR0pormvEHh9TBiSzsCUeKejGGNOwCnDMnjihlOpqGngmseXUVPf7HSkiOWqQtDY0sbKHfutWciYKDGjIJPHrptOWeUhrn9yOQcbW5yOFJFcVQhKP91Pc2u7XT9gTBT5wpgcHr66iHUVtdz01AoamtucjhRxXFUIir0+YjzCqSMznY5ijOlD504YxK+umMLyT6u55ZkVNLVaMTgeIS0EInK+iGwWka0i8t2jPJ4gIi8EHl8mIiNDmaekzMfk/AGkJcaF8m2MMQ6Yf8pQfr5wMn//pIrb/riKlrZ2pyNFjJAVAhGJAR4G5gETgKtEZEKXw24C9qvqaOBXwM9DlaeuqZU1O2usf8CYKPaV6cP40fyJvLNhL3e+uIa2dnU6UkQI5cI0M4CtqloGICLPA5cAGzodcwnww8Dtl4Bfi4io6rH/9jZvhrPOOnzfV74C3/wm1NfDBRcc+ZwbbmD57AtIq6vhlvtugp92OSO49Va44grYuROuvfbI5991F1x8sf+9v/GNIx+/91445xxYvRruuOPIx//jP2DOHCguhnvuOfLx//ovmDIF/vY3eOCBIx//3e9g3Dh49VX45S+PfPyZZ2DYMHjhBfjtb498/KWXIDsbnnrKv3X1xhuQnAy/+Q28+OKRj7//vv+/Dz4Ir712+GNJSfDmm/7bP/4xvPvu4Y9nZcGiRf7bd98NJSWHP56fD88+6799xx3+P8POxo6FRx/1377lFtiy5fDHp0zx//kBXHMNlJcf/vjs2fDTn/pvL1wIPt/hj3/xi/D97/tvz5sHDQ2HP37RRfDv/+6/3fXfHQT1b48bboCqKrj88iMft397ff5v73rgnJoG1r4cy91xv+FnCybj+d49/fpvTwGdNYv2n/wHCni+fDni86GdDmidO5fG734PFJIvvQgaGgH97Jim8+Zx6Fv/RrsqWRedB0CsR4jxBBbSOtF/e52EshDkATs73S8HZh7rGFVtFZFaIAs4bL5ZEbkFuAVgckJCr8Js3H2A+BgPqYmuXZTNGNfIy0iiPjWeW1eUU1Lm49ZlO5gQmKyu44t2X20s9//sPdpVuX35Tsbs2v/Z8xXYUVfO/ff/FVX4/spyRviqD/si31i/nZ/c+yaqyoOrKxh04PAfGaVNXv7ze/5C9dv1exnYcPj8SP98byv/0/RXAJ7a6iOx9fBrId59dyuP1fmL2/M7/NkKslMYFIKpcaS7H98n9MIilwPnq+rNgfvXAjNV9bZOx6wLHFMeuO8NHHPMicenT5+uK1as6FWmmvpmMpLt+gFj3EBVeeKf2yn9dD8iICII4Om4LSBI4D54OvYFjvtsH58f33Hf4zl8v/91pcv7dDzn6O8nfH68p+M1PEd/v47jioYPZHRuaq/+PERkpapOP9pjofx5XAF0Xmw0P7DvaMeUi0gsMADocu7ed6wIGOMeIsJNpxdw0+kFTkcJe6EcNbQcGCMiBSISD1wJLOlyzBL8TXoAlwPvdds/YIwxps+F7Iwg0OZ/G/A2EAM8oarrReR+YIWqLgF+DzwjIluBavzFwhhjTD8Kac+pqr4BvNFl332dbjcCXw5lBmOMMd1z1ZXFxhhjjmSFwBhjXM4KgTHGuJwVAmOMcTkrBMYY43Ihu7I4VESkEvi0l0/Ppsv0FRHMPkv4iZbPAfZZwtWJfJYRqppztAcirhCcCBFZcaxLrCONfZbwEy2fA+yzhKtQfRZrGjLGGJezQmCMMS7ntkLwqNMB+pB9lvATLZ8D7LOEq5B8Flf1ERhjjDmS284IjDHGdGGFwBhjXM4VhUBEnhCRfYEV0SKaiAwTkaUiskFE1ovI7U5n6g0RSRSRj0RkTeBz/MjpTCdKRGJEZJWIvNbz0eFLRLaLyMcislpEerccYBgQkQwReUlENonIRhGZ7XSm3hCRcYG/i
47tgIgcZYHqE3gPN/QRiMgZQB3wB1U92ek8J0JEhgBDVLVURNKAlcClqrrB4WjHRUQESFHVOhGJA/4B3K6qHzocrddE5E5gOpCuqhc5nae3RGQ7ML27JWMjgYg8DfxdVR8PLI6VrKo1Tuc6ESISg39lx5mq2tsLa4/gijMCVf1f/AvfRDxV3a2qpYHbB4GNQJ6zqY6f+tUF7sYFtoj9VSIi+cCFwONOZzEgIgOAM/AvfoWqNkd6EQj4IuDtyyIALikE0UpERgJTgWXOJumdQFPKamAf8I6qRuTnCPgv4NtAu9NB+oACfxWRlSJyi9NheqkAqASeDDTXPS4iKU6H6gNXAn/q6xe1QhChRCQVWATcoaoHnM7TG6rapqpTgHxghohEZLOdiFwE7FPVlU5n6SOnq2oRMA/4l0DTaqSJBYqA36rqVOAQ8F1nI52YQPPWfODPff3aVggiUKBNfRHwnKoudjrPiQqcsi8Fznc6Sy+dBswPtK0/D5wtIs86G6n3VLUi8N99wMvADGcT9Uo5UN7pLPMl/IUhks0DSlV1b1+/sBWCCBPoZP09sFFVH3I6T2+JSI6IZARuJwHnApucTdU7qnq3quar6kj8p+7vqeo1DsfqFRFJCQxCINCU8iUg4kbbqeoeYKeIjAvs+iIQUQMqjuIqQtAsBCFevD5ciMifgLOAbBEpB36gqr93NlWvnQZcC3wcaF8HuEdV33AwU28MAZ4OjILwAC+qakQPu4wSg4CX/b83iAX+qKpvORup174FPBdoUikDbnQ4T68FivK5wDdC8vpuGD5qjDHm2KxpyBhjXM4KgTHGuJwVAmOMcTkrBMYY43JWCIwxxuWsEBgDiEhbYGbHdSLyasc1DiF8vxtE5NehfA9jgmWFwBi/BlWdEpidthr4F6cDGdNfrBAYc6QSAjO6isgUEflQRNaKyMsiMjCw/30RmR64nR2YXqLjl/5iEXlLRD4Rkf/seFERuVFEtojIR/gvDOzY/+XAmcgaEfnffvycxgBWCIw5TOBK5y8CSwK7/gB8R1UnAx8DPwjiZaYAVwCTgCsCiwkNAX6EvwCcDkzodPx9wHmqegr+ScWM6VdWCIzxSwpM2bEH/zQL7wTmtM9Q1Q8CxzyNf477nryrqrWq2oh/fpsRwEzgfVWtVNVm4IVOx/8TeEpEvg7E9NHnMSZoVgiM8WsITIk9AhB67iNo5fP/fxK7PNbU6XYbPczppar/B7gXGAasFJGsYEMb0xesEBjTiarWA/8K3IV/Dvv9IvKFwMPXAh1nB9uBaYHblwfx0suAM0UkKzCN+Jc7HhCRUaq6TFXvw7+YyrAT/iDGHAdXzD5qzPFQ1VUishb/tL/XA4+ISDKHz2D5IPBiYAWv14N4zd0i8kP8HdE1wOpOD/9CRMbgPxN5F1jTV5/FmGDY7KPGGONy1jRkjDEuZ4XAGGNczgqBMca4nBUCY4xxOSsExhjjclYIjDHG5awQGGOMy/1/PRRbkhdFX+kAAAAASUVORK5CYII=\n", 1038 | "text/plain": [ 1039 | "
" 1040 | ] 1041 | }, 1042 | "metadata": { 1043 | "needs_background": "light", 1044 | "tags": [] 1045 | }, 1046 | "output_type": "display_data" 1047 | } 1048 | ], 1049 | "source": [ 1050 | "select.plot_delta()\n" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": { 1056 | "id": "E0X_P7NgmTOE" 1057 | }, 1058 | "source": [ 1059 | "Making the selection choosing to stop at Round 5:" 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "code", 1064 | "execution_count": null, 1065 | "metadata": { 1066 | "colab": { 1067 | "base_uri": "https://localhost:8080/" 1068 | }, 1069 | "id": "GgwHXp-PmTOF", 1070 | "outputId": "b1482802-a4a5-4b77-d09f-2b419bb35c45" 1071 | }, 1072 | "outputs": [ 1073 | { 1074 | "data": { 1075 | "text/plain": [ 1076 | "(20000, 5)" 1077 | ] 1078 | }, 1079 | "execution_count": 19, 1080 | "metadata": { 1081 | "tags": [] 1082 | }, 1083 | "output_type": "execute_result" 1084 | } 1085 | ], 1086 | "source": [ 1087 | "X_new = select.transform(X, rd=5)\n", 1088 | "\n", 1089 | "X_new.shape" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": { 1095 | "id": "JwdP1J8W2m8G" 1096 | }, 1097 | "source": [ 1098 | "\n", 1099 | "## 5\\. References\n", 1100 | "\n", 1101 | "[1] Eirola, E., Lendasse, A., & Karhunen, J. (2014, July). Variable selection for regression problems using Gaussian mixture models to estimate mutual information. In 2014 International Joint Conference on Neural Networks (IJCNN) (pp. 1606-1613). IEEE.\n", 1102 | "\n", 1103 | "[2] Lan, T., Erdogmus, D., Ozertem, U., & Huang, Y. (2006, July). Estimating mutual information using gaussian mixture model for feature ranking and selection. In The 2006 IEEE International Joint Conference on Neural Network Proceedings (pp. 5034-5039). IEEE.\n", 1104 | "\n", 1105 | "[3] Maia Polo, F., & Vicente, R. (2022). Effective sample size, dimensionality, and generalization in covariate shift adaptation. 
Neural Computing and Applications, 1-13.\n", 1106 | "\n", 1107 | "\n" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": null, 1113 | "metadata": { 1114 | "id": "GvWp_Cmd20WX" 1115 | }, 1116 | "outputs": [], 1117 | "source": [] 1118 | } 1119 | ], 1120 | "metadata": { 1121 | "colab": { 1122 | "collapsed_sections": [], 1123 | "include_colab_link": true, 1124 | "name": "InfoSelect.ipynb", 1125 | "provenance": [] 1126 | }, 1127 | "kernelspec": { 1128 | "display_name": "Python 3 (ipykernel)", 1129 | "language": "python", 1130 | "name": "python3" 1131 | }, 1132 | "language_info": { 1133 | "codemirror_mode": { 1134 | "name": "ipython", 1135 | "version": 3 1136 | }, 1137 | "file_extension": ".py", 1138 | "mimetype": "text/x-python", 1139 | "name": "python", 1140 | "nbconvert_exporter": "python", 1141 | "pygments_lexer": "ipython3", 1142 | "version": "3.7.13" 1143 | } 1144 | }, 1145 | "nbformat": 4, 1146 | "nbformat_minor": 1 1147 | } 1148 | --------------------------------------------------------------------------------