├── README.md
├── Residential-Building-Data-Set.xlsx
├── final_model.py
├── paper_final_experiments.py
├── requirements.txt
├── sonar.all-data.csv
└── synth_data_experiments.py

/README.md:
--------------------------------------------------------------------------------
1 | # Binary Mask Optimization Algorithm Repository
2 | 
3 | This repository provides the code for our paper "Binary Feature Mask Optimization for Feature Selection".
4 | It also includes the datasets used in the experiments.
5 | 
6 | ## Installation
7 | 
8 | 1. Clone the repository:
9 |     git clone https://github.com/mefe06/feature-selection.git
10 | 
11 | 2. Navigate to the repository directory:
12 |     cd feature-selection
13 | 
14 | 3. Install the required packages:
15 |     pip install -r requirements.txt
16 | 
17 | ## Results
18 | 
19 | 1. Run "python3 paper_final_experiments.py" to reproduce the real-life dataset results, or "python3 synth_data_experiments.py" to reproduce the synthetic data results.
20 | 
21 | ## Contributing
22 | 
23 | Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.
24 | 
25 | ## License
26 | 
27 | This project is licensed under the MIT License.
28 | 
--------------------------------------------------------------------------------
/Residential-Building-Data-Set.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mefe06/feature-selection/8fe3bd92b99423ad0d0656806d87d9838433b91e/Residential-Building-Data-Set.xlsx
--------------------------------------------------------------------------------
/final_model.py:
--------------------------------------------------------------------------------
1 | import lightgbm as lgb
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from scipy.optimize import minimize
5 | from sklearn.model_selection import RandomizedSearchCV
6 | from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score, mean_squared_error, mean_absolute_error, log_loss
7 | from sklearn.neural_network import MLPRegressor, MLPClassifier
8 | from sklearn.feature_selection import RFE
9 | from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
10 | from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
11 | from sklearn.neighbors import KNeighborsClassifier
12 | from sklearn.metrics import make_scorer, roc_auc_score
13 | import random
14 | from sklearn.svm import SVR, SVC
15 | from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
16 | import pandas as pd
17 | import time
18 | 
19 | class LGBM_w_Feature_Selector():
20 | 
21 |     def __init__(self, layer_sizes=None, boosting_type=None, num_leaves=None, max_depth=None, learning_rate=0.01, subsample=None, colsample_bytree=None, reg_alpha=None,
22 |                  reg_lambda=None, model="lgbm", problem_type="Classifier", objective="binary", param_grid=None, n_estimators=1000, random_state=42, target_name="Y",
23 |                  early_stopping_rounds=100, X_train=None, X_test=None, X_val_1=None, X_val_2=None, y_val_1=None, y_val_2=None, y_train=None, show_loss_plot=False,
24 |                  y_test=None, iterations=1000, slack=0.01):
25 | 
26 |         self.problem_type = problem_type
27 |         if model == "lgbm":
28 |             self.model_type = "lgbm"
29 |             if problem_type == "Classifier":
30 |                 self.model = lgb.LGBMClassifier(max_depth=max_depth, verbose=-1)
31 |                 self.initial_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
32 |                 self.criterion = log_loss
33 |                 self.cv_scoring = "neg_log_loss"
34 | 
35 |             else:
36 | 
37 |                 self.model = lgb.LGBMRegressor(max_depth=max_depth, verbose=-1)
38 |                 self.initial_model = lgb.LGBMRegressor(random_state=42, verbose=-1)
39 | 
40 |                 self.criterion = mean_squared_error
41 |                 self.cv_scoring = "neg_mean_squared_error"
42 |         else:
43 |             self.model_type = "mlp"
44 | 
45 |             if problem_type == "Classifier":
46 |                 self.model = MLPClassifier(hidden_layer_sizes=layer_sizes, activation='relu', random_state=42)
47 |                 self.initial_model = MLPClassifier(random_state=42)
48 |                 self.criterion = log_loss
49 |                 self.cv_scoring = "neg_log_loss"
50 | 
51 |             else:
52 |                 #self.model = MLPRegressor(hidden_layer_sizes=layer_sizes, warm_start=False, max_iter=500)
53 |                 self.model = MLPRegressor(hidden_layer_sizes=layer_sizes, activation='relu', random_state=42)
54 |                 self.initial_model = MLPRegressor(random_state=42)
55 |                 self.criterion = mean_squared_error
56 |                 self.cv_scoring = "neg_mean_squared_error"
57 |         self.params = param_grid
58 |         self.val_losses = []
59 |         self.selected_feature_nbs_gbmo = []
60 |         self.best_ft_nbs = []
61 |         self.early_stopping_rounds = early_stopping_rounds
62 |         self.initial_model = self.model
63 |         self.iterations = iterations
64 |         self.X_train = X_train
65 |         self.X_test = X_test
66 |         self.X_val_1 = X_val_1
67 |         self.X_val_2 = X_val_2
68 |         self.X_full_train = np.vstack((X_train, X_val_1))
69 |         self.slack = slack
70 |         self.y_train = y_train
71 |         self.y_val_1 = y_val_1
72 |         self.y_val_2 = y_val_2
73 |         self.y_test = y_test
74 |         self.y_full_train = np.vstack((y_train, y_val_1))
75 |         self.plot_loss = show_loss_plot
76 |         self.num_of_features = self.X_train.shape[1]
77 |         self.mask = np.ones(self.num_of_features)
78 |         self.target_name = target_name
79 |         #if problem_type == "Classifier":
80 |         #    self.criterion = log_loss
81 |         #else:
82 |         #    self.criterion = mean_squared_error
83 | 
84 |     def shuffle_column(self, arr, i):
85 |         # Copy the array to avoid modifying the original array
86 |         arr_copy = np.copy(arr)
87 | 
88 |         # Get the ith column
89 |         column = arr_copy[:, i]
90 | 
91 |         # Shuffle the values in the column
92 |         np.random.shuffle(column)
93 | 
94 |         # Update the ith column in the copied array
95 |         arr_copy[:, i] = column
96 | 
97 |         return arr_copy
98 |     def set_column_to_zero(self, input_array, i):
99 |         if i < 0 or i >= input_array.shape[1]:
100 |             raise ValueError("Invalid column index")
101 | 
102 |         result_array = input_array.copy()
103 |         result_array[:, i] = 0
104 |         return result_array
105 |     # def general_feature_importance_calc(self):
106 |     #     self.model.fit(self.X_train, self.y_train)
107 |     #     inital_error = mean_squared_error(self.model.predict(self.X_val), self.y_val)
108 | 
109 |     #     importance_dict = {}
110 |     #     for feat in range(self.X_train.shape[1]):
111 |     #         importance_dict[feat] = mean_squared_error(self.model.predict(self.shuffle_column(self.X_val, feat)), self.y_val) - inital_error
112 | 
113 |     #     sorted_items = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
114 | 
115 |     #     sorted_keys = [item[0] for item in sorted_items]
116 |     #     sorted_values = [item[1] for item in sorted_items]
117 |     #     return sorted_keys, sorted_values
118 | 
119 | 
120 | 
121 |     def search(self, mask, x_val, y_val, shuffle=False):
122 |         min_loss = np.inf
123 |         least_useful_feature_ind = None
124 |         ret_mask = mask.copy()
125 |         losses = []
126 |         for i in range(len(self.mask)):
127 |             temp_mask = mask.copy()
128 |             if mask[i] != 0:
129 |                 if shuffle:
130 |                     x = self.shuffle_column(x_val, i)
131 |                 else:
132 |                     x = self.set_column_to_zero(x_val, i)
133 |                 # try:
134 |                 #     temp_loss = self.criterion(self.best_model_opt_1.predict(x*temp_mask), y_val)
135 |                 # except:
136 |                 if self.problem_type == "Classifier":
137 |                     temp_loss = self.criterion(y_val, self.model.predict_proba(x*temp_mask)[:, 1])
138 |                 else:
139 |                     temp_loss = self.criterion(self.model.predict(x*temp_mask), y_val)
140 | 
141 |                 losses.append(temp_loss)
142 |                 if temp_loss < min_loss:
143 |                     min_loss = temp_loss
144 |                     least_useful_feature_ind = i
145 | 
146 |         ret_mask[least_useful_feature_ind] = 0
147 |         return min_loss, ret_mask, losses
148 | 
149 |     def main_search_1(self, run_cv=False, p=0.1, loss_tolerance=2, lamda=15):
150 |         ## get best hyperparameters on val
151 |         # if run_cv:
152 |         #     random_search = RandomizedSearchCV(estimator=self.model, param_distributions=self.params, n_iter=20, cv=5,
153 |         #                                        random_state=42, verbose=0, scoring=self.cv_scoring)
154 |         #     # Fit the model
155 |         #     random_search.fit(self.X_train, self.y_train)
156 | 
157 |         #     # Update the best model and best parameters
158 |         #     best_params_opt_1 = random_search.best_params_
159 |         #     self.best_model_opt_1 = random_search.best_estimator_
160 |         #     print("Best parameters for iteration {} are: {}".format(iter, best_params_opt_1))
161 |         #     self.model = self.best_model_opt_1
162 | 
163 |         self.model.fit(self.X_train, self.y_train)
164 |         cur_loss = 20
165 |         feat_nb = self.X_val_1.shape[1]
166 |         mask = np.ones(feat_nb)
167 |         #loss_tolerance = 2 #int(feat_nb/tol_param)
168 |         self.selected_feature_nbs_gbmo = []
169 |         self.val_losses = []
170 | 
171 |         update_tolerance = 3
172 |         prev_loss = np.inf
173 |         loss_patience, update_patience = 0, 0
174 |         cur_loss, cur_mask, losses = self.search(mask, self.X_val_1, self.y_val_1)
175 |         self.val_losses.append(cur_loss)
176 |         prev_mask = np.zeros(feat_nb)
177 |         while (loss_patience < loss_tolerance) and (np.sum(mask) > 2): #and (cur_loss<=self.val_losses[0]): #and (np.sum(mask)>2): #&(np.sum(mask)>feat_nb/lamda):
178 |             self.selected_feature_nbs_gbmo.append(np.sum(mask))
179 |             prev_loss = cur_loss
180 |             prev_mask = cur_mask
181 |             cur_loss, cur_mask, losses = self.search(mask, self.X_val_1, self.y_val_1)
182 |             self.val_losses.append(cur_loss)
183 | 
184 |             if (cur_loss f_number):
215 |             self.selected_feature_nbs_flbmo.append(np.sum(mask))
216 | 
217 |             cur_loss, cur_mask, losses = self.search(mask, self.X_val_1, self.y_val_1, shuffle=shuffle)
218 |             mask = cur_mask #print(mask)
219 | 
220 |         ### optimal mask
221 |         #print(mask)
222 |         return mask
223 | 
224 | 
225 |     # def CV_test(self, X, y):
226 |     #     '''
227 |     #     StratifiedKFold Cross-Validation with KNeighborsClassifier
228 |     #     '''
229 |     #     score = []
230 |     #     if self.problem_type == "Classifier":
231 |     #         cv = StratifiedKFold(5, random_state=42, shuffle=True)
232 |     #     else:
233 |     #         cv = KFold(5, random_state=42, shuffle=True)
234 |     #     #reg = MLPRegressor()
235 |     #     if self.problem_type == "Classifier":
236 |     #         #reg = SVC()
237 |     #         score.append(cross_val_score(self.model, X, y, cv=cv, scoring=make_scorer(roc_auc_score)).mean())
238 |     #     else:
239 |     #         #reg = SVR()
240 |     #         score.append(cross_val_score(self.model, X, y, cv=cv, scoring=make_scorer(mean_squared_error)).mean())
241 |     #     return np.mean(score)
242 | 
243 | 
244 |     def create_mask(self, selected_indices, n):
245 |         result_array = [0] * n  # Initialize the array with zeros
246 | 
247 |         for index in selected_indices:
248 |             if 0 <= index < n:
249 |                 result_array[index] = 1
250 | 
251 |         return result_array
252 | 
253 |     # def compare_fi(self, X, y, d, debug=False):
254 |     #     '''
255 |     #     Compare feature selection methods
256 |     #     '''
257 |     #     nn_lst = []
258 |     #     gb_list = []
259 |     #     acc_nn, acc_gg = [], []
260 | 
261 |     #     for f_nn, f_gb in zip(d['nn'], d['gb']):
262 |     #         nn_lst.append(f_nn)
263 |     #         gb_list.append(f_gb)
264 |     #         if debug:
265 |     #             print(nn_lst)
266 |     #             print(gb_list)
267 |     #         acc_nn.append(self.CV_test(X[:, nn_lst], y).mean())
268 |     #         acc_gg.append(self.CV_test(X[:, gb_list], y).mean())
269 |     #     return acc_nn, acc_gg
270 | 
271 |     # def compare_fi_f_nb(self, X, y, d, debug=False):
272 |     #     '''
273 |     #     Compare feature selection methods
274 |     #     '''
275 |     #     nn_lst = []
276 |     #     gb_list = []
277 |     #     acc_nn, acc_gg = [], []
278 | 
279 |     #     for f_nn, f_gb in zip(d['nn'], d['gb']):
280 |     #         nn_lst.append(f_nn)
281 |     #         gb_list.append(f_gb)
282 |     #         if debug:
283 |     #             print(nn_lst)
284 |     #             print(gb_list)
285 |     #         acc_nn.append(self.CV_test(X[:, nn_lst], y).mean())
286 |     #         acc_gg.append(self.CV_test(X[:, gb_list], y).mean())
287 |     #     return acc_nn, acc_gg
288 | 
289 |     def cv_for_other_fs_methods(self, f_numbers):
290 |         best_gbm = 1000000
291 |         best_mi = 1000000
292 |         if self.problem_type == "Classifier":
293 |             mi_importances = mutual_info_classif(self.X_full_train, self.y_full_train)
294 |         else:
295 |             mi_importances = mutual_info_regression(self.X_full_train, self.y_full_train)
296 |         temp = np.hstack((self.X_full_train, self.y_full_train)) #pd.concat([self.X_full_train, self.y_full_train], axis=1)
297 |         corr = np.corrcoef(temp, rowvar=False)
298 |         target_corr = corr[:-1, -1]
299 |         for f_number in f_numbers:
300 | 
301 |             mi_fi = mi_importances.argsort()[-f_number:][::-1]
302 |             gb_fi = np.argsort(np.abs(target_corr))[-f_number:]
303 |             mi_score = self.val_with_mask(np.expand_dims(self.create_mask(mi_fi, self.mask_len), axis=0))
304 |             gb_score = self.val_with_mask(np.expand_dims(self.create_mask(gb_fi, self.mask_len), axis=0))
305 |             if mi_score 0: # Avoiding log(0)
168 | # # target += np.log(real_features[:, i] + 1) # Logarithmic growth
169 | # # target = (target-np.min(target))/(np.max(target)-np.min(target))
170 | # #target+=np.random.normal(0, 0.3, n_samples)
171 | # #target+=np.random.normal(0, 0.05, n_samples)
172 | # #target = (target-np.min(target))/(np.max(target)-np.min(target))
173 | # target = np.expand_dims(target, 1)
174 | # X_train, X_temp_test, y_train, y_temp_test = train_test_split(data, target, test_size=0.55, random_state=42)
175 | 
176 | # # Split the temporary test set into true test set and temporary validation set
177 | # X_test, X_temp_val, y_test, y_temp_val = train_test_split(X_temp_test, y_temp_test, test_size=0.73, random_state=42)
178 | 
179 | # # Finally, split the temporary validation set into two validation sets (val1 and val2)
180 | # X_val1, X_val2, y_val1, y_val2 = train_test_split(X_temp_val, y_temp_val, test_size=0.33, random_state=42)
181 | 
182 | # network = LGBM_w_Feature_Selector(model="lgbm", problem_type="regression", param_grid=lgbm_param_grid, X_train=X_train, X_test=X_test, slack=0.0,
183 | #                                   X_val_1=X_val1, y_val_1=y_val1, X_val_2=X_val2, y_val_2=y_val2, y_train=y_train, y_test=y_test, iterations=10)
184 | # # gbmo_score, flbmo_score = network.test_on_synth_data(slacks=[0.001, 0.005], loss_ts=[1,2], f_numbers=[10, 20, 30])
185 | # # print(gbmo_score)
186 | # # print(flbmo_score)
187 | # _, all_score, flbmo_score, mo_score, gbm_score, rfe_score, mi_score = network.feature_extraction(5, seed=11, method="convergence", loss_tolerance=1, run_CV=True, include_RFE=True, f_numbers=[6, 10, 15], loss_ts=[1,2], slacks=[0.001, 0.005, 0.02, 0.05]) #slacks=[0.005, 0.01])
188 | 
--------------------------------------------------------------------------------
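
Note on the core loop: the `search`/`main_search_2` methods in final_model.py implement a greedy binary-mask search. Starting from an all-ones mask, each still-selected feature is tentatively zeroed out (or shuffled), the already-fitted model is re-evaluated on the masked validation inputs, and the feature whose removal degrades the validation loss the least is permanently dropped from the mask. The sketch below is not part of the repository; it is a minimal, self-contained illustration of that idea under simplifying assumptions, using scikit-learn's GradientBoostingRegressor as a stand-in for the LightGBM/MLP models above, and all names in it (greedy_mask_elimination, n_keep, etc.) are illustrative.

# Minimal sketch (not the repository's implementation) of greedy binary-mask
# feature elimination: zero one candidate feature at a time, score the fitted
# model on the masked validation data, and drop the least useful feature
# until only `n_keep` features remain selected.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def greedy_mask_elimination(model, X_val, y_val, n_keep):
    """Greedily zero out mask entries until only `n_keep` features remain."""
    mask = np.ones(X_val.shape[1])
    while mask.sum() > n_keep:
        best_loss, drop_idx = np.inf, None
        for i in np.flatnonzero(mask):
            trial_mask = mask.copy()
            trial_mask[i] = 0
            # Evaluate the fitted model on masked inputs (no retraining).
            loss = mean_squared_error(y_val, model.predict(X_val * trial_mask))
            if loss < best_loss:
                best_loss, drop_idx = loss, i
        mask[drop_idx] = 0  # removing this feature hurts validation loss the least
    return mask


if __name__ == "__main__":
    X, y = make_regression(n_samples=500, n_features=20, n_informative=5, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)
    model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
    mask = greedy_mask_elimination(model, X_val, y_val, n_keep=5)
    print("selected feature indices:", np.flatnonzero(mask))

The repository's code differs in the details (classifier losses, an optional shuffle instead of zeroing, patience-based stopping, and separate validation splits), but the masking mechanism is the same: the model is always scored on X * mask rather than on a re-trained, reduced feature set.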