├── .gitattributes ├── optimizers ├── __init__.py └── optimizers.py ├── wrappers ├── __init__.py ├── wrappers.py └── data_transformations.py ├── utils ├── __init__.py ├── utils.py ├── parse_params.py └── parse_config.py ├── .editorconfig ├── base ├── __init__.py ├── base_model.py ├── base_data_loader.py └── base_optimizer.py ├── data_loaders ├── __init__.py └── data_loaders.py ├── models ├── models.py └── __init__.py ├── requirements.txt ├── LICENSE ├── configs ├── config_classification.json ├── config_classification_bayes.json ├── config_regression.json └── config_unions.json ├── main.py ├── .gitignore └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | -------------------------------------------------------------------------------- /wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .data_transformations import * 5 | from .wrappers import * 6 | 7 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .utils import * 5 | from .parse_config import ConfigParser 6 | from .parse_params import * 7 | 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .base_optimizer import BaseOptimizer 5 | from .base_model import BaseModel 6 | from .base_data_loader import BaseDataLoader 7 | 8 | 9 | -------------------------------------------------------------------------------- /wrappers/wrappers.py: -------------------------------------------------------------------------------- 1 | from sklearn.cross_decomposition import PLSRegression 2 | from sklearn.svm import SVC 3 | 4 | 5 | class PLSRegressionWrapper(PLSRegression): 6 | def transform(self, X): 7 | return super().transform(X) 8 | 9 | def fit_transform(self, X, Y): 10 | return self.fit(X, Y).transform(X) 11 | -------------------------------------------------------------------------------- /data_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from dataclasses import dataclass 5 | @dataclass 6 | class DataHandler: 7 | X_data: 'DataHandler' = None 8 | X_data_test: 'DataHandler' = None 9 | y_data: 'DataHandler' = None 10 | y_data_test: 'DataHandler' = None 11 | data_handler = DataHandler() 12 | -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | from base import 
BaseModel 2 | from sklearn.pipeline import Pipeline 3 | 4 | 5 | class Model(BaseModel): 6 | def __init__(self, pipeline, unions): 7 | steps = self.create_steps(pipeline, unions) 8 | self.model = Pipeline(steps=steps) 9 | 10 | def created_model(self): 11 | return self.model 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | analytics-python==1.2.9 2 | bamboolib==1.26.0 3 | certifi==2021.5.30 4 | cryptography==2.9.2 5 | cycler==0.10.0 6 | ipyslickgrid==0.0.3 7 | joblib==1.0.1 8 | kiwisolver==1.3.2 9 | matplotlib==3.4.3 10 | numpy==1.21.2 11 | pandas==1.3.3 12 | Pillow==8.3.2 13 | plotly==4.14.3 14 | ppscore==1.2.0 15 | psutil==5.8.0 16 | pyaml==21.10.1 17 | PyYAML==6.0 18 | retrying==1.3.3 19 | scikit-learn==1.0 20 | scikit-optimize==0.9.0 21 | scipy==1.7.1 22 | seaborn==0.10.1 23 | threadpoolctl==2.2.0 24 | toml==0.10.2 25 | wincertstore==0.2 26 | xlrd==2.0.1 27 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from wrappers import * 5 | from sklearn.svm import SVC 6 | from sklearn.decomposition import PCA 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.preprocessing import PolynomialFeatures 9 | from sklearn.linear_model import Ridge 10 | from sklearn.dummy import DummyRegressor 11 | 12 | methods_dict = { 13 | 'ridge': Ridge, 14 | 'pf': PolynomialFeatures, 15 | 'scaler': StandardScaler, 16 | 'PLS':PLSRegressionWrapper, 17 | 'savgol':SavgolWrapper, 18 | 'SVC':SVC, 19 | 'PCA':PCA 20 | } 21 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | import os 4 | import json 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | 9 | def read_csv(data_path): 10 | with open(data_path, mode='r') as csv_file: 11 | csv_reader = csv.reader(csv_file) 12 | 13 | data_list = [] 14 | line_count = 0 15 | for row in csv_reader: 16 | line_count += 1 17 | data_list.append(row) 18 | print(f'Processed {line_count} lines.') 19 | return data_list 20 | 21 | def read_json(fname): 22 | fname = Path(fname) 23 | with fname.open('rt') as handle: 24 | return json.load(handle, object_hook=OrderedDict) 25 | 26 | def write_json(content, fname): 27 | fname = Path(fname) 28 | with fname.open('wt') as handle: 29 | json.dump(content, handle, indent=4, sort_keys=False) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /wrappers/data_transformations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import savgol_filter 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | # Example of savitzky golay filter implementation 7 | class SavgolWrapper(BaseEstimator, TransformerMixin): 8 | def __init__(self, win_length=7, polyorder=2, deriv=0): 9 | self.win_length = win_length 10 | self.polyorder = polyorder 11 | self.deriv = deriv 12 | 13 | def fit(self, X, y=None): 14 | return self 15 | 16 | def transform(self, X, y=None): 17 | signatures_sav = [] 18 | sp = [self.win_length, self.polyorder, self.deriv] 19 | for signal in X: 20 | if self.win_length != 0: 21 | 
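# smooth each signal with the configured window length, polynomial order and derivative; a win_length of 0 leaves the signal unfiltered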
signal = savgol_filter(signal, sp[0], sp[1], sp[2]) 22 | signatures_sav.append(signal) 23 | return np.array(signatures_sav) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 janezlapajne 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data_loaders/data_loaders.py: -------------------------------------------------------------------------------- 1 | from base import BaseDataLoader 2 | from data_loaders import data_handler 3 | from sklearn.datasets import load_iris, load_boston 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | class Classification(BaseDataLoader): 8 | def __init__(self, data_path, shuffle, test_split, random_state, stratify, training): 9 | '''set data_path in configs if data localy stored''' 10 | 11 | X, y = load_iris(return_X_y=True) 12 | data_handler.X_data = X 13 | data_handler.y_data = y 14 | 15 | super().__init__(data_handler, shuffle, test_split, random_state, stratify, training) 16 | 17 | 18 | 19 | class Regression(BaseDataLoader): 20 | def __init__(self, data_path, shuffle, test_split, random_state, stratify, training): 21 | '''set data_path in configs if data localy stored''' 22 | 23 | X, y = load_boston(return_X_y=True) 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, 25 | test_size=0.2, 26 | random_state=1, 27 | shuffle=True) 28 | 29 | data_handler.X_data = X_train 30 | data_handler.y_data = y_train 31 | data_handler.X_data_test = X_test 32 | data_handler.y_data_test = y_test 33 | 34 | super().__init__(data_handler, shuffle, test_split, random_state, stratify, training) 35 | -------------------------------------------------------------------------------- /configs/config_classification.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PLS", "pf", "SVC"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "SVC__kernel": ["rbf"], 15 | "SVC__gamma": [1e-5, 1e-6, 1], 16 | "SVC__C": [1, 100, 1000], 17 | "PLS__n_components": [1,2,3] 18 | }], 19 | 20 | "optimizer": "OptimizerClassification", 21 | 22 | "search_method":{ 23 | "type": "GridSearchCV", 24 | "args": { 25 | "refit": 
false, 26 | "n_jobs": -1, 27 | "verbose": 2, 28 | "error_score": 0 29 | } 30 | }, 31 | 32 | "cross_validation": { 33 | "type": "RepeatedStratifiedKFold", 34 | "args": { 35 | "n_splits": 5, 36 | "n_repeats": 10, 37 | "random_state": 1 38 | } 39 | }, 40 | 41 | "data_loader": { 42 | "type": "Classification", 43 | "args":{ 44 | "data_path": "data/path-to-file", 45 | "shuffle": true, 46 | "test_split": 0.2, 47 | "stratify": true, 48 | "random_state":1 49 | } 50 | }, 51 | 52 | "score": "max balanced_accuracy", 53 | "test_model": true, 54 | "debug": false, 55 | "save_dir": "saved/" 56 | 57 | } 58 | -------------------------------------------------------------------------------- /configs/config_classification_bayes.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PLS", "SVC"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "SVC__kernel": ["rbf"], 15 | "SVC__gamma": ["RS", 0.000001, 0.01], 16 | "SVC__C": ["RS", 1, 10000], 17 | "PLS__n_components": ["RSI", 1, 10] 18 | }], 19 | 20 | "optimizer": "OptimizerClassification", 21 | 22 | "search_method":{ 23 | "type": "BayesSearchCV", 24 | "args": { 25 | "refit": false, 26 | "n_jobs": -1, 27 | "verbose": 2, 28 | "error_score": 0, 29 | "n_iter": 10 30 | } 31 | }, 32 | 33 | "cross_validation": { 34 | "type": "RepeatedStratifiedKFold", 35 | "args": { 36 | "n_splits": 5, 37 | "n_repeats": 10, 38 | "random_state": 1 39 | } 40 | }, 41 | 42 | "data_loader": { 43 | "type": "Classification", 44 | "args":{ 45 | "data_path": "data/path-to-file", 46 | "shuffle": true, 47 | "test_split": 0.2, 48 | "stratify": true, 49 | "random_state":1 50 | } 51 | }, 52 | 53 | "score": "max balanced_accuracy", 54 | "test_model": true, 55 | "debug": false, 56 | "save_dir": "saved/" 57 | 58 | } 59 | -------------------------------------------------------------------------------- /configs/config_regression.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Regression", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PCA", "ridge"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "PCA__n_components": [5,10], 15 | "ridge__solver": ["svd", "cholesky", "lsqr", "sag"], 16 | "ridge__alpha": ["RS", 1e-5, 100], 17 | "ridge__fit_intercept": [true, false] 18 | }], 19 | 20 | "optimizer": "OptimizerRegression", 21 | 22 | "search_method":{ 23 | "type": "RandomizedSearchCV", 24 | "args": { 25 | "n_iter": 20, 26 | "refit": false, 27 | "n_jobs": -1, 28 | "verbose": 2, 29 | "error_score": 0 30 | } 31 | }, 32 | 33 | "cross_validation": { 34 | "type": "RepeatedKFold", 35 | "args": { 36 | "n_splits": 5, 37 | "n_repeats": 50, 38 | "random_state": 1 39 | } 40 | }, 41 | 42 | "data_loader": { 43 | "type": "Regression", 44 | "args":{ 45 | "data_path": "data/path-to-file", 46 | "shuffle": false, 47 | "test_split": 0.0, 48 | "stratify": false, 49 | "random_state":1 50 | } 51 | }, 52 | 53 | "score": "max neg_mean_absolute_error", 54 | "test_model": true, 55 | "debug": false, 56 | "save_dir": "saved/" 57 | 58 | } 59 | -------------------------------------------------------------------------------- /configs/config_unions.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", 
"pca-pls", "SVC"], 8 | "unions": { 9 | "pca-pls": ["PLS", "PCA"] 10 | } 11 | } 12 | }, 13 | 14 | "tuned_parameters":[{ 15 | "SVC__kernel": ["rbf"], 16 | "SVC__gamma": [1e-5, 1e-6, 1], 17 | "SVC__C": [1, 100, 1000], 18 | "pca-pls__PLS__n_components": [1,2,3], 19 | "pca-pls__PCA__n_components": [1,2,3] 20 | }], 21 | 22 | "optimizer": "OptimizerClassification", 23 | 24 | "search_method":{ 25 | "type": "GridSearchCV", 26 | "args": { 27 | "refit": false, 28 | "n_jobs": -1, 29 | "verbose": 2, 30 | "error_score": 0 31 | } 32 | }, 33 | 34 | "cross_validation": { 35 | "type": "RepeatedStratifiedKFold", 36 | "args": { 37 | "n_splits": 5, 38 | "n_repeats": 10, 39 | "random_state": 1 40 | } 41 | }, 42 | 43 | "data_loader": { 44 | "type": "Classification", 45 | "args":{ 46 | "data_path": "data/path-to-file", 47 | "shuffle": true, 48 | "test_split": 0.2, 49 | "stratify": true, 50 | "random_state":1 51 | } 52 | }, 53 | 54 | "score": "max balanced_accuracy", 55 | "test_model": true, 56 | "debug": false, 57 | "save_dir": "saved/" 58 | 59 | } 60 | -------------------------------------------------------------------------------- /base/base_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from models import methods_dict 3 | from sklearn.pipeline import FeatureUnion 4 | 5 | class BaseModel(): 6 | 7 | @abstractmethod 8 | def created_model(self): 9 | '''should return created model''' 10 | return NotImplementedError 11 | 12 | def create_steps(self, pipeline, unions): 13 | steps = list() 14 | for model_name in pipeline: 15 | # add features from pipeline 16 | if model_name in methods_dict.keys(): 17 | step = self._make_step(model_name) 18 | steps.append(step) 19 | 20 | # add combined features 21 | elif model_name in unions.keys(): 22 | steps_cf = list() 23 | for model_name_cf in unions[model_name]: 24 | if model_name_cf in methods_dict.keys(): 25 | step = self._make_step(model_name_cf) 26 | steps_cf.append(step) 27 | if steps_cf: 28 | steps.append([model_name, FeatureUnion(steps_cf)]) 29 | 30 | else: 31 | # if method not found 32 | steps.append([model_name, None]) 33 | return steps 34 | 35 | def change_step(self, model_name, model_instance, steps): 36 | for idx in range(len((steps))): 37 | if steps[idx][0] == model_name: 38 | steps[idx][1] = model_instance 39 | break 40 | return steps 41 | 42 | def _make_step(self, model_name): 43 | if isinstance(methods_dict[model_name], type): 44 | step = [model_name, methods_dict[model_name]()] 45 | else: 46 | # if already initialized 47 | step = [model_name, methods_dict[model_name]] 48 | return step 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /base/base_data_loader.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from sklearn.utils import shuffle as shuffle_data 3 | 4 | class BaseDataLoader(): 5 | def __init__(self, data_handler, shuffle, test_split, random_state, stratify, training): 6 | dh = data_handler 7 | 8 | if dh.X_data_test is dh.y_data_test is None: 9 | if 0 < test_split < 1: 10 | stratify = dh.y_data if stratify else None 11 | X_train, X_test, y_train, y_test = train_test_split(dh.X_data, 12 | dh.y_data, 13 | test_size=test_split, 14 | random_state=random_state, 15 | shuffle=shuffle, 16 | stratify=stratify) 17 | self.X_out, self.y_out = (X_train, y_train) if training else (X_test, y_test) 18 | print("Training and test sets created 
according to the defined test_split percentage.") 19 | else: 20 | self.X_out, self.y_out = dh.X_data, dh.y_data 21 | if shuffle: 22 | self.X_out, self.y_out = shuffle_data(self.X_out, self.y_out, random_state=random_state) 23 | print("Whole dataset is used for training.") 24 | 25 | elif dh.X_data_test is not None and dh.y_data_test is not None: 26 | self.X_out, self.y_out = (dh.X_data, dh.y_data) if training \ 27 | else (dh.X_data_test, dh.y_data_test) 28 | if shuffle: 29 | self.X_out, self.y_out = shuffle_data(self.X_out, self.y_out, random_state=random_state) 30 | print("Separate train and test datasets configured in data_handler will be used.") 31 | else: 32 | raise ValueError('data_handler not configured properly.') 33 | 34 | def get_data(self): 35 | print(f"Number of loaded data instances: {len(self.X_out)}") 36 | return self.X_out, self.y_out 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /utils/parse_params.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.utils.fixes import loguniform 3 | from scipy.stats import randint 4 | from sklearn.experimental import enable_halving_search_cv # noqa 5 | import sklearn.model_selection as model_selection_ 6 | import skopt as skopt_ 7 | from skopt.space import Real, Categorical, Integer 8 | 9 | def modify_params(search_method_params, config): 10 | tuned_parameters = config["tuned_parameters"] 11 | search_type = config["search_method"]["type"] 12 | 13 | assert search_type == "GridSearchCV" or \ 14 | search_type == "RandomizedSearchCV" or \ 15 | search_type == "HalvingGridSearchCV" or \ 16 | search_type == "HalvingRandomSearchCV" or \ 17 | search_type == "BayesSearchCV", \ 18 | f"Search type {search_type} not supported."
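# Translate the config's "tuned_parameters" into the format each search backend expects: grid searches take the lists verbatim (param_grid), BayesSearchCV gets skopt Real/Integer/Categorical dimensions built from the ["RS", low, high] / ["RSI", low, high] markers, and the randomized/halving searches get scipy loguniform/randint distributions (param_distributions).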
19 | 20 | if "Grid" in search_type: 21 | search_method_params['param_grid'] = tuned_parameters 22 | 23 | elif search_type == "BayesSearchCV": 24 | for method_name in tuned_parameters[0]: 25 | temp = tuned_parameters[0][method_name] 26 | if len(temp) == 3 and temp[0] == 'RS': 27 | tuned_parameters[0][method_name] = Real(temp[1], temp[2], prior="log-uniform") 28 | elif len(temp) == 3 and temp[0] == 'RSI': 29 | tuned_parameters[0][method_name] = Integer(temp[1], temp[2], prior="uniform") 30 | else: 31 | tuned_parameters[0][method_name] = Categorical(temp) 32 | search_method_params['search_spaces'] = tuned_parameters 33 | 34 | else: 35 | for method_name in tuned_parameters[0]: 36 | temp = tuned_parameters[0][method_name] 37 | if len(temp) == 3 and temp[0] == 'RS': 38 | tuned_parameters[0][method_name] = loguniform(temp[1], temp[2]) 39 | elif len(temp) == 3 and temp[0] == 'RSI': 40 | tuned_parameters[0][method_name] = randint(temp[1], temp[2]) 41 | search_method_params['param_distributions'] = tuned_parameters 42 | 43 | return search_method_params, search_type 44 | 45 | 46 | def get_lib(search_type): 47 | if search_type == "GridSearchCV" or \ 48 | search_type == "RandomizedSearchCV" or \ 49 | search_type == "HalvingGridSearchCV" or \ 50 | search_type == "HalvingRandomSearchCV": 51 | return model_selection_ 52 | elif search_type == "BayesSearchCV": 53 | return skopt_ 54 | 55 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | import collections 5 | from utils import ConfigParser, read_json, modify_params, get_lib 6 | import data_loaders.data_loaders as data_loaders_ 7 | import models.models as models_ 8 | import optimizers.optimizers as optimizers_ 9 | import sklearn.model_selection as model_selection_ 10 | 11 | 12 | def main(config): 13 | 14 | data_loader = config.init_obj('data_loader', data_loaders_, **{'training':True}) 15 | model = config.init_obj('model', models_).created_model() 16 | cross_val = config.init_obj('cross_validation', model_selection_) 17 | mnt, scoring = config['score'].split() 18 | 19 | search_method_params = { 20 | 'estimator': model, 21 | 'scoring': scoring, 22 | 'cv': cross_val 23 | } 24 | search_method_params, search_type = modify_params(search_method_params, config) 25 | search_method = config.init_obj('search_method', get_lib(search_type), **search_method_params) 26 | 27 | Optimizer = config.import_module('optimizer', optimizers_) 28 | optim = Optimizer(model=model, 29 | data_loader=data_loader, 30 | search_method=search_method, 31 | scoring=scoring, 32 | mnt=mnt, 33 | config=config) 34 | 35 | optim.optimize() 36 | 37 | 38 | if config['test_model']: 39 | data_loader = config.init_obj('data_loader', data_loaders_, **{'training':False}) 40 | X_test, y_test = data_loader.get_data() 41 | model = optim.load_model() 42 | y_pred = model.predict(X_test) 43 | test_report = optim.create_test_report(y_test, y_pred) 44 | optim.save_report(test_report, 'report_test.txt') 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | args = argparse.ArgumentParser(description='Sklearn Template') 50 | args.add_argument('-c', '--config', default=None, type=str, 51 | help='config file path (default: None)') 52 | 53 | # custom cli options to modify configuration from default values given in json file. 
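# each 'target' is a semicolon-separated path of keys into the config dict (resolved by _set_by_path in utils/parse_config.py)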
54 | CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') 55 | options = [ 56 | CustomArgs(['-cv', '--cross_validation'], type=int, target='cross_validation;args;n_repeats'), 57 | ] 58 | config = ConfigParser.from_args(args, options) 59 | main(config) 60 | 61 | 62 | # configs_list = glob.glob(os.path.join("configs", "*.json")) 63 | # for cfg_fname in configs_list: 64 | # config = read_json(cfg_fname) 65 | # config = ConfigParser(config) 66 | # main(config) 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # images 132 | *.png 133 | *.tif 134 | *.jpg 135 | 136 | .vscode 137 | 138 | # random 139 | *.pkl 140 | config.json 141 | *.txt 142 | !requirements.txt 143 | saved 144 | -------------------------------------------------------------------------------- /base/base_optimizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from sklearn.utils import shuffle 4 | import numpy as np 5 | from abc import abstractmethod 6 | 7 | 8 | class BaseOptimizer(): 9 | def __init__(self, model, data_loader, search_method, config): 10 | self.X_train, self.y_train = data_loader.get_data() 11 | self.model = model 12 | self.search_method = search_method 13 | self.save_dir = config.save_dir 14 | self.debug = config.debug 15 | self.config = config 16 | 17 | def _perform_grid_search(self): 18 | # sorted(sklearn.metrics.SCORERS.keys()) -> get available metrics 19 | self.search_method.fit(self.X_train, self.y_train) 20 | return self.search_method 21 | 22 | def _save_model(self, model): 23 | save_path = os.path.join(self.save_dir, "model.pkl") 24 | with open(save_path,'wb') as f: 25 | pickle.dump(model, f) 26 | 27 | def load_model(self): 28 | load_path = os.path.join(self.save_dir, "model.pkl") 29 | with open(load_path, 'rb') as f: 30 | model = pickle.load(f) 31 | print(model) 32 | return model 33 | 34 | def save_report(self, report, name_txt): 35 | save_path = os.path.join(self.save_dir, name_txt) 36 | with open(save_path, "w") as text_file: 37 | text_file.write(report) 38 | 39 | def optimize(self): 40 | if self.debug: 41 | self._debug_true() 42 | else: 43 | self._debug_false() 44 | 45 | def _debug_false(self): 46 | gs = self._perform_grid_search() 47 | model = self.fitted_model(gs) 48 | train_report = self.create_train_report(gs) 49 | self._save_model(model) 50 | self.save_report(train_report, "report_train.txt") 51 | 52 | def _debug_true(self): 53 | x = self.X_train 54 | y = self.y_train 55 | # for each parameter take just the first element from param_grid 56 | if hasattr(self.search_method, "param_grid"): 57 | param_grid = self.search_method.param_grid[0].copy() 58 | for param in param_grid.keys(): 59 | param_grid[param] = param_grid[param][0] 60 | 61 | self.model.set_params(**param_grid) 62 | 63 | print("-----------------------------------------------------------------") 64 | print("Model architecture:") 65 | print("input: {}".format(x.shape)) 66 | for layer in self.model: 67 | if hasattr(layer, "fit_transform"): 68 | x = layer.fit_transform(x,y) 69 | elif hasattr(layer, "fit") and hasattr(layer, "predict"): 70 | layer.fit(x,y) 71 | x = layer.predict(x) 72 | else: 73 | x = np.array([]) 74 | print(f"Warning: {layer} layer dimensions wrong!") 75 | 76 | print("layer {}: {}".format(layer, x.shape)) 77 | print("-----------------------------------------------------------------") 78 | else: 79 | print("\n Error: Debug option only available for GridSearch") 80 | 
quit() 81 | 82 | def create_train_report(self, cor): 83 | '''Should return report from training''' 84 | return "Train report not configured." 85 | 86 | def create_test_report(self, y_test, y_pred): 87 | '''Should return report from testing''' 88 | return "Test report not configured." 89 | 90 | @abstractmethod 91 | def fitted_model(self, cor): 92 | '''Should return fitted model''' 93 | raise NotImplementedError 94 | 95 | 96 | -------------------------------------------------------------------------------- /optimizers/optimizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from base import BaseOptimizer 3 | from sklearn.metrics import classification_report, mean_absolute_error 4 | 5 | 6 | class OptimizerClassification(BaseOptimizer): 7 | def __init__(self, model, data_loader, search_method, scoring, mnt, config): 8 | self.scoring = scoring 9 | self.mnt = mnt 10 | super().__init__(model, data_loader, search_method, config) 11 | 12 | def fitted_model(self, cor): 13 | clf_results = cor.cv_results_ 14 | params = np.array(clf_results["params"]) 15 | means = clf_results["mean_test_score"] 16 | 17 | if self.mnt == 'min': 18 | sort_idx = np.argsort(means) 19 | if self.mnt == 'max': 20 | sort_idx = np.argsort(means)[::-1] 21 | 22 | params_sorted = params[sort_idx] 23 | self.model.set_params(**params_sorted[0]) # define the best model 24 | self.model.fit(self.X_train, self.y_train) 25 | 26 | return self.model 27 | 28 | def create_train_report(self, cor): 29 | print("Optimizing for: ", self.scoring) 30 | print("_________________") 31 | 32 | clf_results = cor.cv_results_ 33 | params = np.array(clf_results["params"]) 34 | means = clf_results["mean_test_score"] 35 | stds = clf_results["std_test_score"] 36 | 37 | if self.mnt == 'min': 38 | sort_idx = np.argsort(means) 39 | if self.mnt == 'max': 40 | sort_idx = np.argsort(means)[::-1] 41 | 42 | indexes = np.arange(len(means)) 43 | 44 | indexes_sorted = indexes[sort_idx] 45 | means_sorted = means[sort_idx] 46 | stds_sorted = stds[sort_idx] 47 | params_sorted = params[sort_idx] 48 | 49 | train_report = f"### Optimizing for {self.scoring} ###\n\n" 50 | for idx, mean, std, params_ in zip(indexes_sorted, means_sorted, stds_sorted, params_sorted): 51 | print("%d %0.3f (+/-%0.03f) for %r" 52 | % (idx, mean, std * 2, params_)) 53 | train_report += f"{mean:.3f} +/-{std*2:.3f} for {params_}\n" 54 | train_report += f"\n### Best model: ###\n\n {str(self.model)}" 55 | train_report += f"\n Number of samples used for training: {len(self.y_train)}" 56 | 57 | return train_report 58 | 59 | def create_test_report(self, y_test, y_pred): 60 | test_report = str(classification_report(y_test, y_pred)) 61 | print("\n Report on test data:") 62 | print(test_report) 63 | test_report += f"\n\n True Values:\n {y_test}" 64 | test_report += f"\n Pred Values:\n {y_pred}" 65 | return test_report 66 | 67 | 68 | 69 | class OptimizerRegression(BaseOptimizer): 70 | def __init__(self, model, data_loader, search_method, scoring, mnt, config): 71 | self.scoring = scoring 72 | self.mnt = mnt 73 | super().__init__(model, data_loader, search_method, config) 74 | 75 | def fitted_model(self, cor): 76 | clf_results = cor.cv_results_ 77 | params = np.array(clf_results["params"]) 78 | means = clf_results["mean_test_score"] 79 | 80 | if self.mnt == 'min': 81 | sort_idx = np.argsort(means) 82 | if self.mnt == 'max': 83 | sort_idx = np.argsort(means)[::-1] 84 | 85 | params_sorted = params[sort_idx] 86 | 
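# params are sorted by mean CV score (ascending for 'min', descending for 'max'), so index 0 holds the best candidate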
self.model.set_params(**params_sorted[0]) # define the best model 87 | self.model.fit(self.X_train, self.y_train) 88 | 89 | return self.model 90 | 91 | def create_train_report(self, cor): 92 | print("Optimizing for: ", self.scoring) 93 | print("_________________") 94 | 95 | clf_results = cor.cv_results_ 96 | params = np.array(clf_results["params"]) 97 | means = clf_results["mean_test_score"] 98 | stds = clf_results["std_test_score"] 99 | 100 | if self.mnt == 'min': 101 | sort_idx = np.argsort(means) 102 | if self.mnt == 'max': 103 | sort_idx = np.argsort(means)[::-1] 104 | 105 | indexes = np.arange(len(means)) 106 | 107 | indexes_sorted = indexes[sort_idx] 108 | means_sorted = means[sort_idx] 109 | stds_sorted = stds[sort_idx] 110 | params_sorted = params[sort_idx] 111 | 112 | train_report = f"### Optimizing for {self.scoring} ###\n\n" 113 | for idx, mean, std, params_ in zip(indexes_sorted, means_sorted, stds_sorted, params_sorted): 114 | print("%d %0.3f (+/-%0.03f) for %r" 115 | % (idx, mean, std * 2, params_)) 116 | train_report += f"{mean:.3f} +/-{std*2:.3f} for {params_}\n" 117 | train_report += f"\n### Best model: ###\n\n {str(self.model)}" 118 | train_report += f"\n Number of samples used for training: {len(self.y_train)}" 119 | 120 | return train_report 121 | 122 | def create_test_report(self, y_test, y_pred): 123 | mae = mean_absolute_error(y_test, y_pred) 124 | test_report = f"True Values:\n {y_test}" 125 | test_report += f"\n Pred Values:\n {y_pred}" 126 | test_report += f"\n MAE: \n {mae}" 127 | print(test_report) 128 | return test_report 129 | -------------------------------------------------------------------------------- /utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from pathlib import Path 4 | from functools import reduce, partial 5 | from operator import getitem 6 | from datetime import datetime 7 | from utils import read_json, write_json 8 | 9 | 10 | class ConfigParser: 11 | def __init__(self, config, modification=None, run_id=None): 12 | """ 13 | class to parse configuration json file. Handles hyperparameters for training, initializations of modules, checkpoint saving 14 | and logging module. 15 | :param config: Dict containing configurations, hyperparameters for training. contents of `config.json` file for example. 16 | :param modification: Dict keychain:value, specifying position values to be replaced from config dict. 17 | :param run_id: Unique Identifier for training processes. Used to save checkpoints and training log. Timestamp is being used as default 18 | """ 19 | # load config file and apply modification 20 | self._config = _update_config(config, modification) 21 | 22 | # set save_dir where trained model and log will be saved. 23 | save_dir = Path(self.config['save_dir']) 24 | 25 | exper_name = self.config['name'] 26 | if run_id is None: # use timestamp as default run-id 27 | run_id = datetime.now().strftime(r'%m%d_%H%M%S') 28 | self._save_dir = save_dir / exper_name / run_id 29 | 30 | # make directory for saving checkpoints and log. 31 | exist_ok = run_id == '' 32 | self.save_dir.mkdir(parents=True, exist_ok=exist_ok) 33 | 34 | # save updated config file to the checkpoint dir 35 | write_json(self.config, self.save_dir / 'config.json') 36 | 37 | self._debug = self.config['debug'] 38 | 39 | 40 | @classmethod 41 | def from_args(cls, args, options=''): 42 | """ 43 | Initialize this class from some cli arguments. Used in train, test. 
44 | """ 45 | for opt in options: 46 | args.add_argument(*opt.flags, default=None, type=opt.type) 47 | if not isinstance(args, tuple): 48 | args = args.parse_args() 49 | 50 | msg_no_cfg = "Configuration file need to be specified. Add '-c config.json', for example." 51 | assert args.config is not None, msg_no_cfg 52 | cfg_fname = Path(args.config) 53 | 54 | config = read_json(cfg_fname) 55 | 56 | # parse custom cli options into dictionary 57 | modification = {opt.target : getattr(args, _get_opt_name(opt.flags)) for opt in options} 58 | return cls(config, modification) 59 | 60 | def init_obj(self, name, module, *args, **kwargs): 61 | """ 62 | Finds a function handle with the name given as 'type' in config, and returns the 63 | instance initialized with corresponding arguments given. 64 | 65 | `object = config.init_obj('name', module, a, b=1)` 66 | is equivalent to 67 | `object = module.name(a, b=1)` 68 | """ 69 | module_name = self[name]['type'] # can do because of __getitem__ 70 | module_args = dict(self[name]['args']) 71 | assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed' 72 | module_args.update(kwargs) 73 | return getattr(module, module_name)(*args, **module_args) 74 | 75 | def init_ftn(self, name, module, *args, **kwargs): 76 | """ 77 | Finds a function handle with the name given as 'type' in config, and returns the 78 | function with given arguments fixed with functools.partial. 79 | 80 | `function = config.init_ftn('name', module, a, b=1)` 81 | is equivalent to 82 | `function = lambda *args, **kwargs: module.name(a, *args, b=1, **kwargs)`. 83 | """ 84 | module_name = self[name]['type'] 85 | module_args = dict(self[name]['args']) 86 | assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed' 87 | module_args.update(kwargs) 88 | return partial(getattr(module, module_name), *args, **module_args) 89 | 90 | def import_module(self, name, module): 91 | name = self[name] 92 | return getattr(module, name) 93 | 94 | def __getitem__(self, name): 95 | """Access items like ordinary dict.""" 96 | return self.config[name] 97 | 98 | # setting read-only attributes 99 | @property 100 | def config(self): 101 | return self._config 102 | 103 | @property 104 | def save_dir(self): 105 | return self._save_dir 106 | 107 | @property 108 | def debug(self): 109 | return self._debug 110 | 111 | # helper functions to update config dict with custom cli options 112 | def _update_config(config, modification): 113 | if modification is None: 114 | return config 115 | 116 | for k, v in modification.items(): 117 | if v is not None: 118 | _set_by_path(config, k, v) 119 | return config 120 | 121 | def _get_opt_name(flags): 122 | for flg in flags: 123 | if flg.startswith('--'): 124 | return flg.replace('--', '') 125 | return flags[0].replace('--', '') 126 | 127 | def _set_by_path(tree, keys, value): 128 | """Set a value in a nested object in tree by sequence of keys.""" 129 | keys = keys.split(';') 130 | _get_by_path(tree, keys[:-1])[keys[-1]] = value 131 | 132 | def _get_by_path(tree, keys): 133 | """Access a nested object in tree by sequence of keys.""" 134 | return reduce(getitem, keys, tree) 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scikit-learn-project-template 2 | 3 | 4 | ## About the project 5 | * Folder structure suitable for many machine learning projects. 
Especially for those with a small amount of available training data. 6 | * `.json` config file support for convenient parameter tuning. 7 | * Customizable command line options for more convenient parameter tuning. Grid search, random search and Bayesian search are supported. 8 | * Abstract base classes for faster development: 9 | * `BaseOptimizer` handles execution of the parameter search, saving and loading of models, and formation of test and train reports. 10 | * `BaseDataLoader` handles splitting of training and testing data. The split is performed according to the settings provided in the config file. 11 | * `BaseModel` handles construction of the consecutive steps defined in the config file. 12 | * Suitable for tuning machine learning models that follow the `scikit-learn` nomenclature. 13 | Open libraries tested so far: 14 | * scikit-learn 15 | * sktime 16 | * tsfresh 17 | 18 | ## Getting Started 19 | 20 | To get a local copy up and running, follow the steps below. 21 | ### Requirements 22 | * Python >= `3.7` 23 | * Packages included in the `requirements.txt` file 24 | * (Anaconda for easy installation) 25 | 26 | ### Install dependencies 27 | 28 | Create and activate a virtual environment: 29 | ```sh 30 | conda create -n yourenvname python=3.7 31 | conda activate yourenvname 32 | ``` 33 | 34 | Install packages: 35 | ```sh 36 | python -m pip install -r requirements.txt 37 | ``` 38 | 39 | ## Folder Structure 40 | ``` 41 | sklearn-project-template/ 42 | │ 43 | ├── main.py - main script to start training and (optionally) testing 44 | │ 45 | ├── base/ - abstract base classes 46 | │ ├── base_data_loader.py 47 | │ ├── base_model.py 48 | │ └── base_optimizer.py 49 | │ 50 | ├── configs/ - holds configuration for training and testing 51 | │ ├── config_classification.json 52 | │ └── config_regression.json 53 | │ 54 | ├── data/ - default directory for storing input data 55 | │ 56 | ├── data_loaders/ - anything about data loading goes here 57 | │ └── data_loaders.py 58 | │ 59 | ├── models/ - models 60 | │ ├── __init__.py - models defined by name 61 | │ └── models.py 62 | │ 63 | ├── optimizers/ - optimizers 64 | │ └── optimizers.py 65 | │ 66 | ├── saved/ - config, model and reports are saved here 67 | │ ├── Classification 68 | │ └── Regression 69 | │ 70 | ├── utils/ - utility functions 71 | │ ├── parse_config.py - class to handle config file and cli options 72 | │ ├── parse_params.py 73 | │ └── utils.py 74 | │ 75 | ├── wrappers/ - wrappers of modified sklearn models or self-defined transforms 76 | │ ├── data_transformations.py 77 | │ └── wrappers.py 78 | ``` 79 | 80 | ## Usage 81 | Models in this repo are trained on two well-known datasets: iris and boston. The first is used for the classification problem and the second for the regression problem. 82 | 83 | Run classification: 84 | ```sh 85 | python main.py -c configs/config_classification.json 86 | ``` 87 | Run regression: 88 | ```sh 89 | python main.py -c configs/config_regression.json 90 | ``` 91 | 92 | ### Config file format 93 | Config files are in `.json` format.
An example of such a config is shown below: 94 | ```javascript 95 | { 96 | "name": "Classification", // session name 97 | 98 | "model": { 99 | "type": "Model", // model name 100 | "args": { 101 | "pipeline": ["scaler", "PLS", "pf", "SVC"], // pipeline of methods 102 | "unions": { // unions of methods included in pipeline 103 | } 104 | } 105 | }, 106 | 107 | "tuned_parameters":[{ // hyperparameters to be tuned with search method 108 | "SVC__kernel": ["rbf"], 109 | "SVC__gamma": [1e-5, 1e-6, 1], 110 | "SVC__C": [1, 100, 1000], 111 | "PLS__n_components": [1,2,3] 112 | }], 113 | 114 | "optimizer": "OptimizerClassification", // name of optimizer 115 | 116 | "search_method":{ 117 | "type": "GridSearchCV", // method used to search through parameters 118 | "args": { 119 | "refit": false, 120 | "n_jobs": -1, 121 | "verbose": 2, 122 | "error_score": 0 123 | } 124 | }, 125 | 126 | "cross_validation": { 127 | "type": "RepeatedStratifiedKFold", // type of cross-validation used 128 | "args": { 129 | "n_splits": 5, 130 | "n_repeats": 10, 131 | "random_state": 1 132 | } 133 | }, 134 | 135 | "data_loader": { 136 | "type": "Classification", // name of dataloader class 137 | "args":{ 138 | "data_path": "data/path-to-file", // path to data 139 | "shuffle": true, // whether data is shuffled before optimization 140 | "test_split": 0.2, // use split method for model testing 141 | "stratify": true, // whether data is stratified before optimization 142 | "random_state":1 // random state for repeatable output 143 | } 144 | }, 145 | 146 | "score": "max balanced_accuracy", // mode and metric used for scoring 147 | "test_model": true, // whether the model is tested after training 148 | "debug": false, // debug model architecture 149 | "save_dir": "saved/" // directory of saved reports, models and configs 150 | } 151 | 152 | ``` 153 | 154 | Additional parameters can be added to the config file. See the `scikit-learn` documentation for a description of tuned parameters, search methods and cross-validation. Possible metrics for model evaluation can be found [here](https://scikit-learn.org/stable/modules/model_evaluation.html). 155 | 156 | ### Pipeline 157 | Methods added to the config pipeline must first be defined in the `models/__init__.py` file. For the previous config example the following must be added: 158 | 159 | ```python 160 | from wrappers import * 161 | from sklearn.svm import SVC 162 | from sklearn.preprocessing import StandardScaler 163 | from sklearn.preprocessing import PolynomialFeatures 164 | 165 | methods_dict = { 166 | 'pf': PolynomialFeatures, 167 | 'scaler': StandardScaler, 168 | 'PLS': PLSRegressionWrapper, 169 | 'SVC': SVC, 170 | } 171 | ``` 172 | The majority of algorithms implemented in the `scikit-learn` library can be directly imported and used. Some algorithms need a little modification before use; one such example is Partial least squares (PLS), whose modification is implemented in `wrappers/wrappers.py`. You can also implement your own method. An example wrapper for the Savitzky-Golay filter is shown in `wrappers/data_transformations.py`. The implementation must satisfy the standard method calls, e.g. fit(), transform(), etc. 173 | 174 | ### Unions 175 | 176 | A union concatenates the results of multiple transformer methods, which are applied in parallel to the input data. This is useful if you want to combine several feature mechanisms into a single transformer.
For example, if you want to merge the results from Principal component analysis (PCA) and Partial least squares (PLS), you can do the following: 177 | 178 | ```javascript 179 | "pipeline": ["scaler", "pca-pls", "SVC"], 180 | "unions": { 181 | "pca-pls": ["PLS", "PCA"] 182 | } 183 | ``` 184 | 185 | In the pipeline you must write a made-up name for the method (in this case `pca-pls`) and then use the same name as a key in the unions dictionary. The value of the corresponding key must be a list of methods (in this case consisting of "PCA" and "PLS"). Hyperparameters tuned with the chosen search method must be separated with a double underscore (following the `scikit-learn` nomenclature). If you want to tune the number of components of both methods, you can do the following: 186 | 187 | ```javascript 188 | "tuned_parameters":[{ 189 | "pca-pls__PLS__n_components": [1,2,3], 190 | "pca-pls__PCA__n_components": [1,2,3] 191 | }], 192 | ``` 193 | 194 | Please refer to `configs/config_unions.json` for a unions example. 195 | ### Debug 196 | 197 | To debug the model architecture, set the debug flag in the config file to `true`. It will print the model step by step, with the corresponding output produced at each step. Model debugging only works with the `GridSearchCV` search method. If several values are listed for a parameter, only the first one is used for evaluation. Debugging is useful when you want to get a sense of what happens at each separate step. 198 | 199 | ## Customization 200 | 201 | 202 | ### Custom CLI options 203 | 204 | Changing values in the config file is a clean, safe and easy way of tuning hyperparameters. However, sometimes 205 | it is better to have command line options if some values need to be changed too often or quickly. 206 | 207 | This template uses the configuration stored in the json file by default, but by registering custom options as follows 208 | you can change some of them using CLI flags. 209 | 210 | ```python 211 | # simple class-like object having 3 attributes, `flags`, `type`, `target`. 212 | CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') 213 | options = [ 214 | CustomArgs(['-cv', '--cross_validation'], type=int, target='cross_validation;args;n_repeats'), 215 | # options added here can be modified by command line flags. 216 | ] 217 | ``` 218 | The `target` argument should be a sequence of keys, which are used to access that option in the config dict. In this example, the `target` for the 219 | number of repeats in the cross-validation option is `('cross_validation', 'args', 'n_repeats')` because `config['cross_validation']['args']['n_repeats']` points to the number of repeats. 220 | 221 | 222 | ### Data Loader 223 | * **Writing your own data loader** 224 | 225 | 1. **Inherit ```BaseDataLoader```** 226 | 227 | `BaseDataLoader` handles: 228 | * Train/test procedure 229 | * Data shuffling 230 | 231 | * **Usage** 232 | 233 | Loaded data must be assigned to the data_handler (dh) in an appropriate manner. If dh.X_data_test and dh.y_data_test are not assigned in advance, a train/test split can be created by the base data loader. If `"test_split": 0.0` is set in the config file, the whole dataset is used for training. Another option is to assign both train and test sets as shown below; in this case the train data is used for optimization and the test data is used for evaluation of the model.
234 | 235 | ```python 236 | data_handler.X_data = X_train 237 | data_handler.y_data = y_train 238 | data_handler.X_data_test = X_test 239 | data_handler.y_data_test = y_test 240 | ``` 241 | * **Example** 242 | 243 | Please refer to `data_loaders/data_loaders.py` for a data loading example. 244 | 245 | ### Optimizer 246 | * **Writing your own optimizer** 247 | 248 | 1. **Inherit ```BaseOptimizer```** 249 | 250 | `BaseOptimizer` handles: 251 | * Optimization procedure 252 | * Model saving and loading 253 | * Report saving 254 | 255 | 256 | 2. **Implementing abstract methods** 257 | 258 | You need to implement `fitted_model()`, which must return a fitted model. 259 | Optionally, you can customize the format of the train/test reports with `create_train_report()` and `create_test_report()`. 260 | 261 | * **Example** 262 | 263 | Please refer to `optimizers/optimizers.py` for an optimizer example. 264 | 265 | ### Model 266 | * **Writing your own model** 267 | 268 | 1. **Inherit `BaseModel`** 269 | 270 | `BaseModel` handles: 271 | * Initialization of the pipeline defined in the config 272 | * Modification of steps 273 | 274 | 2. **Implementing abstract methods** 275 | 276 | You need to implement `created_model()`, which must return the created model. 277 | 278 | * **Usage** 279 | 280 | Initialization of the pipeline methods is performed with `create_steps()`. Steps can later be modified with `change_step()`. An example of how to change a step is shown below, where a Sequential feature selector is added to the pipeline. 281 | 282 | ```python 283 | def __init__(self, pipeline): 284 | steps = self.create_steps(pipeline) 285 | 286 | rf = RandomForestRegressor(random_state=1) 287 | clf = TransformedTargetRegressor(regressor=rf, 288 | func=np.log1p, 289 | inverse_func=np.expm1) 290 | sfs = SequentialFeatureSelector(clf, n_features_to_select=2, cv=3) 291 | 292 | steps = self.change_step('sfs', sfs, steps) 293 | 294 | self.model = Pipeline(steps=steps) 295 | 296 | ``` 297 | 298 | Beware that in this case 'sfs' needs to be added to the pipeline in the config file. Otherwise, no step in the pipeline is changed. 299 | 300 | * **Example** 301 | 302 | Please refer to `models/models.py` for a model example. 303 | 304 | ## Common Questions About Hyperparameter Optimization 305 | 306 | ### How to Choose Between Random and Grid Search? 307 | * Choose the method based on your needs. I recommend starting with grid search and trying a random search afterwards if you have the time. 308 | * Grid search is appropriate for small and quick searches of hyperparameter values that are known to perform well generally. 309 | * Random search is appropriate for discovering new hyperparameter values or new combinations of hyperparameters, often resulting in better performance, although it may take more time to complete. 310 | 311 | ### How to Speed-Up Hyperparameter Optimization? 312 | * Ensure that you set the “n_jobs” argument to the number of cores on your machine. 313 | * Evaluate on a smaller sample of your dataset. 314 | * Explore a smaller search space. 315 | * Use fewer repeats and/or folds for cross-validation. 316 | * Execute the search on a faster machine, such as AWS EC2. 317 | * Use an alternate model that is faster to evaluate. 318 | 319 | More on: [machinelearningmastery](https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/). 320 | 321 | ## Roadmap 322 | 323 | See [open issues](https://github.com/janezlapajne/sklearn-project-template/issues) to request a feature or report a bug.
324 | 325 | ## Contribution 326 | 327 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. 328 | 329 | How to start with contribution: 330 | 1. Fork the Project 331 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 332 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 333 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 334 | 5. Open a Pull Request 335 | 336 | Feel free to contribute any kind of function or enhancement. 337 | 338 | ## License 339 | This project is licensed under the MIT License. See LICENSE for more details. 340 | 341 | ## Acknowledgements 342 | This project is inspired by the project [pytorch-template](https://github.com/victoresque/pytorch-template) by [Victor Huang](https://github.com/victoresque). I would like to confess that some functions, architecture and some parts of readme were directly copied from this repo. But to be honest, what should I do - the project is absolutely amazing! 343 | 344 | Additionally, special thanks to the creator of Machine learning mastery, [Jason Brownlee, PhD](https://machinelearningmastery.com/about/) for insightful articles published! 345 | 346 | ## Consider supporting 347 | 348 | Do you feel generous today? I am still a student and would make a good use of some extra money :P 349 | 350 | 351 | 352 | [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/janezlapajne) 353 | 354 | 397 | 398 | --------------------------------------------------------------------------------