├── .gitattributes ├── optimizers ├── __init__.py └── optimizers.py ├── wrappers ├── __init__.py ├── wrappers.py └── data_transformations.py ├── utils ├── __init__.py ├── utils.py ├── parse_params.py └── parse_config.py ├── .editorconfig ├── base ├── __init__.py ├── base_model.py ├── base_data_loader.py └── base_optimizer.py ├── data_loaders ├── __init__.py └── data_loaders.py ├── models ├── models.py └── __init__.py ├── requirements.txt ├── LICENSE ├── configs ├── config_classification.json ├── config_classification_bayes.json ├── config_regression.json └── config_unions.json ├── main.py ├── .gitignore └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | -------------------------------------------------------------------------------- /wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .data_transformations import * 5 | from .wrappers import * 6 | 7 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .utils import * 5 | from .parse_config import ConfigParser 6 | from .parse_params import * 7 | 8 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | indent_style = space 7 | indent_size = 4 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from .base_optimizer import BaseOptimizer 5 | from .base_model import BaseModel 6 | from .base_data_loader import BaseDataLoader 7 | 8 | 9 | -------------------------------------------------------------------------------- /wrappers/wrappers.py: -------------------------------------------------------------------------------- 1 | from sklearn.cross_decomposition import PLSRegression 2 | from sklearn.svm import SVC 3 | 4 | 5 | class PLSRegressionWrapper(PLSRegression): 6 | def transform(self, X): 7 | return super().transform(X) 8 | 9 | def fit_transform(self, X, Y): 10 | return self.fit(X, Y).transform(X) 11 | -------------------------------------------------------------------------------- /data_loaders/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from dataclasses import dataclass 5 | @dataclass 6 | class DataHandler: 7 | X_data: 'DataHandler' = None 8 | X_data_test: 'DataHandler' = None 9 | y_data: 'DataHandler' = None 10 | y_data_test: 'DataHandler' = None 11 | data_handler = DataHandler() 12 | -------------------------------------------------------------------------------- /models/models.py: -------------------------------------------------------------------------------- 1 | from base import 
BaseModel 2 | from sklearn.pipeline import Pipeline 3 | 4 | 5 | class Model(BaseModel): 6 | def __init__(self, pipeline, unions): 7 | steps = self.create_steps(pipeline, unions) 8 | self.model = Pipeline(steps=steps) 9 | 10 | def created_model(self): 11 | return self.model 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | analytics-python==1.2.9 2 | bamboolib==1.26.0 3 | certifi==2021.5.30 4 | cryptography==2.9.2 5 | cycler==0.10.0 6 | ipyslickgrid==0.0.3 7 | joblib==1.0.1 8 | kiwisolver==1.3.2 9 | matplotlib==3.4.3 10 | numpy==1.21.2 11 | pandas==1.3.3 12 | Pillow==8.3.2 13 | plotly==4.14.3 14 | ppscore==1.2.0 15 | psutil==5.8.0 16 | pyaml==21.10.1 17 | PyYAML==6.0 18 | retrying==1.3.3 19 | scikit-learn==1.0 20 | scikit-optimize==0.9.0 21 | scipy==1.7.1 22 | seaborn==0.10.1 23 | threadpoolctl==2.2.0 24 | toml==0.10.2 25 | wincertstore==0.2 26 | xlrd==2.0.1 27 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | from wrappers import * 5 | from sklearn.svm import SVC 6 | from sklearn.decomposition import PCA 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.preprocessing import PolynomialFeatures 9 | from sklearn.linear_model import Ridge 10 | from sklearn.dummy import DummyRegressor 11 | 12 | methods_dict = { 13 | 'ridge': Ridge, 14 | 'pf': PolynomialFeatures, 15 | 'scaler': StandardScaler, 16 | 'PLS':PLSRegressionWrapper, 17 | 'savgol':SavgolWrapper, 18 | 'SVC':SVC, 19 | 'PCA':PCA 20 | } 21 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import csv 3 | import os 4 | import json 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | 9 | def read_csv(data_path): 10 | with open(data_path, mode='r') as csv_file: 11 | csv_reader = csv.reader(csv_file) 12 | 13 | data_list = [] 14 | line_count = 0 15 | for row in csv_reader: 16 | line_count += 1 17 | data_list.append(row) 18 | print(f'Processed {line_count} lines.') 19 | return data_list 20 | 21 | def read_json(fname): 22 | fname = Path(fname) 23 | with fname.open('rt') as handle: 24 | return json.load(handle, object_hook=OrderedDict) 25 | 26 | def write_json(content, fname): 27 | fname = Path(fname) 28 | with fname.open('wt') as handle: 29 | json.dump(content, handle, indent=4, sort_keys=False) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /wrappers/data_transformations.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.signal import savgol_filter 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | # Example of savitzky golay filter implementation 7 | class SavgolWrapper(BaseEstimator, TransformerMixin): 8 | def __init__(self, win_length=7, polyorder=2, deriv=0): 9 | self.win_length = win_length 10 | self.polyorder = polyorder 11 | self.deriv = deriv 12 | 13 | def fit(self, X, y=None): 14 | return self 15 | 16 | def transform(self, X, y=None): 17 | signatures_sav = [] 18 | sp = [self.win_length, self.polyorder, self.deriv] 19 | for signal in X: 20 | if self.win_length != 0: 21 | 
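# smooth each signal with the configured window length, polynomial order and derivative; a win_length of 0 leaves the signal unfiltered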
signal = savgol_filter(signal, sp[0], sp[1], sp[2]) 22 | signatures_sav.append(signal) 23 | return np.array(signatures_sav) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 janezlapajne 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /data_loaders/data_loaders.py: -------------------------------------------------------------------------------- 1 | from base import BaseDataLoader 2 | from data_loaders import data_handler 3 | from sklearn.datasets import load_iris, load_boston 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | class Classification(BaseDataLoader): 8 | def __init__(self, data_path, shuffle, test_split, random_state, stratify, training): 9 | '''set data_path in configs if data localy stored''' 10 | 11 | X, y = load_iris(return_X_y=True) 12 | data_handler.X_data = X 13 | data_handler.y_data = y 14 | 15 | super().__init__(data_handler, shuffle, test_split, random_state, stratify, training) 16 | 17 | 18 | 19 | class Regression(BaseDataLoader): 20 | def __init__(self, data_path, shuffle, test_split, random_state, stratify, training): 21 | '''set data_path in configs if data localy stored''' 22 | 23 | X, y = load_boston(return_X_y=True) 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, 25 | test_size=0.2, 26 | random_state=1, 27 | shuffle=True) 28 | 29 | data_handler.X_data = X_train 30 | data_handler.y_data = y_train 31 | data_handler.X_data_test = X_test 32 | data_handler.y_data_test = y_test 33 | 34 | super().__init__(data_handler, shuffle, test_split, random_state, stratify, training) 35 | -------------------------------------------------------------------------------- /configs/config_classification.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PLS", "pf", "SVC"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "SVC__kernel": ["rbf"], 15 | "SVC__gamma": [1e-5, 1e-6, 1], 16 | "SVC__C": [1, 100, 1000], 17 | "PLS__n_components": [1,2,3] 18 | }], 19 | 20 | "optimizer": "OptimizerClassification", 21 | 22 | "search_method":{ 23 | "type": "GridSearchCV", 24 | "args": { 25 | "refit": 
false, 26 | "n_jobs": -1, 27 | "verbose": 2, 28 | "error_score": 0 29 | } 30 | }, 31 | 32 | "cross_validation": { 33 | "type": "RepeatedStratifiedKFold", 34 | "args": { 35 | "n_splits": 5, 36 | "n_repeats": 10, 37 | "random_state": 1 38 | } 39 | }, 40 | 41 | "data_loader": { 42 | "type": "Classification", 43 | "args":{ 44 | "data_path": "data/path-to-file", 45 | "shuffle": true, 46 | "test_split": 0.2, 47 | "stratify": true, 48 | "random_state":1 49 | } 50 | }, 51 | 52 | "score": "max balanced_accuracy", 53 | "test_model": true, 54 | "debug": false, 55 | "save_dir": "saved/" 56 | 57 | } 58 | -------------------------------------------------------------------------------- /configs/config_classification_bayes.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PLS", "SVC"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "SVC__kernel": ["rbf"], 15 | "SVC__gamma": ["RS", 0.000001, 0.01], 16 | "SVC__C": ["RS", 1, 10000], 17 | "PLS__n_components": ["RSI", 1, 10] 18 | }], 19 | 20 | "optimizer": "OptimizerClassification", 21 | 22 | "search_method":{ 23 | "type": "BayesSearchCV", 24 | "args": { 25 | "refit": false, 26 | "n_jobs": -1, 27 | "verbose": 2, 28 | "error_score": 0, 29 | "n_iter": 10 30 | } 31 | }, 32 | 33 | "cross_validation": { 34 | "type": "RepeatedStratifiedKFold", 35 | "args": { 36 | "n_splits": 5, 37 | "n_repeats": 10, 38 | "random_state": 1 39 | } 40 | }, 41 | 42 | "data_loader": { 43 | "type": "Classification", 44 | "args":{ 45 | "data_path": "data/path-to-file", 46 | "shuffle": true, 47 | "test_split": 0.2, 48 | "stratify": true, 49 | "random_state":1 50 | } 51 | }, 52 | 53 | "score": "max balanced_accuracy", 54 | "test_model": true, 55 | "debug": false, 56 | "save_dir": "saved/" 57 | 58 | } 59 | -------------------------------------------------------------------------------- /configs/config_regression.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Regression", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", "PCA", "ridge"], 8 | "unions": { 9 | } 10 | } 11 | }, 12 | 13 | "tuned_parameters":[{ 14 | "PCA__n_components": [5,10], 15 | "ridge__solver": ["svd", "cholesky", "lsqr", "sag"], 16 | "ridge__alpha": ["RS", 1e-5, 100], 17 | "ridge__fit_intercept": [true, false] 18 | }], 19 | 20 | "optimizer": "OptimizerRegression", 21 | 22 | "search_method":{ 23 | "type": "RandomizedSearchCV", 24 | "args": { 25 | "n_iter": 20, 26 | "refit": false, 27 | "n_jobs": -1, 28 | "verbose": 2, 29 | "error_score": 0 30 | } 31 | }, 32 | 33 | "cross_validation": { 34 | "type": "RepeatedKFold", 35 | "args": { 36 | "n_splits": 5, 37 | "n_repeats": 50, 38 | "random_state": 1 39 | } 40 | }, 41 | 42 | "data_loader": { 43 | "type": "Regression", 44 | "args":{ 45 | "data_path": "data/path-to-file", 46 | "shuffle": false, 47 | "test_split": 0.0, 48 | "stratify": false, 49 | "random_state":1 50 | } 51 | }, 52 | 53 | "score": "max neg_mean_absolute_error", 54 | "test_model": true, 55 | "debug": false, 56 | "save_dir": "saved/" 57 | 58 | } 59 | -------------------------------------------------------------------------------- /configs/config_unions.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Classification", 3 | 4 | "model": { 5 | "type": "Model", 6 | "args": { 7 | "pipeline": ["scaler", 
"pca-pls", "SVC"], 8 | "unions": { 9 | "pca-pls": ["PLS", "PCA"] 10 | } 11 | } 12 | }, 13 | 14 | "tuned_parameters":[{ 15 | "SVC__kernel": ["rbf"], 16 | "SVC__gamma": [1e-5, 1e-6, 1], 17 | "SVC__C": [1, 100, 1000], 18 | "pca-pls__PLS__n_components": [1,2,3], 19 | "pca-pls__PCA__n_components": [1,2,3] 20 | }], 21 | 22 | "optimizer": "OptimizerClassification", 23 | 24 | "search_method":{ 25 | "type": "GridSearchCV", 26 | "args": { 27 | "refit": false, 28 | "n_jobs": -1, 29 | "verbose": 2, 30 | "error_score": 0 31 | } 32 | }, 33 | 34 | "cross_validation": { 35 | "type": "RepeatedStratifiedKFold", 36 | "args": { 37 | "n_splits": 5, 38 | "n_repeats": 10, 39 | "random_state": 1 40 | } 41 | }, 42 | 43 | "data_loader": { 44 | "type": "Classification", 45 | "args":{ 46 | "data_path": "data/path-to-file", 47 | "shuffle": true, 48 | "test_split": 0.2, 49 | "stratify": true, 50 | "random_state":1 51 | } 52 | }, 53 | 54 | "score": "max balanced_accuracy", 55 | "test_model": true, 56 | "debug": false, 57 | "save_dir": "saved/" 58 | 59 | } 60 | -------------------------------------------------------------------------------- /base/base_model.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from models import methods_dict 3 | from sklearn.pipeline import FeatureUnion 4 | 5 | class BaseModel(): 6 | 7 | @abstractmethod 8 | def created_model(self): 9 | '''should return created model''' 10 | return NotImplementedError 11 | 12 | def create_steps(self, pipeline, unions): 13 | steps = list() 14 | for model_name in pipeline: 15 | # add features from pipeline 16 | if model_name in methods_dict.keys(): 17 | step = self._make_step(model_name) 18 | steps.append(step) 19 | 20 | # add combined features 21 | elif model_name in unions.keys(): 22 | steps_cf = list() 23 | for model_name_cf in unions[model_name]: 24 | if model_name_cf in methods_dict.keys(): 25 | step = self._make_step(model_name_cf) 26 | steps_cf.append(step) 27 | if steps_cf: 28 | steps.append([model_name, FeatureUnion(steps_cf)]) 29 | 30 | else: 31 | # if method not found 32 | steps.append([model_name, None]) 33 | return steps 34 | 35 | def change_step(self, model_name, model_instance, steps): 36 | for idx in range(len((steps))): 37 | if steps[idx][0] == model_name: 38 | steps[idx][1] = model_instance 39 | break 40 | return steps 41 | 42 | def _make_step(self, model_name): 43 | if isinstance(methods_dict[model_name], type): 44 | step = [model_name, methods_dict[model_name]()] 45 | else: 46 | # if already initialized 47 | step = [model_name, methods_dict[model_name]] 48 | return step 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /base/base_data_loader.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from sklearn.utils import shuffle as shuffle_data 3 | 4 | class BaseDataLoader(): 5 | def __init__(self, data_handler, shuffle, test_split, random_state, stratify, training): 6 | dh = data_handler 7 | 8 | if dh.X_data_test is dh.y_data_test is None: 9 | if 0 < test_split < 1: 10 | stratify = dh.y_data if stratify else None 11 | X_train, X_test, y_train, y_test = train_test_split(dh.X_data, 12 | dh.y_data, 13 | test_size=test_split, 14 | random_state=random_state, 15 | shuffle=shuffle, 16 | stratify=stratify) 17 | self.X_out, self.y_out = (X_train, y_train) if training else (X_test, y_test) 18 | print("Training and test sets created 
according to the defined test_split percentage.") 19 | else: 20 | self.X_out, self.y_out = dh.X_data, dh.y_data 21 | if shuffle: 22 | self.X_out, self.y_out = shuffle_data(self.X_out, self.y_out, random_state=random_state) 23 | print("Whole dataset is used for training.") 24 | 25 | elif dh.X_data_test is not None and dh.y_data_test is not None: 26 | self.X_out, self.y_out = (dh.X_data, dh.y_data) if training \ 27 | else (dh.X_data_test, dh.y_data_test) 28 | if shuffle: 29 | self.X_out, self.y_out = shuffle_data(self.X_out, self.y_out, random_state=random_state) 30 | print("Separate train and test datasets configured in data_handler will be used.") 31 | else: 32 | raise ValueError('data_handler not configured properly.') 33 | 34 | def get_data(self): 35 | print(f"Number of loaded data instances: {len(self.X_out)}") 36 | return self.X_out, self.y_out 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /utils/parse_params.py: -------------------------------------------------------------------------------- 1 | 2 | from sklearn.utils.fixes import loguniform 3 | from scipy.stats import randint 4 | from sklearn.experimental import enable_halving_search_cv # noqa 5 | import sklearn.model_selection as model_selection_ 6 | import skopt as skopt_ 7 | from skopt.space import Real, Categorical, Integer 8 | 9 | def modify_params(search_method_params, config): 10 | tuned_parameters = config["tuned_parameters"] 11 | search_type = config["search_method"]["type"] 12 | 13 | assert search_type == "GridSearchCV" or \ 14 | search_type == "RandomizedSearchCV" or \ 15 | search_type == "HalvingGridSearchCV" or \ 16 | search_type == "HalvingRandomSearchCV" or \ 17 | search_type == "BayesSearchCV", \ 18 | f"Search type {search_type} not supported."
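# Translate the config's "tuned_parameters" into the format each search backend expects: grid searches take the lists verbatim (param_grid), BayesSearchCV gets skopt Real/Integer/Categorical dimensions built from the ["RS", low, high] / ["RSI", low, high] markers, and the randomized/halving searches get scipy loguniform/randint distributions (param_distributions).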
19 | 20 | if "Grid" in search_type: 21 | search_method_params['param_grid'] = tuned_parameters 22 | 23 | elif search_type == "BayesSearchCV": 24 | for method_name in tuned_parameters[0]: 25 | temp = tuned_parameters[0][method_name] 26 | if len(temp) == 3 and temp[0] == 'RS': 27 | tuned_parameters[0][method_name] = Real(temp[1], temp[2], prior="log-uniform") 28 | elif len(temp) == 3 and temp[0] == 'RSI': 29 | tuned_parameters[0][method_name] = Integer(temp[1], temp[2], prior="uniform") 30 | else: 31 | tuned_parameters[0][method_name] = Categorical(temp) 32 | search_method_params['search_spaces'] = tuned_parameters 33 | 34 | else: 35 | for method_name in tuned_parameters[0]: 36 | temp = tuned_parameters[0][method_name] 37 | if len(temp) == 3 and temp[0] == 'RS': 38 | tuned_parameters[0][method_name] = loguniform(temp[1], temp[2]) 39 | elif len(temp) == 3 and temp[0] == 'RSI': 40 | tuned_parameters[0][method_name] = randint(temp[1], temp[2]) 41 | search_method_params['param_distributions'] = tuned_parameters 42 | 43 | return search_method_params, search_type 44 | 45 | 46 | def get_lib(search_type): 47 | if search_type == "GridSearchCV" or \ 48 | search_type == "RandomizedSearchCV" or \ 49 | search_type == "HalvingGridSearchCV" or \ 50 | search_type == "HalvingRandomSearchCV": 51 | return model_selection_ 52 | elif search_type == "BayesSearchCV": 53 | return skopt_ 54 | 55 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | import collections 5 | from utils import ConfigParser, read_json, modify_params, get_lib 6 | import data_loaders.data_loaders as data_loaders_ 7 | import models.models as models_ 8 | import optimizers.optimizers as optimizers_ 9 | import sklearn.model_selection as model_selection_ 10 | 11 | 12 | def main(config): 13 | 14 | data_loader = config.init_obj('data_loader', data_loaders_, **{'training':True}) 15 | model = config.init_obj('model', models_).created_model() 16 | cross_val = config.init_obj('cross_validation', model_selection_) 17 | mnt, scoring = config['score'].split() 18 | 19 | search_method_params = { 20 | 'estimator': model, 21 | 'scoring': scoring, 22 | 'cv': cross_val 23 | } 24 | search_method_params, search_type = modify_params(search_method_params, config) 25 | search_method = config.init_obj('search_method', get_lib(search_type), **search_method_params) 26 | 27 | Optimizer = config.import_module('optimizer', optimizers_) 28 | optim = Optimizer(model=model, 29 | data_loader=data_loader, 30 | search_method=search_method, 31 | scoring=scoring, 32 | mnt=mnt, 33 | config=config) 34 | 35 | optim.optimize() 36 | 37 | 38 | if config['test_model']: 39 | data_loader = config.init_obj('data_loader', data_loaders_, **{'training':False}) 40 | X_test, y_test = data_loader.get_data() 41 | model = optim.load_model() 42 | y_pred = model.predict(X_test) 43 | test_report = optim.create_test_report(y_test, y_pred) 44 | optim.save_report(test_report, 'report_test.txt') 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | args = argparse.ArgumentParser(description='Sklearn Template') 50 | args.add_argument('-c', '--config', default=None, type=str, 51 | help='config file path (default: None)') 52 | 53 | # custom cli options to modify configuration from default values given in json file. 
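# each 'target' is a semicolon-separated path of keys into the config dict (resolved by _set_by_path in utils/parse_config.py)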
54 | CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') 55 | options = [ 56 | CustomArgs(['-cv', '--cross_validation'], type=int, target='cross_validation;args;n_repeats'), 57 | ] 58 | config = ConfigParser.from_args(args, options) 59 | main(config) 60 | 61 | 62 | # configs_list = glob.glob(os.path.join("configs", "*.json")) 63 | # for cfg_fname in configs_list: 64 | # config = read_json(cfg_fname) 65 | # config = ConfigParser(config) 66 | # main(config) 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # images 132 | *.png 133 | *.tif 134 | *.jpg 135 | 136 | .vscode 137 | 138 | # random 139 | *.pkl 140 | config.json 141 | *.txt 142 | !requirements.txt 143 | saved 144 | -------------------------------------------------------------------------------- /base/base_optimizer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from sklearn.utils import shuffle 4 | import numpy as np 5 | from abc import abstractmethod 6 | 7 | 8 | class BaseOptimizer(): 9 | def __init__(self, model, data_loader, search_method, config): 10 | self.X_train, self.y_train = data_loader.get_data() 11 | self.model = model 12 | self.search_method = search_method 13 | self.save_dir = config.save_dir 14 | self.debug = config.debug 15 | self.config = config 16 | 17 | def _perform_grid_search(self): 18 | # sorted(sklearn.metrics.SCORERS.keys()) -> get available metrics 19 | self.search_method.fit(self.X_train, self.y_train) 20 | return self.search_method 21 | 22 | def _save_model(self, model): 23 | save_path = os.path.join(self.save_dir, "model.pkl") 24 | with open(save_path,'wb') as f: 25 | pickle.dump(model, f) 26 | 27 | def load_model(self): 28 | load_path = os.path.join(self.save_dir, "model.pkl") 29 | with open(load_path, 'rb') as f: 30 | model = pickle.load(f) 31 | print(model) 32 | return model 33 | 34 | def save_report(self, report, name_txt): 35 | save_path = os.path.join(self.save_dir, name_txt) 36 | with open(save_path, "w") as text_file: 37 | text_file.write(report) 38 | 39 | def optimize(self): 40 | if self.debug: 41 | self._debug_true() 42 | else: 43 | self._debug_false() 44 | 45 | def _debug_false(self): 46 | gs = self._perform_grid_search() 47 | model = self.fitted_model(gs) 48 | train_report = self.create_train_report(gs) 49 | self._save_model(model) 50 | self.save_report(train_report, "report_train.txt") 51 | 52 | def _debug_true(self): 53 | x = self.X_train 54 | y = self.y_train 55 | # for each parameter take just the first element from param_grid 56 | if hasattr(self.search_method, "param_grid"): 57 | param_grid = self.search_method.param_grid[0].copy() 58 | for param in param_grid.keys(): 59 | param_grid[param] = param_grid[param][0] 60 | 61 | self.model.set_params(**param_grid) 62 | 63 | print("-----------------------------------------------------------------") 64 | print("Model architecture:") 65 | print("input: {}".format(x.shape)) 66 | for layer in self.model: 67 | if hasattr(layer, "fit_transform"): 68 | x = layer.fit_transform(x,y) 69 | elif hasattr(layer, "fit") and hasattr(layer, "predict"): 70 | layer.fit(x,y) 71 | x = layer.predict(x) 72 | else: 73 | x = np.array([]) 74 | print(f"Warning: {layer} layer dimensions wrong!") 75 | 76 | print("layer {}: {}".format(layer, x.shape)) 77 | print("-----------------------------------------------------------------") 78 | else: 79 | print("\n Error: Debug option only available for GridSearch") 80 | 
quit() 81 | 82 | def create_train_report(self, cor): 83 | '''Should return report from training''' 84 | return "Train report not configured." 85 | 86 | def create_test_report(self, y_test, y_pred): 87 | '''Should return report from testing''' 88 | return "Test report not configured." 89 | 90 | @abstractmethod 91 | def fitted_model(self, cor): 92 | '''Should return fitted model''' 93 | raise NotImplementedError 94 | 95 | 96 | -------------------------------------------------------------------------------- /optimizers/optimizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from base import BaseOptimizer 3 | from sklearn.metrics import classification_report, mean_absolute_error 4 | 5 | 6 | class OptimizerClassification(BaseOptimizer): 7 | def __init__(self, model, data_loader, search_method, scoring, mnt, config): 8 | self.scoring = scoring 9 | self.mnt = mnt 10 | super().__init__(model, data_loader, search_method, config) 11 | 12 | def fitted_model(self, cor): 13 | clf_results = cor.cv_results_ 14 | params = np.array(clf_results["params"]) 15 | means = clf_results["mean_test_score"] 16 | 17 | if self.mnt == 'min': 18 | sort_idx = np.argsort(means) 19 | if self.mnt == 'max': 20 | sort_idx = np.argsort(means)[::-1] 21 | 22 | params_sorted = params[sort_idx] 23 | self.model.set_params(**params_sorted[0]) # define the best model 24 | self.model.fit(self.X_train, self.y_train) 25 | 26 | return self.model 27 | 28 | def create_train_report(self, cor): 29 | print("Optimizing for: ", self.scoring) 30 | print("_________________") 31 | 32 | clf_results = cor.cv_results_ 33 | params = np.array(clf_results["params"]) 34 | means = clf_results["mean_test_score"] 35 | stds = clf_results["std_test_score"] 36 | 37 | if self.mnt == 'min': 38 | sort_idx = np.argsort(means) 39 | if self.mnt == 'max': 40 | sort_idx = np.argsort(means)[::-1] 41 | 42 | indexes = np.arange(len(means)) 43 | 44 | indexes_sorted = indexes[sort_idx] 45 | means_sorted = means[sort_idx] 46 | stds_sorted = stds[sort_idx] 47 | params_sorted = params[sort_idx] 48 | 49 | train_report = f"### Optimizing for {self.scoring} ###\n\n" 50 | for idx, mean, std, params_ in zip(indexes_sorted, means_sorted, stds_sorted, params_sorted): 51 | print("%d %0.3f (+/-%0.03f) for %r" 52 | % (idx, mean, std * 2, params_)) 53 | train_report += f"{mean:.3f} +/-{std*2:.3f} for {params_}\n" 54 | train_report += f"\n### Best model: ###\n\n {str(self.model)}" 55 | train_report += f"\n Number of samples used for training: {len(self.y_train)}" 56 | 57 | return train_report 58 | 59 | def create_test_report(self, y_test, y_pred): 60 | test_report = str(classification_report(y_test, y_pred)) 61 | print("\n Report on test data:") 62 | print(test_report) 63 | test_report += f"\n\n True Values:\n {y_test}" 64 | test_report += f"\n Pred Values:\n {y_pred}" 65 | return test_report 66 | 67 | 68 | 69 | class OptimizerRegression(BaseOptimizer): 70 | def __init__(self, model, data_loader, search_method, scoring, mnt, config): 71 | self.scoring = scoring 72 | self.mnt = mnt 73 | super().__init__(model, data_loader, search_method, config) 74 | 75 | def fitted_model(self, cor): 76 | clf_results = cor.cv_results_ 77 | params = np.array(clf_results["params"]) 78 | means = clf_results["mean_test_score"] 79 | 80 | if self.mnt == 'min': 81 | sort_idx = np.argsort(means) 82 | if self.mnt == 'max': 83 | sort_idx = np.argsort(means)[::-1] 84 | 85 | params_sorted = params[sort_idx] 86 | 
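# params are sorted by mean CV score (ascending for 'min', descending for 'max'), so index 0 holds the best candidate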
self.model.set_params(**params_sorted[0]) # define the best model 87 | self.model.fit(self.X_train, self.y_train) 88 | 89 | return self.model 90 | 91 | def create_train_report(self, cor): 92 | print("Optimizing for: ", self.scoring) 93 | print("_________________") 94 | 95 | clf_results = cor.cv_results_ 96 | params = np.array(clf_results["params"]) 97 | means = clf_results["mean_test_score"] 98 | stds = clf_results["std_test_score"] 99 | 100 | if self.mnt == 'min': 101 | sort_idx = np.argsort(means) 102 | if self.mnt == 'max': 103 | sort_idx = np.argsort(means)[::-1] 104 | 105 | indexes = np.arange(len(means)) 106 | 107 | indexes_sorted = indexes[sort_idx] 108 | means_sorted = means[sort_idx] 109 | stds_sorted = stds[sort_idx] 110 | params_sorted = params[sort_idx] 111 | 112 | train_report = f"### Optimizing for {self.scoring} ###\n\n" 113 | for idx, mean, std, params_ in zip(indexes_sorted, means_sorted, stds_sorted, params_sorted): 114 | print("%d %0.3f (+/-%0.03f) for %r" 115 | % (idx, mean, std * 2, params_)) 116 | train_report += f"{mean:.3f} +/-{std*2:.3f} for {params_}\n" 117 | train_report += f"\n### Best model: ###\n\n {str(self.model)}" 118 | train_report += f"\n Number of samples used for training: {len(self.y_train)}" 119 | 120 | return train_report 121 | 122 | def create_test_report(self, y_test, y_pred): 123 | mae = mean_absolute_error(y_test, y_pred) 124 | test_report = f"True Values:\n {y_test}" 125 | test_report += f"\n Pred Values:\n {y_pred}" 126 | test_report += f"\n MAE: \n {mae}" 127 | print(test_report) 128 | return test_report 129 | -------------------------------------------------------------------------------- /utils/parse_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from pathlib import Path 4 | from functools import reduce, partial 5 | from operator import getitem 6 | from datetime import datetime 7 | from utils import read_json, write_json 8 | 9 | 10 | class ConfigParser: 11 | def __init__(self, config, modification=None, run_id=None): 12 | """ 13 | class to parse configuration json file. Handles hyperparameters for training, initializations of modules, checkpoint saving 14 | and logging module. 15 | :param config: Dict containing configurations, hyperparameters for training. contents of `config.json` file for example. 16 | :param modification: Dict keychain:value, specifying position values to be replaced from config dict. 17 | :param run_id: Unique Identifier for training processes. Used to save checkpoints and training log. Timestamp is being used as default 18 | """ 19 | # load config file and apply modification 20 | self._config = _update_config(config, modification) 21 | 22 | # set save_dir where trained model and log will be saved. 23 | save_dir = Path(self.config['save_dir']) 24 | 25 | exper_name = self.config['name'] 26 | if run_id is None: # use timestamp as default run-id 27 | run_id = datetime.now().strftime(r'%m%d_%H%M%S') 28 | self._save_dir = save_dir / exper_name / run_id 29 | 30 | # make directory for saving checkpoints and log. 31 | exist_ok = run_id == '' 32 | self.save_dir.mkdir(parents=True, exist_ok=exist_ok) 33 | 34 | # save updated config file to the checkpoint dir 35 | write_json(self.config, self.save_dir / 'config.json') 36 | 37 | self._debug = self.config['debug'] 38 | 39 | 40 | @classmethod 41 | def from_args(cls, args, options=''): 42 | """ 43 | Initialize this class from some cli arguments. Used in train, test. 
44 | """ 45 | for opt in options: 46 | args.add_argument(*opt.flags, default=None, type=opt.type) 47 | if not isinstance(args, tuple): 48 | args = args.parse_args() 49 | 50 | msg_no_cfg = "Configuration file need to be specified. Add '-c config.json', for example." 51 | assert args.config is not None, msg_no_cfg 52 | cfg_fname = Path(args.config) 53 | 54 | config = read_json(cfg_fname) 55 | 56 | # parse custom cli options into dictionary 57 | modification = {opt.target : getattr(args, _get_opt_name(opt.flags)) for opt in options} 58 | return cls(config, modification) 59 | 60 | def init_obj(self, name, module, *args, **kwargs): 61 | """ 62 | Finds a function handle with the name given as 'type' in config, and returns the 63 | instance initialized with corresponding arguments given. 64 | 65 | `object = config.init_obj('name', module, a, b=1)` 66 | is equivalent to 67 | `object = module.name(a, b=1)` 68 | """ 69 | module_name = self[name]['type'] # can do because of __getitem__ 70 | module_args = dict(self[name]['args']) 71 | assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed' 72 | module_args.update(kwargs) 73 | return getattr(module, module_name)(*args, **module_args) 74 | 75 | def init_ftn(self, name, module, *args, **kwargs): 76 | """ 77 | Finds a function handle with the name given as 'type' in config, and returns the 78 | function with given arguments fixed with functools.partial. 79 | 80 | `function = config.init_ftn('name', module, a, b=1)` 81 | is equivalent to 82 | `function = lambda *args, **kwargs: module.name(a, *args, b=1, **kwargs)`. 83 | """ 84 | module_name = self[name]['type'] 85 | module_args = dict(self[name]['args']) 86 | assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed' 87 | module_args.update(kwargs) 88 | return partial(getattr(module, module_name), *args, **module_args) 89 | 90 | def import_module(self, name, module): 91 | name = self[name] 92 | return getattr(module, name) 93 | 94 | def __getitem__(self, name): 95 | """Access items like ordinary dict.""" 96 | return self.config[name] 97 | 98 | # setting read-only attributes 99 | @property 100 | def config(self): 101 | return self._config 102 | 103 | @property 104 | def save_dir(self): 105 | return self._save_dir 106 | 107 | @property 108 | def debug(self): 109 | return self._debug 110 | 111 | # helper functions to update config dict with custom cli options 112 | def _update_config(config, modification): 113 | if modification is None: 114 | return config 115 | 116 | for k, v in modification.items(): 117 | if v is not None: 118 | _set_by_path(config, k, v) 119 | return config 120 | 121 | def _get_opt_name(flags): 122 | for flg in flags: 123 | if flg.startswith('--'): 124 | return flg.replace('--', '') 125 | return flags[0].replace('--', '') 126 | 127 | def _set_by_path(tree, keys, value): 128 | """Set a value in a nested object in tree by sequence of keys.""" 129 | keys = keys.split(';') 130 | _get_by_path(tree, keys[:-1])[keys[-1]] = value 131 | 132 | def _get_by_path(tree, keys): 133 | """Access a nested object in tree by sequence of keys.""" 134 | return reduce(getitem, keys, tree) 135 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scikit-learn-project-template 2 | 3 | 4 | ## About the project 5 | * Folder structure suitable for many machine learning projects. 
Especially for those with a small amount of available training data. 6 | * `.json` config file support for convenient parameter tuning. 7 | * Customizable command line options for more convenient parameter tuning. Grid search, random search and Bayesian search are supported. 8 | * Abstract base classes for faster development: 9 | * `BaseOptimizer` handles execution of the parameter search, saving and loading of models, and formation of test and train reports. 10 | * `BaseDataLoader` handles splitting of training and testing data. The split is performed according to the settings provided in the config file. 11 | * `BaseModel` handles construction of the consecutive steps defined in the config file. 12 | * Suitable for tuning machine learning models that follow the `scikit-learn` nomenclature. 13 | Open libraries tested so far: 14 | * scikit-learn 15 | * sktime 16 | * tsfresh 17 | 18 | ## Getting Started 19 | 20 | To get a local copy up and running, follow the steps below. 21 | ### Requirements 22 | * Python >= `3.7` 23 | * Packages included in the `requirements.txt` file 24 | * (Anaconda for easy installation) 25 | 26 | ### Install dependencies 27 | 28 | Create and activate a virtual environment: 29 | ```sh 30 | conda create -n yourenvname python=3.7 31 | conda activate yourenvname 32 | ``` 33 | 34 | Install packages: 35 | ```sh 36 | python -m pip install -r requirements.txt 37 | ``` 38 | 39 | ## Folder Structure 40 | ``` 41 | sklearn-project-template/ 42 | │ 43 | ├── main.py - main script to start training and (optionally) testing 44 | │ 45 | ├── base/ - abstract base classes 46 | │ ├── base_data_loader.py 47 | │ ├── base_model.py 48 | │ └── base_optimizer.py 49 | │ 50 | ├── configs/ - holds configuration for training and testing 51 | │ ├── config_classification.json 52 | │ └── config_regression.json 53 | │ 54 | ├── data/ - default directory for storing input data 55 | │ 56 | ├── data_loaders/ - anything about data loading goes here 57 | │ └── data_loaders.py 58 | │ 59 | ├── models/ - models 60 | │ ├── __init__.py - models defined by name 61 | │ └── models.py 62 | │ 63 | ├── optimizers/ - optimizers 64 | │ └── optimizers.py 65 | │ 66 | ├── saved/ - config, model and reports are saved here 67 | │ ├── Classification 68 | │ └── Regression 69 | │ 70 | ├── utils/ - utility functions 71 | │ ├── parse_config.py - class to handle config file and cli options 72 | │ ├── parse_params.py 73 | │ └── utils.py 74 | │ 75 | ├── wrappers/ - wrappers of modified sklearn models or self-defined transforms 76 | │ ├── data_transformations.py 77 | │ └── wrappers.py 78 | ``` 79 | 80 | ## Usage 81 | Models in this repo are trained on two well-known datasets: iris and boston. The first is used for the classification problem and the second for the regression problem. 82 | 83 | Run classification: 84 | ```sh 85 | python main.py -c configs/config_classification.json 86 | ``` 87 | Run regression: 88 | ```sh 89 | python main.py -c configs/config_regression.json 90 | ``` 91 | 92 | ### Config file format 93 | Config files are in `.json` format.
An example of such a config is shown below: 94 | ```javascript 95 | { 96 | "name": "Classification", // session name 97 | 98 | "model": { 99 | "type": "Model", // model name 100 | "args": { 101 | "pipeline": ["scaler", "PLS", "pf", "SVC"], // pipeline of methods 102 | "unions": { // unions of methods included in pipeline 103 | } 104 | } 105 | }, 106 | 107 | "tuned_parameters":[{ // hyperparameters to be tuned with search method 108 | "SVC__kernel": ["rbf"], 109 | "SVC__gamma": [1e-5, 1e-6, 1], 110 | "SVC__C": [1, 100, 1000], 111 | "PLS__n_components": [1,2,3] 112 | }], 113 | 114 | "optimizer": "OptimizerClassification", // name of optimizer 115 | 116 | "search_method":{ 117 | "type": "GridSearchCV", // method used to search through parameters 118 | "args": { 119 | "refit": false, 120 | "n_jobs": -1, 121 | "verbose": 2, 122 | "error_score": 0 123 | } 124 | }, 125 | 126 | "cross_validation": { 127 | "type": "RepeatedStratifiedKFold", // type of cross-validation used 128 | "args": { 129 | "n_splits": 5, 130 | "n_repeats": 10, 131 | "random_state": 1 132 | } 133 | }, 134 | 135 | "data_loader": { 136 | "type": "Classification", // name of dataloader class 137 | "args":{ 138 | "data_path": "data/path-to-file", // path to data 139 | "shuffle": true, // whether data is shuffled before optimization 140 | "test_split": 0.2, // use split method for model testing 141 | "stratify": true, // whether data is stratified before optimization 142 | "random_state":1 // random state for repeatable output 143 | } 144 | }, 145 | 146 | "score": "max balanced_accuracy", // mode and metric used for scoring 147 | "test_model": true, // whether the model is tested after training 148 | "debug": false, // debug model architecture 149 | "save_dir": "saved/" // directory of saved reports, models and configs 150 | } 151 | 152 | ``` 153 | 154 | Additional parameters can be added to the config file. See the `scikit-learn` documentation for a description of tuned parameters, search methods and cross-validation. Possible metrics for model evaluation can be found [here](https://scikit-learn.org/stable/modules/model_evaluation.html). 155 | 156 | ### Pipeline 157 | Methods added to the config pipeline must first be defined in the `models/__init__.py` file. For the previous config example the following must be added: 158 | 159 | ```python 160 | from wrappers import * 161 | from sklearn.svm import SVC 162 | from sklearn.preprocessing import StandardScaler 163 | from sklearn.preprocessing import PolynomialFeatures 164 | 165 | methods_dict = { 166 | 'pf': PolynomialFeatures, 167 | 'scaler': StandardScaler, 168 | 'PLS': PLSRegressionWrapper, 169 | 'SVC': SVC, 170 | } 171 | ``` 172 | The majority of algorithms implemented in the `scikit-learn` library can be directly imported and used. Some algorithms need a little modification before use; one such example is Partial least squares (PLS), whose modification is implemented in `wrappers/wrappers.py`. You can also implement your own method. An example wrapper for the Savitzky-Golay filter is shown in `wrappers/data_transformations.py`. The implementation must satisfy the standard method calls, e.g. fit(), transform(), etc. 173 | 174 | ### Unions 175 | 176 | A union concatenates the results of multiple transformer methods, which are applied in parallel to the input data. This is useful if you want to combine several feature mechanisms into a single transformer.
For example, if you want to merge the results from Principal component analysis (PCA) and Partial least squares (PLS), you can do the following: 177 | 178 | ```javascript 179 | "pipeline": ["scaler", "pca-pls", "SVC"], 180 | "unions": { 181 | "pca-pls": ["PLS", "PCA"] 182 | } 183 | ``` 184 | 185 | In the pipeline you must write a made-up name for the method (in this case `pca-pls`) and then use the same name as a key in the unions dictionary. The value of the corresponding key must be a list of methods (in this case consisting of "PCA" and "PLS"). Hyperparameters tuned with the chosen search method must be separated with a double underscore (following the `scikit-learn` nomenclature). If you want to tune the number of components of both methods, you can do the following: 186 | 187 | ```javascript 188 | "tuned_parameters":[{ 189 | "pca-pls__PLS__n_components": [1,2,3], 190 | "pca-pls__PCA__n_components": [1,2,3] 191 | }], 192 | ``` 193 | 194 | Please refer to `configs/config_unions.json` for a unions example. 195 | ### Debug 196 | 197 | To debug the model architecture, set the debug flag in the config file to `true`. It will print the model step by step, with the corresponding output produced at each step. Model debugging only works with the `GridSearchCV` search method. If several values are listed for a parameter, only the first one is used for evaluation. Debugging is useful when you want to get a sense of what happens at each separate step. 198 | 199 | ## Customization 200 | 201 | 202 | ### Custom CLI options 203 | 204 | Changing values in the config file is a clean, safe and easy way of tuning hyperparameters. However, sometimes 205 | it is better to have command line options if some values need to be changed too often or quickly. 206 | 207 | This template uses the configuration stored in the json file by default, but by registering custom options as follows 208 | you can change some of them using CLI flags. 209 | 210 | ```python 211 | # simple class-like object having 3 attributes, `flags`, `type`, `target`. 212 | CustomArgs = collections.namedtuple('CustomArgs', 'flags type target') 213 | options = [ 214 | CustomArgs(['-cv', '--cross_validation'], type=int, target='cross_validation;args;n_repeats'), 215 | # options added here can be modified by command line flags. 216 | ] 217 | ``` 218 | The `target` argument should be a sequence of keys, which are used to access that option in the config dict. In this example, the `target` for the 219 | number of repeats in the cross-validation option is `('cross_validation', 'args', 'n_repeats')` because `config['cross_validation']['args']['n_repeats']` points to the number of repeats. 220 | 221 | 222 | ### Data Loader 223 | * **Writing your own data loader** 224 | 225 | 1. **Inherit ```BaseDataLoader```** 226 | 227 | `BaseDataLoader` handles: 228 | * Train/test procedure 229 | * Data shuffling 230 | 231 | * **Usage** 232 | 233 | Loaded data must be assigned to the data_handler (dh) in an appropriate manner. If dh.X_data_test and dh.y_data_test are not assigned in advance, a train/test split can be created by the base data loader. If `"test_split": 0.0` is set in the config file, the whole dataset is used for training. Another option is to assign both train and test sets as shown below; in this case the train data is used for optimization and the test data is used for evaluation of the model.
234 | 235 | ```python 236 | data_handler.X_data = X_train 237 | data_handler.y_data = y_train 238 | data_handler.X_data_test = X_test 239 | data_handler.y_data_test = y_test 240 | ``` 241 | * **Example** 242 | 243 | Please refer to `data_loaders/data_loaders.py` for a data loading example. 244 | 245 | ### Optimizer 246 | * **Writing your own optimizer** 247 | 248 | 1. **Inherit ```BaseOptimizer```** 249 | 250 | `BaseOptimizer` handles: 251 | * Optimization procedure 252 | * Model saving and loading 253 | * Report saving 254 | 255 | 256 | 2. **Implementing abstract methods** 257 | 258 | You need to implement `fitted_model()`, which must return a fitted model. 259 | Optionally, you can customize the format of the train/test reports with `create_train_report()` and `create_test_report()`. 260 | 261 | * **Example** 262 | 263 | Please refer to `optimizers/optimizers.py` for an optimizer example. 264 | 265 | ### Model 266 | * **Writing your own model** 267 | 268 | 1. **Inherit `BaseModel`** 269 | 270 | `BaseModel` handles: 271 | * Initialization of the pipeline defined in the config 272 | * Modification of steps 273 | 274 | 2. **Implementing abstract methods** 275 | 276 | You need to implement `created_model()`, which must return the created model. 277 | 278 | * **Usage** 279 | 280 | Initialization of the pipeline methods is performed with `create_steps()`. Steps can later be modified with `change_step()`. An example of how to change a step is shown below, where a Sequential feature selector is added to the pipeline. 281 | 282 | ```python 283 | def __init__(self, pipeline): 284 | steps = self.create_steps(pipeline) 285 | 286 | rf = RandomForestRegressor(random_state=1) 287 | clf = TransformedTargetRegressor(regressor=rf, 288 | func=np.log1p, 289 | inverse_func=np.expm1) 290 | sfs = SequentialFeatureSelector(clf, n_features_to_select=2, cv=3) 291 | 292 | steps = self.change_step('sfs', sfs, steps) 293 | 294 | self.model = Pipeline(steps=steps) 295 | 296 | ``` 297 | 298 | Beware that in this case 'sfs' needs to be added to the pipeline in the config file. Otherwise, no step in the pipeline is changed. 299 | 300 | * **Example** 301 | 302 | Please refer to `models/models.py` for a model example. 303 | 304 | ## Common Questions About Hyperparameter Optimization 305 | 306 | ### How to Choose Between Random and Grid Search? 307 | * Choose the method based on your needs. I recommend starting with grid search and trying a random search afterwards if you have the time. 308 | * Grid search is appropriate for small and quick searches of hyperparameter values that are known to perform well generally. 309 | * Random search is appropriate for discovering new hyperparameter values or new combinations of hyperparameters, often resulting in better performance, although it may take more time to complete. 310 | 311 | ### How to Speed-Up Hyperparameter Optimization? 312 | * Ensure that you set the “n_jobs” argument to the number of cores on your machine. 313 | * Evaluate on a smaller sample of your dataset. 314 | * Explore a smaller search space. 315 | * Use fewer repeats and/or folds for cross-validation. 316 | * Execute the search on a faster machine, such as AWS EC2. 317 | * Use an alternate model that is faster to evaluate. 318 | 319 | More on: [machinelearningmastery](https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/). 320 | 321 | ## Roadmap 322 | 323 | See [open issues](https://github.com/janezlapajne/sklearn-project-template/issues) to request a feature or report a bug.
324 | 325 | ## Contribution 326 | 327 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. 328 | 329 | How to start with contribution: 330 | 1. Fork the Project 331 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 332 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 333 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 334 | 5. Open a Pull Request 335 | 336 | Feel free to contribute any kind of function or enhancement. 337 | 338 | ## License 339 | This project is licensed under the MIT License. See LICENSE for more details. 340 | 341 | ## Acknowledgements 342 | This project is inspired by the project [pytorch-template](https://github.com/victoresque/pytorch-template) by [Victor Huang](https://github.com/victoresque). I would like to confess that some functions, architecture and some parts of readme were directly copied from this repo. But to be honest, what should I do - the project is absolutely amazing! 343 | 344 | Additionally, special thanks to the creator of Machine learning mastery, [Jason Brownlee, PhD](https://machinelearningmastery.com/about/) for insightful articles published! 345 | 346 | ## Consider supporting 347 | 348 | Do you feel generous today? I am still a student and would make a good use of some extra money :P 349 | 350 | 351 | 352 | [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/janezlapajne) 353 | 354 | 397 | 398 | --------------------------------------------------------------------------------