├── test
│   ├── __init__.py
│   ├── config.json
│   └── test_converter.py
├── adapter
│   ├── __init__.py
│   ├── run_baseline.py
│   ├── run_tpot.py
│   ├── hyperopt_adapter.py
│   ├── run_hpsklearn.py
│   ├── grid_search.py
│   ├── bohb.py
│   ├── random_search.py
│   ├── run_atm.py
│   ├── robo.py
│   ├── smac.py
│   ├── optunity_adapter.py
│   ├── btb_adapter.py
│   ├── run_h2o.py
│   ├── base.py
│   └── run_auto_sklearn.py
├── evaluation
│   ├── __init__.py
│   ├── crop.sh
│   ├── openml_crawler.py
│   └── base.py
├── config
│   ├── __init__.py
│   ├── vectorizer.py
│   ├── util.py
│   └── base.py
├── assets
│   ├── ds.pkl
│   ├── pipelines.pkl
│   ├── cash_configs.pkl
│   ├── config_clusters-0.25.pkl
│   ├── atm_sql.yaml
│   ├── atm_run.yaml
│   ├── preprocessing.json
│   └── classifier.json
├── util
│   ├── __init__.py
│   ├── multiprocessor.py
│   ├── logger.py
│   └── mean_shift.py
├── benchmark
│   ├── __init__.py
│   ├── base.py
│   ├── ml.py
│   ├── synthetic.py
│   └── open_ml.py
├── parallel_hpsklearn.sh
├── requirements.txt
├── run_tpot.py
├── run_hpsklearn.py
├── .gitignore
├── run_atm.py
├── comparison_human.py
├── Readme.md
├── run_framework.py
├── run_auto_sklearn.py
├── run.py
└── run_cash.py

--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/adapter/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import *
2 | from .converter import *
--------------------------------------------------------------------------------
/assets/ds.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ennosigaeon/automl_benchmark/HEAD/assets/ds.pkl
--------------------------------------------------------------------------------
/util/__init__.py:
--------------------------------------------------------------------------------
1 | def flatten(l):
2 |     return [item for sublist in l for item in sublist]
--------------------------------------------------------------------------------
/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 | from .ml import *
2 | from .open_ml import *
3 | from .synthetic import *
--------------------------------------------------------------------------------
/evaluation/crop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | for FILE in ./plots/*.pdf; do
3 |     pdfcrop ${FILE} ${FILE}
4 | done
--------------------------------------------------------------------------------
/assets/pipelines.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ennosigaeon/automl_benchmark/HEAD/assets/pipelines.pkl
--------------------------------------------------------------------------------
/assets/cash_configs.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ennosigaeon/automl_benchmark/HEAD/assets/cash_configs.pkl
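The pickle files under assets/ are stored as raw binaries. A minimal loading sketch (an illustration, not a file from the repository; it assumes nothing about the payload beyond it being an ordinary pickle, and the variable names are placeholders):

import pickle

# hypothetical loader for the pickled assets listed above
with open('assets/ds.pkl', 'rb') as f:
    ds = pickle.load(f)
with open('assets/pipelines.pkl', 'rb') as f:
    pipelines = pickle.load(f)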
--------------------------------------------------------------------------------
/assets/config_clusters-0.25.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ennosigaeon/automl_benchmark/HEAD/assets/config_clusters-0.25.pkl
--------------------------------------------------------------------------------
/parallel_hpsklearn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | for i in 0 1 2 3 4 5 6 7
3 | do
4 |     echo "Starting chunk $i"
5 |     python3 run_hpsklearn.py $i > hpsklearn-$i.log 2>&1 &
6 | done
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | h2o==3.26.0.8
2 | atm==0.2.2
3 | auto-sklearn==0.5.2
4 | ConfigSpace==0.4.9
5 | hpbandster==0.7.4
6 | humanfriendly
7 | hyperopt==0.1.2
8 | matplotlib==3.0.3
9 | networkx
10 | # 2019-era pin; newer numpy releases are incompatible with scipy==1.2.1 and auto-sklearn==0.5.2
11 | numpy==1.16.4
12 | openml==0.9.0
13 | pandas
14 | psycopg2==2.7.7
15 | pybind11==2.2.4
16 | pymongo
17 | pyrfr==0.8.0
18 | scikit-learn
19 | scipy==1.2.1
20 | smac==0.10.0
21 | tpot==0.10.2
--------------------------------------------------------------------------------
/assets/atm_sql.yaml:
--------------------------------------------------------------------------------
1 | # SQL dialect
2 | dialect: sqlite
3 | # Name of the database
4 | database: atm.db
5 | # Username to gain access to the database
6 | username:
7 | # Password to gain access to the database
8 | password:
9 | # Host name of the device hosting the database
10 | host:
11 | # Port on host listening for database connections
12 | port:
13 | # Optional field for specifying login details
14 | query:
--------------------------------------------------------------------------------
/util/multiprocessor.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import multiprocessing.pool  # not imported implicitly by 'import multiprocessing'
3 | 
4 | 
5 | class NoDaemonProcess(multiprocessing.Process):
6 | 
7 |     def __init__(self, group=None, target=None, name=None, args=(), kwargs={}):
8 |         super().__init__(None, target, name, args, kwargs)
9 | 
10 |     # make 'daemon' attribute always return False, so these workers may spawn children
11 |     def _get_daemon(self):
12 |         return False
13 | 
14 |     def _set_daemon(self, value):
15 |         pass
16 | 
17 |     daemon = property(_get_daemon, _set_daemon)
18 | 
19 | 
20 | class NoDaemonPool(multiprocessing.pool.Pool):
21 |     Process = NoDaemonProcess
--------------------------------------------------------------------------------
/util/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | 
4 | def setup(id: int = None):
5 |     logging.basicConfig(level=40)  # 10: debug; 20: info; 40: error
6 | 
7 |     logger = logging.getLogger()
8 |     logger.handlers = []
9 | 
10 |     formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
11 | 
12 |     stream_handler = logging.StreamHandler()
13 |     stream_handler.setFormatter(formatter)
14 |     logger.addHandler(stream_handler)
15 | 
16 |     file = 'application.log'
17 |     if id is not None:
18 |         file = 'application-{}.log'.format(id)
19 | 
20 |     file_handler = logging.FileHandler(file, mode='w')
21 |     file_handler.setFormatter(formatter)
22 |     logger.addHandler(file_handler)
23 | 
24 | 
25 | def get():
26 |     logger = logging.getLogger('benchmark')
27 |     logger.setLevel(logging.DEBUG)
28 |     return logger
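For orientation, this is how the two helpers in util/logger.py are meant to be combined in the entry scripts (a minimal sketch, not repository code; the id argument only selects the log file name):

import util.logger

util.logger.setup(id=3)        # root logger at ERROR, writing to application-3.log
logger = util.logger.get()     # 'benchmark' logger at DEBUG, propagates to the root handlers
logger.info('starting benchmark run')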
--------------------------------------------------------------------------------
/config/vectorizer.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | 
3 | import numpy as np
4 | 
5 | from config import MetaConfigCollection, CATEGORICAL
6 | 
7 | 
8 | class ConfigVectorizer:
9 | 
10 |     def __init__(self, classifier_file='assets/classifier.json'):
11 |         self.config_space = MetaConfigCollection.from_json(classifier_file)
12 | 
13 |     def vectorize(self, config: Dict):
14 |         algorithm = config['algorithm']
15 |         definition = self.config_space.algos[algorithm]
16 | 
17 |         x = []
18 |         for key in sorted(definition.dict.keys()):
19 |             if key not in config:
20 |                 value = np.nan
21 |             else:
22 |                 d = definition.dict[key]
23 |                 value = config[key]
24 |                 if d.type == CATEGORICAL:
25 |                     value = -d.choices.index(value) / len(d.choices)
26 |                 else:
27 |                     value = (value - d.lower) / (d.upper - d.lower)
28 |             x.append(value)
29 |         return x
--------------------------------------------------------------------------------
/adapter/run_baseline.py:
--------------------------------------------------------------------------------
1 | import sklearn.datasets
2 | import sklearn.metrics
3 | import sklearn.model_selection
4 | from sklearn.dummy import DummyClassifier
5 | from sklearn.ensemble import RandomForestClassifier
6 | from sklearn.impute import SimpleImputer
7 | 
8 | 
9 | def skip(id: int) -> bool:
10 |     failed = []
11 |     return id in failed
12 | 
13 | 
14 | def setup():
15 |     pass
16 | 
17 | 
18 | def main(fold, dummy: bool, score: bool = True) -> float:
19 |     setup()
20 |     X_train, y_train, X_test, y_test = fold
21 |     X_train = SimpleImputer().fit_transform(X_train)
22 |     X_test = SimpleImputer().fit_transform(X_test)
23 | 
24 |     estimator = DummyClassifier() if dummy else RandomForestClassifier()
25 |     estimator.fit(X_train, y_train)
26 | 
27 |     if score:
28 |         predictions = estimator.predict(X_test)
29 |         return 1 - sklearn.metrics.accuracy_score(y_test, predictions)
30 |     else:
31 |         # predictions = estimator.predict_proba(X_test)
32 |         # return sklearn.metrics.log_loss(y_test, predictions), estimator
33 |         predictions = estimator.predict_proba(X_test)
34 |         return sklearn.metrics.roc_auc_score(y_test, predictions[:, 1]), estimator
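A usage sketch for the baseline adapter above (illustrative only; fold stands for the (X_train, y_train, X_test, y_test) tuple that the benchmark classes yield, and the arrays are placeholders):

from adapter import run_baseline

# hypothetical fold taken from an OpenMLBenchmark instance
fold = (X_train, y_train, X_test, y_test)
error = run_baseline.main(fold, dummy=False)    # misclassification rate of a RandomForest
baseline = run_baseline.main(fold, dummy=True)  # same for the DummyClassifier baseline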
12 | """ 13 | 14 | def wrapper(self, configuration, **kwargs): 15 | if isinstance(configuration, dict): 16 | blastoise = np.array( 17 | [configuration[k] for k in configuration], 18 | dtype=data_type 19 | ) 20 | else: 21 | blastoise = configuration 22 | return (foo(self, blastoise, **kwargs)) 23 | 24 | return (wrapper) 25 | 26 | 27 | def meta_information(foo): 28 | def wrapper(self, confguration, **kwargs): 29 | start = time.time() 30 | res = foo(self, confguration, **kwargs) 31 | 32 | res['start'] = start 33 | res['end'] = time.time() 34 | # res['config'] = confguration 35 | return res 36 | 37 | return wrapper 38 | -------------------------------------------------------------------------------- /run_tpot.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from tpot import TPOTClassifier 4 | 5 | from benchmark import OpenMLBenchmark 6 | 7 | timeout = 3600 # in seconds 8 | run_timeout = 360 9 | jobs = 4 10 | 11 | 12 | def main(bm: OpenMLBenchmark): 13 | X_train = bm.X_train 14 | y_train = bm.y_train 15 | X_test = bm.X_test 16 | y_test = bm.y_test 17 | 18 | pipeline_optimizer = TPOTClassifier( 19 | max_time_mins=timeout / 60, 20 | max_eval_time_mins=run_timeout / 60, 21 | scoring='accuracy', 22 | n_jobs=jobs, 23 | verbosity=2 24 | ) 25 | pipeline_optimizer.fit(X_train, y_train) 26 | print(pipeline_optimizer.fitted_pipeline_) 27 | print('Misclassification Rate', 1 - pipeline_optimizer.score(X_test, y_test)) 28 | 29 | 30 | if __name__ == '__main__': 31 | for i in range(4): 32 | print('#######\nIteration {}\n#######'.format(i)) 33 | print('Timeout: ', timeout) 34 | print('Run Timeout: ', run_timeout) 35 | 36 | task_ids = [15, 23, 24, 29, 3021, 41, 2079, 3543, 3560, 3561, 37 | 3904, 3946, 9955, 9985, 7592, 14969, 14968, 14967, 125920, 146606] 38 | for task in task_ids: 39 | print('Starting task {} at {}'.format(task, datetime.datetime.now().time())) 40 | bm = OpenMLBenchmark(task) 41 | 42 | main(bm) 43 | -------------------------------------------------------------------------------- /test/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "sklearn.svm.SVC": { 3 | "kernel": { 4 | "type": "categorical", 5 | "choices": [ 6 | "linear", 7 | "rbf", 8 | "poly", 9 | "sigmoid" 10 | ], 11 | "default_value": "poly" 12 | }, 13 | "degree": { 14 | "type": "uniform_int", 15 | "lower": 1, 16 | "upper": 5, 17 | "condition": { 18 | "parent": "kernel", 19 | "value": [ 20 | "poly" 21 | ] 22 | } 23 | }, 24 | "gamma": { 25 | "type": "uniform_float", 26 | "lower": 0.0001, 27 | "upper": 8, 28 | "default_value": 1.0, 29 | "condition": { 30 | "parent": "kernel", 31 | "value": [ 32 | "rbf", 33 | "poly", 34 | "sigmoid" 35 | ] 36 | } 37 | }, 38 | "C": { 39 | "type": "uniform_float", 40 | "lower": 0.001, 41 | "upper": 1000.0, 42 | "default_value": 1.0 43 | }, 44 | "coef0": { 45 | "type": "uniform_float", 46 | "lower": 0.0, 47 | "upper": 10.0, 48 | "default_value": 0.0, 49 | "condition": { 50 | "parent": "kernel", 51 | "value": [ 52 | "poly", 53 | "sigmoid" 54 | ] 55 | } 56 | }, 57 | "shrinking": { 58 | "type": "categorical", 59 | "choices": [ 60 | true, 61 | false 62 | ] 63 | } 64 | } 65 | } -------------------------------------------------------------------------------- /assets/atm_run.yaml: -------------------------------------------------------------------------------- 1 | # this will be overridden by the test script 2 | train_path: 3 | test_path: 4 | data_description: 5 | class_column: class 6 | 7 | # use 
--------------------------------------------------------------------------------
/assets/atm_run.yaml:
--------------------------------------------------------------------------------
1 | # this will be overridden by the test script
2 | train_path:
3 | test_path:
4 | data_description:
5 | class_column: class
6 | 
7 | # use every method we have
8 | methods:
9 |     - logreg
10 |     - svm
11 |     - sgd
12 |     - dt
13 |     - et
14 |     - rf
15 |     - gnb
16 |     - mnb
17 |     - bnb
18 |     - gp
19 |     - pa
20 |     - knn
21 |     - mlp
22 | # priority (higher number is more important)
23 | priority: 1
24 | # Should there be a classifier or walltime budget?
25 | budget_type: walltime
26 | # If budget_type is classifier, how many classifiers to try?
27 | budget: 60
28 | # How should ATM sample hyperparameters from a given frozen set?
29 | tuner: gp
30 | # r_minimum is the number of random runs performed in each hyperpartition before
31 | # allowing bayesian opt to select parameters.
32 | r_minimum: 2
33 | # gridding determines whether or not sample selection will happen on a grid.
34 | gridding: 0
35 | # How should ATM select a particular hyperpartition (frozen set) from the
36 | # set of all hyperpartitions?
37 | selector: bestk
38 | # k is the number that the xxx_k selector methods use. It is similar to r_minimum,
39 | # except it is called k_window and determines how much "history" ATM considers
40 | # for certain frozen selection logics.
41 | k_window: 5
42 | # Which field to use for judgment of performance
43 | # options: f1, roc_auc, accuracy
44 | metric: accuracy
45 | # Which data to use for computing judgment score
46 | # cv = Cross_Validated performance on training data
47 | # test = Performance on test data
48 | score_target: test
--------------------------------------------------------------------------------
/run_hpsklearn.py:
--------------------------------------------------------------------------------
1 | import time
2 | import traceback
3 | 
4 | import hpsklearn
5 | import humanfriendly
6 | import hyperopt
7 | import sklearn
8 | 
9 | from benchmark import OpenMLBenchmark
10 | 
11 | max_evals = 325
12 | run_timeout = 60  # in seconds; hpsklearn's trial_timeout expects seconds
13 | 
14 | 
15 | def main(bm: OpenMLBenchmark):
16 |     start = time.time()
17 |     X_train = bm.X_train
18 |     y_train = bm.y_train
19 |     X_test = bm.X_test
20 |     y_test = bm.y_test
21 |     estimator = hpsklearn.HyperoptEstimator(
22 |         preprocessing=hpsklearn.components.any_preprocessing('pp'),
23 |         classifier=hpsklearn.components.any_classifier('clf'),
24 |         algo=hyperopt.tpe.suggest,
25 |         trial_timeout=run_timeout,
26 |         max_evals=max_evals,
27 |         seed=int(start)
28 |     )
29 |     estimator.fit(X_train, y_train)
30 |     predictions = estimator.predict(X_test)
31 | 
32 |     print('Misclassification rate', 1 - sklearn.metrics.accuracy_score(y_test, predictions))
33 |     print('Duration', humanfriendly.format_timespan(time.time() - start))
34 | 
35 | 
36 | if __name__ == '__main__':
37 |     for i in range(1):
38 |         print('#######\nIteration {}\n#######'.format(i))
39 |         print('Max Evals: ', max_evals)
40 |         print('Run Timeout: ', run_timeout)
41 | 
42 |         task_ids = [15, 23, 24, 29, 3021, 41, 2079, 3543, 3560, 3561,
43 |                     3904, 3946, 9955, 9985, 7592, 14969, 14968, 14967, 125920, 146606]
44 |         for task in task_ids:
45 |             print('Starting task {}'.format(task))
46 |             bm = OpenMLBenchmark(task)
47 | 
48 |             try:
49 |                 main(bm)
50 |             except Exception as e:
51 |                 if isinstance(e, KeyboardInterrupt):
52 |                     raise e
53 |                 traceback.print_exc()
54 |                 print('Misclassification rate', 1)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | tmp*.py
2 | evaluation/plots
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | 
develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Intellij project settings 106 | .idea/ 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ -------------------------------------------------------------------------------- /run_atm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | import shutil 4 | import subprocess 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from benchmark import OpenMLBenchmark 10 | 11 | timeout = 60 # in minutes 12 | # run_timeout = 30 13 | jobs = 4 14 | 15 | 16 | def main(bm: OpenMLBenchmark): 17 | X_train = bm.X_train 18 | y_train = bm.y_train 19 | X_test = bm.X_test 20 | y_test = bm.y_test 21 | 22 | headers = bm.column_names + ['class'] 23 | train = np.c_[X_train, y_train] 24 | test = np.c_[X_test, y_test] 25 | 26 | os.mkdir('/tmp/atm/{}'.format(bm.task_id)) 27 | train_path = '/tmp/atm/{}/train.csv'.format(bm.task_id) 28 | test_path = '/tmp/atm/{}/test.csv'.format(bm.task_id) 29 | pd.DataFrame(train, columns=headers).to_csv(train_path, index=None) 30 | pd.DataFrame(test, columns=headers).to_csv(test_path, index=None) 31 | 32 | sql_path = '{}/assets/atm_sql.yaml'.format(os.getcwd()) 33 | cmd = 'atm enter_data --sql-config {sql} --train-path {train_path} --test-path {test_path}' \ 34 | ' --budget-type walltime --budget {budget} --metric accuracy --name {name}' \ 35 | .format(sql=sql_path, train_path=train_path, test_path=test_path, budget=timeout, name=bm.task_id) 36 | subprocess.call(cmd, shell=True) 37 | 38 | cmd = 'atm worker --no-save --sql-config {}'.format(sql_path) 39 | 40 | procs = [subprocess.Popen(cmd, shell=True) for i in range(jobs)] 41 | for p in procs: 42 | p.wait() 43 | 44 | subprocess.call(cmd, shell=True) 45 | 46 | 47 | if __name__ == '__main__': 48 | for i in range(1): 49 | print('#######\nIteration {}\n#######'.format(i)) 50 | 51 | try: 52 | shutil.rmtree('/tmp/atm/') 53 | except OSError as e: 54 | pass 55 | 
os.mkdir('/tmp/atm/') 56 | 57 | print('Timeout: ', timeout) 58 | 59 | task_ids = [15, 23, 24, 29, 3021, 41, 2079, 3543, 3560, 3561, 60 | 3904, 3946, 9955, 9985, 7592, 14969, 14968, 14967, 125920, 146606] 61 | for task in task_ids: 62 | print('Starting task {} at {}'.format(task, datetime.datetime.now().time())) 63 | bm = OpenMLBenchmark(task) 64 | 65 | main(bm) 66 | 67 | ########### 68 | # Get results via 69 | # 70 | # SELECT ds.name, 1 - max(cs.test_judgment_metric) as 'Misclassification rate' FROM classifiers cs 71 | # JOIN dataruns dr ON cs.datarun_id = dr.id 72 | # JOIN datasets ds ON dr.dataset_id = ds.id 73 | # GROUP BY cs.datarun_id 74 | -------------------------------------------------------------------------------- /evaluation/openml_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import openml 3 | 4 | from evaluation import scripts 5 | 6 | task_ids = scripts.all_tasks 7 | 8 | mapping = {} 9 | 10 | for task in task_ids: 11 | resp = requests.post('https://www.openml.org/es/run/run/_search?size=0', json={ 12 | "query": { 13 | "bool": { 14 | "must": [{ 15 | "term": { 16 | "run_task.task_id": task 17 | } 18 | }, { 19 | "nested": { 20 | "path": "evaluations", 21 | "query": { 22 | "exists": { 23 | "field": "evaluations" 24 | } 25 | } 26 | } 27 | } 28 | ] 29 | } 30 | }, 31 | "aggs": { 32 | "flows": { 33 | "terms": { 34 | "field": "run_flow.flow_id", 35 | "size": 100 36 | }, 37 | "aggs": { 38 | "top_score": { 39 | "top_hits": { 40 | "_source": ["run_id", "run_flow.name", "run_flow.parameters", "run_flow.flow_id", 41 | "uploader", "evaluations.evaluation_measure", "evaluations.value"], 42 | "sort": [{ 43 | "evaluations.value": { 44 | "order": "desc", 45 | "nested_path": "evaluations", 46 | "nested_filter": { 47 | "term": { 48 | "evaluations.evaluation_measure": "predictive_accuracy" 49 | } 50 | } 51 | } 52 | } 53 | ], 54 | "size": 100 55 | } 56 | } 57 | } 58 | } 59 | } 60 | }).json() 61 | try: 62 | score = -1 63 | for flow in resp['aggregations']['flows']['buckets']: 64 | for run in flow['top_score']['hits']['hits']: 65 | score = max(score, float(run['sort'][0])) 66 | except Exception: 67 | score = -1 68 | 69 | dataset = openml.tasks.get_task(task).get_dataset().dataset_id 70 | mapping[dataset] = score 71 | print(mapping) 72 | -------------------------------------------------------------------------------- /assets/preprocessing.json: -------------------------------------------------------------------------------- 1 | { 2 | "identity": {}, 3 | "sklearn.cluster.FeatureAgglomeration": { 4 | "n_clusters": { 5 | "type": "uniform_int", 6 | "lower": 2, 7 | "upper": 400, 8 | "default_value": 25 9 | }, 10 | "affinity": { 11 | "type": "categorical", 12 | "choices": [ 13 | "euclidean", 14 | "manhattan", 15 | "cosine" 16 | ], 17 | "default_value": "euclidean" 18 | }, 19 | "linkage": { 20 | "type": "categorical", 21 | "choices": [ 22 | "ward", 23 | "complete", 24 | "average" 25 | ], 26 | "default_value": "ward" 27 | }, 28 | "pooling_func": { 29 | "type": "categorical", 30 | "choices": [ 31 | "mean", 32 | "median", 33 | "max" 34 | ], 35 | "default_value": "mean" 36 | } 37 | }, 38 | "sklearn.decomposition.FastICA": { 39 | "n_components": { 40 | "type": "uniform_int", 41 | "lower": 1, 42 | "upper": 400, 43 | "default_value": 25 44 | }, 45 | "whiten": { 46 | "type": "categorical", 47 | "choices": [ 48 | true, 49 | false 50 | ], 51 | "default_value": false 52 | }, 53 | "algorithm": { 54 | "type": "categorical", 55 | "choices": [ 56 | 
"parallel", 57 | "deflation" 58 | ], 59 | "default_value": "parallel" 60 | }, 61 | "fun": { 62 | "type": "categorical", 63 | "choices": [ 64 | "logcosh", 65 | "exp", 66 | "cube" 67 | ], 68 | "default_value": "logcosh" 69 | } 70 | }, 71 | "sklearn.decomposition.PCA": { 72 | "n_components": { 73 | "type": "uniform_float", 74 | "lower": 0.5, 75 | "upper": 0.999, 76 | "default_value": 0.75 77 | }, 78 | "whiten": { 79 | "type": "categorical", 80 | "choices": [ 81 | true, 82 | false 83 | ], 84 | "default_value": false 85 | } 86 | }, 87 | "sklearn.preprocessing.PolynomialFeatures": { 88 | "degree": { 89 | "type": "uniform_int", 90 | "lower": 2, 91 | "upper": 3, 92 | "default_value": 2 93 | } 94 | }, 95 | "sklearn.preprocessing.MinMaxScaler": {}, 96 | "sklearn.preprocessing.Normalizer": {}, 97 | "sklearn.preprocessing.RobustScaler": { 98 | "q_min": { 99 | "type": "uniform_float", 100 | "lower": 0.001, 101 | "upper": 0.3, 102 | "default_value": 0.25 103 | }, 104 | "q_max": { 105 | "type": "uniform_float", 106 | "lower": 0.7, 107 | "upper": 0.999, 108 | "default_value": 0.75 109 | } 110 | }, 111 | "sklearn.preprocessing.StandardScaler": {} 112 | } -------------------------------------------------------------------------------- /evaluation/base.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from bson.errors import InvalidDocument 4 | from hpolib.abstract_benchmark import AbstractBenchmark 5 | from pymongo import MongoClient 6 | 7 | from adapter.base import OptimizationStatistic, BenchmarkResult 8 | import util.logger 9 | 10 | logger = util.logger.get() 11 | 12 | class Persistence: 13 | 14 | def clear_old_results(self, benchmark: AbstractBenchmark) -> None: 15 | pass 16 | 17 | def store_new_run(self, res: BenchmarkResult): 18 | pass 19 | 20 | def store_results(self, res: BenchmarkResult, stats: OptimizationStatistic) -> None: 21 | pass 22 | 23 | def load_single(self, benchmark: AbstractBenchmark) -> BenchmarkResult: 24 | pass 25 | 26 | def load_all(self, benchmark: AbstractBenchmark) -> List[BenchmarkResult]: 27 | pass 28 | 29 | class MongoPersistence(Persistence): 30 | 31 | def __init__(self, url: str, port: int = 27017, db='benchmarks', read_only: bool = False): 32 | self.client = MongoClient(url, port) 33 | self.db = eval('self.client.' 
+ db) 34 | self.read_only = read_only 35 | 36 | def clear_old_results(self, benchmark: AbstractBenchmark) -> None: 37 | if self.read_only: 38 | return 39 | collection = self.db[benchmark.get_meta_information()['name']] 40 | collection.drop() 41 | 42 | def store_new_run(self, res: BenchmarkResult): 43 | if self.read_only: 44 | return 45 | collection = self.db[res.benchmark.get_meta_information()['name']] 46 | d = res.as_dict() 47 | collection.insert_one(d) 48 | 49 | def store_results(self, res: BenchmarkResult, stats: OptimizationStatistic) -> None: 50 | if self.read_only: 51 | return 52 | collection = self.db[res.benchmark.get_meta_information()['name']] 53 | 54 | # collection.delete_one({'algorithm': stats.algorithm}) 55 | 56 | d = stats.as_dict(include_evaluations=True) 57 | try: 58 | collection.update_one({'seed': res.seed}, {'$push': {'solvers': d}}) 59 | except InvalidDocument as ex: 60 | logger.fatal('Invalid document {}'.format(d)) 61 | raise ex 62 | 63 | def load_single(self, benchmark: AbstractBenchmark) -> BenchmarkResult: 64 | collection = self.db[benchmark.get_meta_information()['name']] 65 | d = collection.find_one() 66 | res = BenchmarkResult.from_dict(d) 67 | res.benchmark = benchmark 68 | return res 69 | 70 | def load_all(self, benchmark: AbstractBenchmark) -> List[BenchmarkResult]: 71 | collection = self.db[benchmark.get_meta_information()['name']] 72 | ls = [BenchmarkResult.from_dict(doc) for doc in collection.find()] 73 | for res in ls: 74 | res.benchmark = benchmark 75 | return ls 76 | -------------------------------------------------------------------------------- /adapter/run_tpot.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import sklearn 4 | from sklearn.pipeline import Pipeline 5 | from tpot import TPOTClassifier 6 | 7 | 8 | def skip(id: int) -> bool: 9 | failed = [] 10 | return id in failed 11 | 12 | 13 | def setup(): 14 | pass 15 | 16 | 17 | def main(fold, timeout: int, run_timeout: int, jobs: int, score: bool = True): 18 | setup() 19 | X_train, y_train, X_test, y_test = fold 20 | 21 | pipeline_optimizer = TPOTClassifier( 22 | max_time_mins=timeout / 60, 23 | max_eval_time_mins=run_timeout / 60, 24 | scoring='roc_auc', 25 | n_jobs=jobs, 26 | verbosity=1 27 | ) 28 | pipeline_optimizer.fit(X_train, y_train) 29 | print(pipeline_optimizer.fitted_pipeline_) 30 | if score: 31 | predictions = pipeline_optimizer.predict(X_test) 32 | return 1 - sklearn.metrics.accuracy_score(y_test, predictions) 33 | else: 34 | predictions = pipeline_optimizer.predict_proba(X_test) 35 | return sklearn.metrics.roc_auc_score(y_test, predictions[:, 1]), pipeline_optimizer 36 | 37 | 38 | # noinspection PyUnresolvedReferences 39 | def load_model(input: str): 40 | from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier 41 | from sklearn.linear_model import LogisticRegression 42 | from sklearn.naive_bayes import GaussianNB, MultinomialNB 43 | from sklearn.pipeline import FeatureUnion 44 | from sklearn.preprocessing import PolynomialFeatures, MaxAbsScaler, StandardScaler, MinMaxScaler 45 | from sklearn.svm import LinearSVC 46 | from sklearn.tree import DecisionTreeClassifier 47 | from sklearn.decomposition import PCA, FastICA 48 | from sklearn.ensemble import RandomForestClassifier 49 | from sklearn.feature_selection import VarianceThreshold, SelectPercentile, SelectFwe, f_classif 50 | from sklearn.kernel_approximation import RBFSampler 51 | from sklearn.naive_bayes import BernoulliNB 52 | from 
sklearn.neighbors import KNeighborsClassifier 53 | from sklearn.preprocessing import Normalizer, RobustScaler, FunctionTransformer, Binarizer 54 | from tpot.builtins import OneHotEncoder, StackingEstimator 55 | from tpot.builtins import ZeroCount 56 | from xgboost import XGBClassifier 57 | from copy import copy 58 | 59 | pipeline: Pipeline = eval(input) 60 | return pipeline 61 | 62 | 63 | # noinspection PyUnresolvedReferences 64 | def load_pipeline(input: str) -> List[List[str]]: 65 | from sklearn.pipeline import FeatureUnion 66 | from tpot.builtins import OneHotEncoder, StackingEstimator 67 | pipeline: Pipeline = load_model(input) 68 | 69 | res = [] 70 | 71 | def _map_algo(algo): 72 | if isinstance(algo, StackingEstimator): 73 | _map_algo(algo.estimator) 74 | elif isinstance(algo, FeatureUnion): 75 | assert len(algo.transformer_list) == 2 76 | for t in sorted(algo.transformer_list): 77 | _map_algo(t[1]) 78 | else: 79 | res.append(type(algo).__name__) 80 | 81 | for s in pipeline.steps: 82 | _map_algo(s[1]) 83 | 84 | return [res] 85 | -------------------------------------------------------------------------------- /adapter/hyperopt_adapter.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | from hpolib.abstract_benchmark import AbstractBenchmark 5 | from hyperopt import Trials, fmin, STATUS_FAIL, STATUS_OK, tpe 6 | 7 | import util.logger 8 | from adapter.base import BaseAdapter, OptimizationStatistic, EvaluationResult, OBJECTIVE_TIME_FACTOR 9 | from config import HyperoptConverter 10 | 11 | logger = util.logger.get() 12 | 13 | 14 | class HyperoptAdapter(BaseAdapter): 15 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, objective_time: float = None, 16 | seed: int = None): 17 | super().__init__(n_jobs, time_limit, iterations, seed) 18 | self.timeout = None 19 | self.benchmark = None 20 | 21 | if iterations is None: 22 | if objective_time is None: 23 | raise ValueError('Unable to estimate number of iterations without objective time') 24 | self.iterations = self.estimate_iterations(objective_time) 25 | logger.debug('Using maximal {} iterations'.format(self.iterations)) 26 | 27 | def estimate_iterations(self, objective_time: float) -> int: 28 | t = 1 / (objective_time * OBJECTIVE_TIME_FACTOR + 0.04) 29 | return int(self.time_limit * t) 30 | 31 | # noinspection PyMethodOverriding 32 | def optimize(self, benchmark: AbstractBenchmark) -> OptimizationStatistic: 33 | start = time.time() 34 | self.timeout = start + self.time_limit if self.time_limit else None 35 | self.benchmark = benchmark 36 | 37 | statistics = OptimizationStatistic('hyperopt', start) 38 | 39 | # noinspection PyArgumentList 40 | conf = benchmark.get_configuration_space(HyperoptConverter(as_scope=False)) 41 | random_state = np.random.RandomState(self.seed) if self.seed is not None else None 42 | 43 | trials = Trials() 44 | # trials = MongoTrials('mongo://10.0.2.2:27017/hyperopt/jobs', exp_key='exp1') 45 | best = fmin(self.query_objective_function, 46 | space=conf, 47 | algo=tpe.suggest, 48 | max_evals=self.iterations, 49 | rstate=random_state, 50 | trials=trials) 51 | 52 | ls = [] 53 | for res in trials.results: 54 | if res['status'] == 'fail': 55 | if res['status_fail'] == 'Timeout reached': 56 | break 57 | else: 58 | logger.error('Unexpected error: {}'.format(res['status_fail'])) 59 | ls.append(EvaluationResult(res['start'], res['end'], res['loss'], best)) 60 | statistics.add_result(ls) 61 | 
statistics.stop_optimisation() 62 | 63 | return statistics 64 | 65 | def query_objective_function(self, conf): 66 | if (self.timeout is not None and time.time() > self.timeout): 67 | return { 68 | 'status': STATUS_FAIL, 69 | 'status_fail': 'Timeout reached' 70 | } 71 | 72 | res = self.benchmark.objective_function(conf) 73 | res['status'] = STATUS_OK 74 | res['loss'] = res['function_value'] 75 | return res 76 | -------------------------------------------------------------------------------- /config/util.py: -------------------------------------------------------------------------------- 1 | from autosklearn.pipeline.components.base import AutoSklearnComponent 2 | from autosklearn.pipeline.components.classification.adaboost import AdaboostClassifier 3 | from autosklearn.pipeline.components.classification.bernoulli_nb import BernoulliNB 4 | from autosklearn.pipeline.components.classification.decision_tree import DecisionTree 5 | from autosklearn.pipeline.components.classification.extra_trees import ExtraTreesClassifier 6 | from autosklearn.pipeline.components.classification.gaussian_nb import GaussianNB 7 | from autosklearn.pipeline.components.classification.gradient_boosting import GradientBoostingClassifier 8 | from autosklearn.pipeline.components.classification.k_nearest_neighbors import KNearestNeighborsClassifier 9 | from autosklearn.pipeline.components.classification.lda import LDA 10 | from autosklearn.pipeline.components.classification.liblinear_svc import LibLinear_SVC 11 | from autosklearn.pipeline.components.classification.libsvm_svc import LibSVM_SVC 12 | from autosklearn.pipeline.components.classification.multinomial_nb import MultinomialNB 13 | from autosklearn.pipeline.components.classification.passive_aggressive import PassiveAggressive 14 | from autosklearn.pipeline.components.classification.qda import QDA 15 | from autosklearn.pipeline.components.classification.random_forest import RandomForest 16 | from autosklearn.pipeline.components.classification.sgd import SGD 17 | 18 | 19 | class ConfigSpace: 20 | 21 | @staticmethod 22 | def sklearn_mapping(sklearn: str) -> type(AutoSklearnComponent): 23 | if sklearn == '': 24 | return AdaboostClassifier 25 | elif sklearn == 'sklearn.naive_bayes.BernoulliNB': 26 | return BernoulliNB 27 | elif sklearn == 'sklearn.tree.DecisionTreeClassifier': 28 | return DecisionTree 29 | elif sklearn == 'sklearn.ensemble.ExtraTreesClassifier': 30 | return ExtraTreesClassifier 31 | elif sklearn == 'sklearn.naive_bayes.GaussianNB': 32 | return GaussianNB 33 | elif sklearn == 'sklearn.ensemble.GradientBoostingClassifier': 34 | return GradientBoostingClassifier 35 | elif sklearn == 'sklearn.neighbors.KNeighborsClassifier': 36 | return KNearestNeighborsClassifier 37 | elif sklearn == 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis': 38 | return LDA 39 | elif sklearn == 'sklearn.svm.LinearSVC': 40 | return LibLinear_SVC 41 | elif sklearn == 'sklearn.svm.SVC': 42 | return LibSVM_SVC 43 | elif sklearn == 'sklearn.naive_bayes.MultinomialNB': 44 | return MultinomialNB 45 | elif sklearn == 'sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier': 46 | return PassiveAggressive 47 | elif sklearn == 'sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis': 48 | return QDA 49 | elif sklearn == 'sklearn.ensemble.RandomForestClassifier': 50 | return RandomForest 51 | elif sklearn == 'sklearn.linear_model.stochastic_gradient.SGDClassifier': 52 | return SGD 53 | # elif sklearn == '': 54 | # return XGradientBoostingClassifier 55 | raise 
NotImplementedError('Algorithm {} is not implemented yet'.format(sklearn)) 56 | -------------------------------------------------------------------------------- /adapter/run_hpsklearn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import traceback 3 | from typing import List 4 | 5 | import hpsklearn 6 | import hyperopt 7 | import sklearn 8 | from sklearn.pipeline import Pipeline 9 | 10 | 11 | def skip(id: int) -> bool: 12 | failed = [] 13 | return id in failed 14 | 15 | 16 | def setup(): 17 | pass 18 | 19 | 20 | def main(fold, timeout: int, run_timeout: int, score: bool = True): 21 | def run(): 22 | setup() 23 | X_train, y_train, X_test, y_test = fold 24 | estimator = hpsklearn.HyperoptEstimator( 25 | preprocessing=hpsklearn.components.any_preprocessing('pp'), 26 | classifier=hpsklearn.components.any_classifier('clf'), 27 | algo=hyperopt.tpe.suggest, 28 | trial_timeout=run_timeout, 29 | loss_fn=sklearn.metrics.roc_auc_score, 30 | max_evals=-1, 31 | timeout=timeout, 32 | seed=int(time.time()) 33 | ) 34 | estimator.fit(X_train, y_train) 35 | 36 | pipeline = load_model(str(estimator.best_model())) 37 | print(pipeline) 38 | pipeline.fit(X_train, y_train) 39 | if score: 40 | predictions = pipeline.predict(X_test) 41 | return 1 - sklearn.metrics.accuracy_score(y_test, predictions) 42 | else: 43 | try: 44 | predictions = pipeline.predict_proba(X_test) 45 | except Exception as e: 46 | if isinstance(e, KeyboardInterrupt): 47 | raise e 48 | traceback.print_exc() 49 | 50 | import numpy as np 51 | tmp = pipeline.predict(X_test) 52 | predictions = np.zeros((len(tmp), 2)) 53 | predictions[:, 1] = tmp 54 | 55 | return sklearn.metrics.roc_auc_score(y_test, predictions[:, 1]), pipeline 56 | 57 | for j in range(100): 58 | print('Attempt {}...'.format(j)) 59 | try: 60 | return run() 61 | except Exception as e: 62 | if isinstance(e, KeyboardInterrupt): 63 | raise e 64 | traceback.print_exc() 65 | else: 66 | traceback.print_exc() 67 | return 1 68 | 69 | 70 | # noinspection PyUnresolvedReferences 71 | def load_model(input: str): 72 | from sklearn.ensemble import RandomForestClassifier 73 | from sklearn.preprocessing import MinMaxScaler 74 | from sklearn.svm import SVC 75 | from sklearn.ensemble import GradientBoostingClassifier 76 | from sklearn.preprocessing import StandardScaler 77 | from sklearn.preprocessing import Normalizer 78 | from sklearn.ensemble import ExtraTreesClassifier 79 | from sklearn.ensemble import AdaBoostClassifier 80 | from sklearn.decomposition import PCA 81 | from sklearn.linear_model import SGDClassifier 82 | from sklearn.neighbors import KNeighborsClassifier 83 | from xgboost import XGBClassifier 84 | from numpy import nan 85 | 86 | dict = eval(input) 87 | 88 | assert dict['ex_preprocs'] == () 89 | steps = [] 90 | if dict['preprocs'] != (): 91 | steps.append(('preprocs', dict['preprocs'][0])) 92 | steps.append(('learner', dict['learner'])) 93 | pipeline = Pipeline(steps=steps) 94 | return pipeline 95 | 96 | 97 | def load_pipeline(input: str) -> List[List[str]]: 98 | pipeline = load_model(input) 99 | 100 | res = [] 101 | for s in pipeline.steps: 102 | res.append(type(s[1]).__name__) 103 | return [res] 104 | -------------------------------------------------------------------------------- /adapter/grid_search.py: -------------------------------------------------------------------------------- 1 | import math 2 | import multiprocessing 3 | import time 4 | 5 | import numpy as np 6 | from hpolib.abstract_benchmark import 
AbstractBenchmark 7 | from sklearn.model_selection import ParameterGrid 8 | 9 | from adapter.base import OptimizationStatistic, EvaluationResult, BaseAdapter, OBJECTIVE_TIME_FACTOR 10 | from config import GridSearchConverter 11 | from util.multiprocessor import NoDaemonPool 12 | 13 | 14 | def query_objective_function(candidates: ParameterGrid, benchmark: AbstractBenchmark, iterations: int, timeout: float, 15 | lock: multiprocessing.Lock, index: multiprocessing.Value): 16 | ls = [] 17 | idx = 0 18 | while timeout is None or time.time() < timeout: 19 | lock.acquire() 20 | i = index.value 21 | index.value += 1 22 | lock.release() 23 | 24 | if iterations is not None and i >= iterations: 25 | break 26 | 27 | try: 28 | config = candidates[idx] 29 | for key, value in config.items(): 30 | if isinstance(value, np.int64) or isinstance(value, np.float64): 31 | config[key] = value.item() 32 | 33 | # noinspection PyTypeChecker,PyArgumentList 34 | res = benchmark.objective_function(config) 35 | ls.append(EvaluationResult.from_dict(res, config)) 36 | idx += 1 37 | except IndexError: 38 | # Done 39 | lock.acquire() 40 | index.value -= 1 41 | lock.release() 42 | break 43 | return ls 44 | 45 | 46 | class ObjectiveGridSearch(BaseAdapter): 47 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None): 48 | super().__init__(n_jobs, time_limit, iterations) 49 | 50 | m = multiprocessing.Manager() 51 | self.lock = m.Lock() 52 | self.index = m.Value('i', 0) 53 | 54 | def estimate_grid_size(self, dimensions: int = 0, objective_time: float = None) -> int: 55 | if self.time_limit is not None: 56 | t = objective_time * OBJECTIVE_TIME_FACTOR + 0.0005 57 | n = (self.time_limit / t) ** (1 / dimensions) 58 | elif dimensions != 0: 59 | n = math.ceil(self.iterations ** (1 / dimensions)) 60 | else: 61 | n = 5 62 | 63 | return int(max(1, n)) 64 | 65 | # noinspection PyMethodOverriding 66 | def optimize(self, benchmark: AbstractBenchmark, grid_size: int = 10): 67 | start = time.time() 68 | timeout = start + self.time_limit if self.time_limit else None 69 | 70 | statistics = OptimizationStatistic('Grid Search', start) 71 | 72 | # noinspection PyArgumentList 73 | config_space = benchmark.get_configuration_space(GridSearchConverter(n=grid_size)) 74 | candidate_list = [] 75 | if benchmark.get_meta_information().get('cash', False): 76 | for key, value in config_space.items(): 77 | conf = value.copy() 78 | conf['algorithm'] = [key] 79 | candidate_list.append(ParameterGrid(conf)) 80 | else: 81 | candidate_list.append(ParameterGrid(config_space)) 82 | 83 | pool = NoDaemonPool(processes=self.n_jobs) 84 | for candidates in candidate_list: 85 | for i in range(self.n_jobs): 86 | pool.apply_async(query_objective_function, 87 | args=(candidates, benchmark, self.iterations, timeout, self.lock, self.index), 88 | callback=lambda res: statistics.add_result(res), 89 | error_callback=self.log_async_error) 90 | 91 | pool.close() 92 | pool.join() 93 | statistics.stop_optimisation() 94 | 95 | return statistics 96 | -------------------------------------------------------------------------------- /benchmark/ml.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import time 3 | 4 | import numpy as np 5 | from hpolib.abstract_benchmark import AbstractBenchmark 6 | from hpolib.util import rng_helper 7 | from sklearn import datasets 8 | from sklearn.model_selection import train_test_split 9 | 10 | import util.logger 11 | from config import BaseConverter, NoopConverter, 
MetaConfigCollection 12 | 13 | logger = util.logger.get() 14 | 15 | 16 | def create_estimator(conf: dict): 17 | try: 18 | name = conf['algorithm'] 19 | kwargs = conf.copy() 20 | del kwargs['algorithm'] 21 | 22 | module_name = name.rpartition(".")[0] 23 | class_name = name.split(".")[-1] 24 | 25 | for key, value in kwargs.items(): 26 | if isinstance(value, float) and int(value) == value: 27 | kwargs[key] = int(value) 28 | 29 | module = importlib.import_module(module_name) 30 | class_ = getattr(module, class_name) 31 | return class_(**kwargs) 32 | except Exception as ex: 33 | logger.error('Invalid estimator with config {}'.format(conf)) 34 | raise ex 35 | 36 | 37 | class Iris(AbstractBenchmark): 38 | 39 | def __init__(self, test_size=0.3): 40 | super().__init__() 41 | iris = datasets.load_iris() 42 | 43 | X = iris.data 44 | y = iris.target 45 | 46 | self.X_train, self.y_train, self.X_test, self.y_test = train_test_split(X, y, test_size=test_size) 47 | self.X_train, self.X_valid, self.y_train, self.y_valid = train_test_split(self.X_train, self.y_train, 48 | test_size=test_size) 49 | 50 | def objective_function(self, configuration, dataset_fraction=1, **kwargs): 51 | start_time = time.time() 52 | 53 | rng = kwargs.get("rng", None) 54 | self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) 55 | 56 | shuffle = self.rng.permutation(self.X_train.shape[0]) 57 | size = int(dataset_fraction * self.X_train.shape[0]) 58 | 59 | X_train = self.X_train[shuffle[:size]] 60 | y_train = self.y_train[shuffle[:size]] 61 | 62 | try: 63 | clf = create_estimator(configuration) 64 | clf.fit(X_train, y_train) 65 | y = 1 - clf.score(self.X_valid, self.y_valid) 66 | except Exception as ex: 67 | logger.error('Uncaught expection {} for {}'.format(ex, configuration)) 68 | y = 1 69 | 70 | c = time.time() - start_time 71 | return {'function_value': y, 'cost': c, 'start': start_time, 'end': start_time + c} 72 | 73 | def objective_function_test(self, configuration, **kwargs): 74 | start_time = time.time() 75 | 76 | rng = kwargs.get("rng", None) 77 | self.rng = rng_helper.get_rng(rng=rng, self_rng=self.rng) 78 | 79 | X_train = np.concatenate((self.X_train, self.X_valid)) 80 | y_train = np.concatenate((self.y_train, self.y_valid)) 81 | 82 | try: 83 | clf = create_estimator(configuration) 84 | clf.fit(X_train, y_train) 85 | y = 1 - clf.score(self.X_test, self.y_test) 86 | except Exception as ex: 87 | logger.error('Uncaught expection {} for {}'.format(ex, configuration)) 88 | y = 1 89 | 90 | c = time.time() - start_time 91 | return {'function_value': y, "cost": c} 92 | 93 | @staticmethod 94 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 95 | return converter.convert(MetaConfigCollection.from_json('../assets/classifier.json')) 96 | 97 | @staticmethod 98 | def get_meta_information(): 99 | return {'name': 'Iris', 'cash': True} 100 | -------------------------------------------------------------------------------- /adapter/bohb.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import hpbandster.core.nameserver as hpns 4 | from hpbandster.optimizers import BOHB 5 | from hpbandster.workers.hpolibbenchmark import Worker 6 | from hpolib.abstract_benchmark import AbstractBenchmark 7 | 8 | from adapter.base import BaseAdapter, OptimizationStatistic, EvaluationResult 9 | from config import ConfigSpaceConverter 10 | from util.multiprocessor import NoDaemonPool 11 | 12 | nameserver = '127.0.0.1' 13 | 14 | 15 | def start_worker(benchmark: 
AbstractBenchmark, run_id: str, id: int): 16 | # noinspection PyArgumentList 17 | conf = benchmark.get_configuration_space(ConfigSpaceConverter()) 18 | 19 | w = HPOlib2Worker(benchmark, configspace=conf, nameserver=nameserver, run_id=run_id, id=id, config_as_array=False) 20 | w.run(background=False) 21 | 22 | 23 | class BohbAdapter(BaseAdapter): 24 | 25 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 26 | super().__init__(n_jobs, time_limit, iterations, seed) 27 | 28 | def optimize(self, benchmark: AbstractBenchmark, min_budget: int = 0.1, 29 | max_budget: int = 1) -> OptimizationStatistic: 30 | start = time.time() 31 | statistics = OptimizationStatistic('BOHB', start) 32 | 33 | run_id = '{}_{}'.format(benchmark.get_meta_information()['name'], 0) 34 | ns = hpns.NameServer(run_id=run_id, host=nameserver, port=None) 35 | ns.start() 36 | 37 | # noinspection PyArgumentList 38 | conf = benchmark.get_configuration_space(ConfigSpaceConverter()) 39 | 40 | pool = NoDaemonPool(processes=self.n_jobs) 41 | for i in range(self.n_jobs): 42 | pool.apply_async(start_worker, args=(benchmark, run_id, i), error_callback=self.log_async_error) 43 | 44 | bohb = BOHB(configspace=conf, run_id=run_id, min_budget=min_budget, max_budget=max_budget) 45 | # Fix number of iterations, such that in total self.iterations objective function is called 46 | n = (self.iterations * 0.9) / 6 47 | res = bohb.run(n_iterations=n, min_n_workers=self.n_jobs) 48 | 49 | bohb.shutdown(shutdown_workers=True) 50 | ns.shutdown() 51 | 52 | pool.close() 53 | pool.join() 54 | 55 | configs = res.get_id2config_mapping() 56 | ls = [] 57 | for run in res.get_all_runs(): 58 | ls.append(EvaluationResult.from_dict(run.info, configs[run.config_id]['config'])) 59 | statistics.add_result(ls) 60 | statistics.stop_optimisation() 61 | 62 | return statistics 63 | 64 | 65 | class HPOlib2Worker(Worker): 66 | def __init__(self, benchmark, configspace=None, budget_name='budget', budget_preprocessor=None, 67 | config_as_array=True, **kwargs): 68 | 69 | super().__init__(**kwargs) 70 | self.benchmark = benchmark 71 | 72 | if configspace is None: 73 | self.configspace = benchmark.get_configuration_space() 74 | else: 75 | self.configspace = configspace 76 | 77 | self.budget_name = budget_name 78 | 79 | if budget_preprocessor is None: 80 | self.budget_preprocessor = lambda b: b 81 | else: 82 | self.budget_preprocessor = budget_preprocessor 83 | 84 | self.config_as_array = config_as_array 85 | 86 | def compute(self, config, budget, **kwargs): 87 | c = {} 88 | 89 | algorithm = config.get('__choice__', '') 90 | if len(algorithm) > 0: 91 | n = len(algorithm) + 1 92 | c['algorithm'] = algorithm 93 | else: 94 | n = 0 95 | 96 | for key, value in config.items(): 97 | if key == '__choice__': 98 | continue 99 | c[key[n:]] = value 100 | 101 | kwargs = {self.budget_name: self.budget_preprocessor(budget)} 102 | res = self.benchmark.objective_function(c, **kwargs) 103 | return ({ 104 | 'loss': res['function_value'], 105 | 'info': res 106 | }) 107 | -------------------------------------------------------------------------------- /adapter/random_search.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import random 3 | import time 4 | 5 | import numpy as np 6 | from hpolib.abstract_benchmark import AbstractBenchmark 7 | from sklearn.model_selection import ParameterSampler 8 | from sklearn.utils import check_random_state 9 | 10 | import util.logger 11 | from 
adapter.base import OptimizationStatistic, EvaluationResult, BaseAdapter 12 | from config import RandomSearchConverter 13 | from util.multiprocessor import NoDaemonPool 14 | 15 | logger = util.logger.get() 16 | 17 | 18 | class CustomParameterSampler(ParameterSampler): 19 | def __iter__(self): 20 | sample = ParameterSampler.__iter__(self) 21 | rnd = check_random_state(self.random_state) 22 | for s in sample: 23 | for k, v in s.items(): 24 | if hasattr(v, "rvs"): 25 | s[k] = v.rvs(random_state=rnd) 26 | yield s 27 | 28 | 29 | def timed_query(benchmark: AbstractBenchmark, timeout: float, seed: int): 30 | random_state = np.random.RandomState(seed) 31 | ls = [] 32 | while time.time() < timeout: 33 | # noinspection PyTypeChecker,PyArgumentList 34 | conf = list(CustomParameterSampler(benchmark.get_configuration_space(RandomSearchConverter()), 1, 35 | random_state=random_state))[0] 36 | res = benchmark.objective_function(conf) 37 | ls.append(EvaluationResult.from_dict(res, conf)) 38 | return ls 39 | 40 | 41 | def run_counted_query(benchmark: AbstractBenchmark, iterations: int, seed: int, 42 | lock: multiprocessing.Lock, index: multiprocessing.Value, ): 43 | random_state = np.random.RandomState(seed) 44 | ls = [] 45 | while True: 46 | lock.acquire() 47 | i = index.value 48 | index.value += 1 49 | lock.release() 50 | 51 | if i >= iterations: 52 | break 53 | 54 | # noinspection PyTypeChecker,PyArgumentList 55 | cs = benchmark.get_configuration_space(RandomSearchConverter()) 56 | if benchmark.get_meta_information().get('cash', False): 57 | key, value = random.sample(list(cs.items()), 1)[0] 58 | cs = value.copy() 59 | cs['algorithm'] = [key] 60 | conf = list(CustomParameterSampler(cs, 1, random_state=random_state))[0] 61 | res = benchmark.objective_function(conf) 62 | ls.append(EvaluationResult.from_dict(res, conf)) 63 | return ls 64 | 65 | 66 | class ObjectiveRandomSearch(BaseAdapter): 67 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 68 | super().__init__(n_jobs, time_limit, iterations, seed) 69 | 70 | if self.seed is None: 71 | raise ValueError('seed is required for random search') 72 | 73 | m = multiprocessing.Manager() 74 | self.lock = m.Lock() 75 | self.index = m.Value('i', 0) 76 | 77 | # noinspection PyMethodOverriding 78 | def optimize(self, benchmark: AbstractBenchmark): 79 | start = time.time() 80 | statistics = OptimizationStatistic('Random Search', start) 81 | 82 | pool = NoDaemonPool(processes=self.n_jobs) 83 | for i in range(self.n_jobs): 84 | if self.time_limit is not None: 85 | timeout = start + self.time_limit 86 | pool.apply_async(timed_query, args=(benchmark, timeout, self.seed + i), 87 | callback=lambda res: statistics.add_result(res), 88 | error_callback=self.log_async_error) 89 | else: 90 | pool.apply_async(run_counted_query, 91 | args=(benchmark, self.iterations, self.seed + i, self.lock, self.index), 92 | callback=lambda res: statistics.add_result(res), 93 | error_callback=self.log_async_error) 94 | 95 | pool.close() 96 | pool.join() 97 | statistics.stop_optimisation() 98 | 99 | return statistics 100 | -------------------------------------------------------------------------------- /adapter/run_atm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import signal 4 | import subprocess 5 | import time 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import sklearn 10 | from atm import ATM, Model 11 | from sklearn.pipeline import Pipeline 12 | from 
typing import List 13 | 14 | from benchmark import OpenMLBenchmark 15 | 16 | 17 | def skip(id: int) -> bool: 18 | failed = [] 19 | return id in failed 20 | 21 | 22 | def setup(): 23 | try: 24 | shutil.rmtree('/tmp/atm/') 25 | except OSError as e: 26 | pass 27 | os.mkdir('/tmp/atm/') 28 | 29 | 30 | def main(fold, bm: OpenMLBenchmark, timeout: int, jobs: int, score: bool = True) -> float: 31 | setup() 32 | X_train, y_train, X_test, y_test = fold 33 | 34 | headers = bm.column_names + ['class'] 35 | train = np.c_[X_train, y_train] 36 | test = np.c_[X_test, y_test] 37 | 38 | os.mkdir('/tmp/atm/{}'.format(bm.task_id)) 39 | train_path = '/tmp/atm/{}/train.csv'.format(bm.task_id) 40 | test_path = '/tmp/atm/{}/test.csv'.format(bm.task_id) 41 | pd.DataFrame(train, columns=headers).to_csv(train_path, index=None) 42 | pd.DataFrame(test, columns=headers).to_csv(test_path, index=None) 43 | 44 | sql_path = '{}/assets/atm_sql.yaml'.format(os.getcwd()) 45 | cmd = 'atm enter_data --sql-config {sql} --train-path {train_path} --test-path {test_path}' \ 46 | ' --budget-type walltime --budget {budget} --metric accuracy --name {name}' \ 47 | .format(sql=sql_path, train_path=train_path, test_path=test_path, budget=timeout // 60, name=bm.task_id) 48 | subprocess.call(cmd, shell=True) 49 | 50 | cmd = 'atm worker --no-save --sql-config {}'.format(sql_path) 51 | 52 | procs = [subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid) for i in range(jobs)] 53 | 54 | start = time.time() 55 | while time.time() - start <= 1.05 * timeout: 56 | if any(p.poll() is None for p in procs): 57 | time.sleep(10) 58 | else: 59 | break 60 | else: 61 | print('Grace period exceed. Killing workers.') 62 | for p in procs: 63 | os.killpg(os.getpgid(p.pid), signal.SIGTERM) 64 | p.terminate() 65 | 66 | # Only used to mark datarun as finished. 
Should terminate immediately 67 | proc = subprocess.Popen(cmd, shell=True) 68 | proc.wait() 69 | 70 | atm = ATM() 71 | dataruns = atm.db.get_dataruns(ignore_complete=False, ignore_running=True, ignore_pending=True) 72 | datarun = max(dataruns, key=lambda d: d.id) 73 | best = datarun.get_best_classifier() 74 | 75 | hp = atm.db.get_hyperpartition(best.hyperpartition_id) 76 | model = Model(hp.method, best.hyperparameter_values, '', '') 77 | model._make_pipeline() 78 | pipeline = model.pipeline 79 | 80 | pipeline.fit(X_train, y_train) 81 | 82 | if score: 83 | predictions = pipeline.predict(X_test) 84 | return 1 - sklearn.metrics.accuracy_score(y_test, predictions) 85 | else: 86 | predictions = pipeline.predict_proba(X_test) 87 | return sklearn.metrics.roc_auc_score(y_test, predictions[:, 1]), pipeline 88 | 89 | 90 | # noinspection PyUnresolvedReferences 91 | def load_model(input: str): 92 | import sklearn 93 | from sklearn.linear_model import LogisticRegression 94 | from sklearn.neighbors import KNeighborsClassifier 95 | from sklearn.preprocessing import StandardScaler 96 | from sklearn.tree import DecisionTreeClassifier 97 | 98 | pipeline: Pipeline = eval(input) 99 | return pipeline 100 | 101 | 102 | # noinspection PyUnresolvedReferences 103 | def load_pipeline(input: str) -> List[List[str]]: 104 | pipeline: Pipeline = load_model(input) 105 | res = [] 106 | for s in pipeline.steps: 107 | res.append(type(s[1]).__name__) 108 | 109 | return [res] 110 | 111 | ########### 112 | # Get results via 113 | # 114 | # SELECT ds.name, 1 - max(cs.test_judgment_metric) as 'Misclassification rate' FROM classifiers cs 115 | # JOIN dataruns dr ON cs.datarun_id = dr.id 116 | # JOIN datasets ds ON dr.dataset_id = ds.id 117 | # GROUP BY cs.datarun_id 118 | # ORDER BY CAST(name AS INTEGER) 119 | -------------------------------------------------------------------------------- /adapter/robo.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | import numpy as np 5 | from hpolib.abstract_benchmark import AbstractBenchmark 6 | from robo.fmin import bayesian_optimization 7 | 8 | from adapter.base import BaseAdapter, OptimizationStatistic, EvaluationResult 9 | from config import RoBoConverter, MetaConfigCollection, UNI_FLOAT, UNI_INT, CATEGORICAL 10 | 11 | 12 | class RoBoAdapter(BaseAdapter): 13 | 14 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 15 | super().__init__(n_jobs, time_limit, iterations, seed) 16 | self.benchmark = None 17 | 18 | if self.iterations is None: 19 | raise NotImplementedError('Timeout not supported yet') 20 | 21 | # noinspection PyMethodOverriding 22 | def optimize(self, benchmark: AbstractBenchmark, model_type: str = 'gp_mcmc'): 23 | self.benchmark = benchmark 24 | 25 | start = time.time() 26 | statistics = OptimizationStatistic('RoBo {}'.format(model_type), start) 27 | 28 | ls = [] 29 | if benchmark.get_meta_information().get('cash', False): 30 | # noinspection PyArgumentList 31 | for key, value in benchmark.get_configuration_space(RoBoConverter()).items(): 32 | ls.append({ 33 | 'lower': value[0], 34 | 'upper': value[1], 35 | 'names': value[2], 36 | 'algorithm': key 37 | }) 38 | else: 39 | # noinspection PyArgumentList 40 | lower, upper, names = benchmark.get_configuration_space(RoBoConverter()) 41 | ls.append({ 42 | 'lower': lower, 43 | 'upper': upper, 44 | 'names': names 45 | }) 46 | 47 | random_state = np.random.RandomState(self.seed) if self.seed is not None else 
None 48 | cs = benchmark.get_configuration_space() 49 | 50 | n = max(3, math.ceil(self.iterations / len(ls))) 51 | res = [] 52 | for config in ls: 53 | tmp = bayesian_optimization(lambda x: self.objective_function(x, config, cs), 54 | config['lower'], config['upper'], 55 | model_type=model_type, 56 | num_iterations=n, 57 | rng=random_state) 58 | res.append(tmp) 59 | 60 | result = [] 61 | previous = start 62 | for idx, r in enumerate(res): 63 | for i in range(len(r['runtime'])): 64 | d = {} 65 | for j, name in enumerate(ls[idx]['names']): 66 | d[name] = r['incumbents'][i][j] 67 | 68 | begin = previous + r['overhead'][i] 69 | result.append( 70 | EvaluationResult(begin, start + r['runtime'][i], r['incumbent_values'][i], d) 71 | ) 72 | previous = start + r['runtime'][i] 73 | 74 | statistics.add_result(result) 75 | statistics.stop_optimisation() 76 | 77 | return statistics 78 | 79 | def objective_function(self, x, config: dict, config_space: MetaConfigCollection): 80 | if 'algorithm' not in config: 81 | res = self.benchmark.objective_function(x) 82 | else: 83 | d = { 84 | 'algorithm': config['algorithm'] 85 | } 86 | cs = config_space.algos[d['algorithm']] 87 | for idx, name in enumerate(config['names']): 88 | feature = cs.dict[name] 89 | if feature.type == UNI_FLOAT: 90 | d[name] = x[idx] 91 | else: 92 | x_int = int(round(x[idx])) 93 | if feature.type == UNI_INT: 94 | d[name] = x_int 95 | elif feature.type == CATEGORICAL: 96 | d[name] = feature.choices[x_int] 97 | else: 98 | raise ValueError('Unknown type {}'.format(feature.type)) 99 | res = self.benchmark.objective_function(d) 100 | 101 | return res['function_value'] 102 | -------------------------------------------------------------------------------- /comparison_human.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import multiprocessing 3 | import sys 4 | import time 5 | import traceback 6 | import warnings 7 | 8 | import humanfriendly 9 | 10 | from benchmark import SantanderBenchmark 11 | 12 | timeout = 3600 # in seconds 13 | run_timeout = 600 # in seconds 14 | jobs = 4 15 | 16 | 17 | def run(it) -> None: 18 | bm = SantanderBenchmark() 19 | print('##\nIteration {} at {}\n##'.format(i, datetime.datetime.now().time())) 20 | with warnings.catch_warnings(): 21 | warnings.simplefilter("ignore", category=RuntimeWarning) 22 | res = [] 23 | 24 | for fold in bm.folds: 25 | try: 26 | if algorithm == 'atm': 27 | from adapter import run_atm 28 | score, estimator = run_atm.main(fold, bm, timeout, jobs, score=False) 29 | predictions = estimator.predict_proba(bm.X_test) 30 | elif algorithm == 'random': 31 | from adapter import run_auto_sklearn 32 | score, estimator = run_auto_sklearn.main(fold, bm, timeout, run_timeout, jobs, random=True, 33 | score=False) 34 | predictions = estimator.predict_proba(bm.X_test) 35 | elif algorithm == 'auto-sklearn': 36 | from adapter import run_auto_sklearn 37 | score, estimator = run_auto_sklearn.main(fold, bm, timeout, run_timeout, jobs, random=False, 38 | score=False) 39 | predictions = estimator.predict_proba(bm.X_test) 40 | elif algorithm == 'dummy': 41 | from adapter import run_baseline 42 | score, estimator = run_baseline.main(fold, dummy=True, score=False) 43 | predictions = estimator.predict_proba(bm.X_test) 44 | elif algorithm == 'rf': 45 | from adapter import run_baseline 46 | score, estimator = run_baseline.main(fold, dummy=False, score=False) 47 | predictions = estimator.predict_proba(bm.X_test) 48 | elif algorithm == 'h2o': 49 | from adapter import 
run_h2o 50 | score, estimator = run_h2o.main(fold, bm, timeout, run_timeout, jobs, score=False) 51 | df = estimator.predict(run_h2o._createFrame(bm.X_test)).as_data_frame() 52 | predictions = df.values[:, 1:] 53 | run_h2o._cleanup(None) 54 | elif algorithm == 'hpsklearn': 55 | from adapter import run_hpsklearn 56 | score, estimator = run_hpsklearn.main(fold, timeout, run_timeout, score=False) 57 | predictions = estimator.predict_proba(bm.X_test) 58 | elif algorithm == 'tpot': 59 | from adapter import run_tpot 60 | score, estimator = run_tpot.main(fold, timeout, run_timeout, jobs, score=False) 61 | predictions = estimator.predict_proba(bm.X_test) 62 | else: 63 | raise ValueError('Unknown algorithm {}'.format(algorithm)) 64 | print(score) 65 | 66 | bm.format_output(predictions, algorithm, it * 10 + len(res)) 67 | res.append((score, predictions)) 68 | except Exception: 69 | traceback.print_exc() 70 | 71 | 72 | if __name__ == '__main__': 73 | algorithm = sys.argv[1] 74 | idx = int(sys.argv[2]) if len(sys.argv) > 2 else None 75 | 76 | print('Algorithm: ', algorithm) 77 | print('Timeout: ', timeout) 78 | print('Run Timeout: ', run_timeout) 79 | 80 | for i in range(0, 3): 81 | try: 82 | start = time.time() 83 | 84 | p = multiprocessing.Process(target=run, args=(i,)) 85 | p.start() 86 | 87 | p.join(timeout * 1.5) 88 | 89 | if p.is_alive(): 90 | print('Grace period exceed. Stopping benchmark.') 91 | p.terminate() 92 | p.join() 93 | print('Duration', humanfriendly.format_timespan(time.time() - start)) 94 | except Exception as e: 95 | if isinstance(e, KeyboardInterrupt): 96 | raise e 97 | traceback.print_exc() 98 | -------------------------------------------------------------------------------- /adapter/smac.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Dict, List 3 | 4 | import numpy as np 5 | from ConfigSpace import Configuration 6 | from hpolib.abstract_benchmark import AbstractBenchmark 7 | from smac.facade.smac_facade import SMAC 8 | from smac.runhistory.runhistory import RunKey, RunValue 9 | from smac.scenario.scenario import Scenario 10 | 11 | from adapter.base import OptimizationStatistic, EvaluationResult, BaseAdapter 12 | from config import ConfigSpaceConverter 13 | from util import multiprocessor 14 | 15 | 16 | def query_objective_function(benchmark: AbstractBenchmark, idx: int, seed: int, 17 | time_limit: float = None, iterations=None): 18 | # noinspection PyArgumentList 19 | cs = benchmark.get_configuration_space(ConfigSpaceConverter()) 20 | name = benchmark.get_meta_information()['name'] 21 | random_state = np.random.RandomState(seed) 22 | 23 | scenario = { 24 | 'abort_on_first_run_crash': True, 25 | 'run_obj': 'quality', 26 | 'deterministic': True, 27 | 'shared-model': True, 28 | 29 | # 'cutoff_time': 10, 30 | 'cs': cs, 31 | 'initial_incumbent': 'RANDOM', 32 | 33 | 'input_psmac_dirs': '/tmp/smac/{:s}/in/'.format(name), 34 | 'output_dir': '/tmp/smac/{:s}/out/{:d}/{:d}'.format(name, int(time.time()), idx) 35 | } 36 | 37 | if time_limit is not None: 38 | scenario['wallclock_limit'] = time_limit 39 | else: 40 | scenario['runcount_limit'] = iterations 41 | 42 | def objective_function(configuration, **kwargs): 43 | d = {} 44 | algorithm = configuration._values.get('__choice__', '') 45 | if len(algorithm) > 0: 46 | n = len(algorithm) + 1 47 | d['algorithm'] = algorithm 48 | else: 49 | n = 0 50 | 51 | for key, value in configuration._values.items(): 52 | if key == '__choice__': 53 | continue 54 | d[key[n:]] = value 
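        # Illustrative example of the flattening above (hypothetical values,
        # not from a real run): a sampled CASH configuration such as
        #     {'__choice__': 'random_forest', 'random_forest:max_depth': 5}
        # becomes the plain dict expected by the benchmark:
        #     {'algorithm': 'random_forest', 'max_depth': 5}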
55 | return benchmark.objective_function(d, **kwargs)['function_value'] 56 | 57 | smac = SMAC(scenario=Scenario(scenario), tae_runner=objective_function, rng=random_state) 58 | x_star = smac.optimize() 59 | 60 | return smac.runhistory.data, x_star 61 | 62 | 63 | class SmacAdapter(BaseAdapter): 64 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 65 | super().__init__(n_jobs, time_limit, iterations, seed) 66 | 67 | if self.seed is None: 68 | raise ValueError('seed is required for smac') 69 | 70 | # noinspection PyMethodOverriding 71 | def optimize(self, benchmark: AbstractBenchmark, mean_objective_time: float = 0.1): 72 | start = time.time() 73 | statistics = OptimizationStatistic('SMAC', start) 74 | 75 | pool = multiprocessor.NoDaemonPool(processes=self.n_jobs) 76 | for i in range(self.n_jobs): 77 | pool.apply_async(query_objective_function, 78 | args=(benchmark, i, self.seed + i, self.time_limit, self.iterations / self.n_jobs), 79 | callback=lambda res: statistics.add_result( 80 | self._transform_result(res[0], res[1], start, mean_objective_time) 81 | ), 82 | error_callback=self.log_async_error) 83 | pool.close() 84 | pool.join() 85 | statistics.stop_optimisation() 86 | 87 | return statistics 88 | 89 | @staticmethod 90 | def _transform_result(history: Dict[RunKey, RunValue], best: Configuration, 91 | start: float, mean_objective_time: float) -> List: 92 | end = time.time() 93 | n = len(history.values()) 94 | 95 | # Exact overhead is not known. Mean and standard deviation empirically computed and now faked 96 | total = end - start - n * mean_objective_time 97 | overhead = np.random.normal(0.02149568831957658, 0.002992145452598064, n) 98 | overhead = (overhead / overhead.sum()) * total 99 | 100 | res = [] 101 | for i, run_value in enumerate(history.values()): 102 | t = start + mean_objective_time * i + np.cumsum(overhead)[i] 103 | 104 | res.append(EvaluationResult(t, t + mean_objective_time, run_value.cost, best.get_dictionary())) 105 | return res 106 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # AutoML-Benchmark 2 | This project evaluates the performance of various AutoML frameworks on different benchmark datasets. A detailed description of the evaluated frameworks and detailed evaluation results is available in our [survey paper](https://arxiv.org/abs/1904.12054). The source code is available on [GitHub](https://github.com/Ennosigaeon/automl_benchmark). 3 | 4 | ## Installation 5 | - Install swig `sudo apt install swig` 6 | - Install build-essential `sudo apt install build-essential` 7 | - Install auto-sklearn requirements `curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip3 install` 8 | - Install hpolib2 `pip3 install git+https://github.com/automl/HPOlib1.5.git` 9 | - Install mysql client `sudo apt install libmysqlclient-dev` 10 | - Install all other requirements via `pip3 install -r requirements.txt` 11 | - Install ATM (https://github.com/HDI-Project/ATM) 12 | - Install RoBo (https://github.com/automl/RoBO) 13 | - Install Optunity (https://optunity.readthedocs.io/en/latest/user/installation.html) 14 | 15 | Some of the tested AutoML frameworks require some hotfixes to actually work. Unfortunately, it is not possible to list all required changes as most of the frameworks are still in development and regularly updated. 
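After installation, a quick smoke test (an illustrative snippet, not part of the repository) verifies that the core dependencies are importable:
```python
# Illustrative sanity check: these imports cover the main optimizers used by
# the benchmark. An ImportError here usually indicates a broken installation.
import autosklearn   # noqa: F401
import hpbandster    # noqa: F401
import smac          # noqa: F401
import tpot          # noqa: F401
from hpolib.abstract_benchmark import AbstractBenchmark  # noqa: F401
```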
16 | 
17 | 
18 | ## Usage
19 | To actually use the benchmark, adapt the _run.py_ file. The tested frameworks are configured at the head of the file:
20 | ```python
21 | config_dict = {
22 |     'n_jobs': 1,
23 |     'timeout': None,
24 |     'iterations': 500,
25 |     'seed': int(time.time()),
26 | 
27 |     'random_search': True,
28 |     'grid_search': False,
29 |     'smac': False,
30 |     'hyperopt': False,
31 |     'bohb': False,
32 |     'robo': False,
33 |     'optunity': False,
34 |     'btb': False
35 | }
36 | ```
37 | Parameters are:
38 | * `n_jobs` defines the number of parallel processes.
39 | * `timeout` defines the maximum evaluation time. This option is not supported by all frameworks. Cannot be used together with `iterations`.
40 | * `iterations` defines the maximum number of iterations. Cannot be used together with `timeout`.
41 | * `seed` defines the random state to make evaluations reproducible.
42 | * One boolean flag per supported framework, indicating whether that framework should be evaluated.
43 | 
44 | Next, configure the tested benchmark at the bottom of the file.
45 | 
46 | ```python
47 | logger.info('Main start')
48 | try:
49 |     persistence = MongoPersistence(args.database, read_only=False)
50 |     b = benchmark.Rosenbrock20D()
51 |     for i in range(20):
52 |         run(persistence, b)
53 | except (SystemExit, KeyboardInterrupt, Exception) as e:
54 |     logger.error(e, exc_info=True)
55 | logger.info('Main finished')
56 | ```
57 | 
58 | Finally, execute the _run.py_ script with the mandatory parameter `--database`:
59 | ```bash
60 | python3 run.py --database localhost
61 | ```
62 | All results are stored in the provided MongoDB.
63 | 
64 | ### Implemented AutoML Frameworks
65 | Currently implemented are adapters for:
66 | * [BoHB](https://github.com/automl/HpBandSter)
67 | * [BTB](https://github.com/HDI-Project/BTB)
68 | * [Grid Search](https://scikit-learn.org/stable/modules/grid_search.html)
69 | * [hyperopt](https://github.com/hyperopt/hyperopt)
70 | * [Optunity](https://github.com/claesenm/optunity)
71 | * [Random Search](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)
72 | * [RoBO](https://github.com/automl/RoBO)
73 | * [SMAC](https://github.com/automl/SMAC3)
74 | 
75 | Each of these frameworks is configured via a unified search space configuration. An example configuration is provided in the _assets_ folder.
76 | 
77 | ### Implemented Benchmarks
78 | Implemented are several synthetic test functions, the Iris dataset, and an OpenML benchmark. The OpenML benchmark can test a single OpenML dataset or a complete OpenML suite.
79 | 
80 | Example usage:
81 | ```python
82 | logger.info('Main start')
83 | try:
84 |     persistence = MongoPersistence(args.database, read_only=False)
85 |     for i in range(20):
86 |         for b in benchmark.OpenML100Suite().load(chunk=args.chunk):
87 |             logger.info('Starting OpenML benchmark {}'.format(b.task_id))
88 |             run(persistence, b)
89 | except (SystemExit, KeyboardInterrupt, Exception) as e:
90 |     logger.error(e, exc_info=True)
91 | 
92 | logger.info('Main finished')
93 | ```
94 | With the optional parameter `--chunk`, only a subset of the datasets is evaluated. This option can be used to distribute the evaluation across a cluster.
95 | 
96 | 
97 | ## Evaluating complete ML Pipelines
98 | This code also allows the evaluation of frameworks building complete ML pipelines; a minimal usage sketch is shown below.
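The following sketch is illustrative only; it assumes the positional `main(fold, timeout, run_timeout, jobs)` signature used by _run_framework.py_ and an arbitrary OpenML task id:
```python
# Minimal driver for a pipeline framework (TPOT in this sketch). Each fold is
# a (X_train, y_train, X_test, y_test) tuple; main() returns the
# misclassification rate on the test split.
from adapter import run_tpot
from benchmark import OpenMLBenchmark

bm = OpenMLBenchmark(31)  # hypothetical OpenML task id
for fold in bm.folds:
    error = run_tpot.main(fold, 3600, 600, 4)  # timeout, run_timeout, jobs
    print('Misclassification rate:', error)
```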
Currently implemented are 99 | * [ATM](https://github.com/HDI-Project/ATM) 100 | * [auto-sklearn](https://github.com/automl/auto-sklearn) 101 | * [hyperopt-sklearn](https://github.com/hyperopt/hyperopt-sklearn) 102 | * [TPOT](https://github.com/EpistasisLab/tpot) 103 | 104 | For each framework, a dedicated run script exists. 105 | -------------------------------------------------------------------------------- /adapter/optunity_adapter.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import re 3 | import time 4 | 5 | from hpolib.abstract_benchmark import AbstractBenchmark 6 | from optunity import search_spaces 7 | from optunity.api import suggest_solver, make_solver, _wrap_hard_box_constraints, optimize 8 | from optunity.functions import wraps, CallLog 9 | 10 | from adapter.base import BaseAdapter, OptimizationStatistic, EvaluationResult 11 | from config import OptunityConverter 12 | 13 | 14 | def _fun(f, q_in, q_out): 15 | while True: 16 | i, x = q_in.get() 17 | if i is None: 18 | break 19 | value = f(*x) 20 | if hasattr(f, 'call_log'): 21 | k = list(f.call_log.keys())[-1] 22 | q_out.put((i, value, k)) 23 | else: 24 | q_out.put((i, value)) 25 | 26 | 27 | # http://stackoverflow.com/a/16071616 28 | def pmap(f, *args, **kwargs): 29 | """Parallel map using multiprocessing. 30 | 31 | :param f: the callable 32 | :param args: arguments to f, as iterables 33 | :returns: a list containing the results 34 | 35 | .. warning:: 36 | This function will not work in IPython: https://github.com/claesenm/optunity/issues/8. 37 | 38 | .. warning:: 39 | Python's multiprocessing library is incompatible with Jython. 40 | 41 | """ 42 | nprocs = kwargs.get('number_of_processes', multiprocessing.cpu_count()) 43 | q_in = multiprocessing.Queue(1) 44 | q_out = multiprocessing.Queue() 45 | 46 | proc = [multiprocessing.Process(target=_fun, args=(f, q_in, q_out)) 47 | for _ in range(nprocs)] 48 | for p in proc: 49 | p.daemon = False 50 | p.start() 51 | 52 | sent = [q_in.put((i, x)) for i, x in enumerate(zip(*args))] 53 | [q_in.put((None, None)) for _ in range(nprocs)] 54 | res = [q_out.get() for _ in range(len(sent))] 55 | [p.join() for p in proc] 56 | 57 | # FIXME: strong coupling between pmap and functions.logged 58 | if hasattr(f, 'call_log'): 59 | for _, value, k in sorted(res): 60 | f.call_log[k] = value 61 | return [x for i, x, _ in sorted(res)] 62 | else: 63 | return [x for i, x in sorted(res)] 64 | 65 | 66 | def create_pmap(number_of_processes): 67 | def pmap_bound(f, *args): 68 | return pmap(f, *args, number_of_processes=number_of_processes) 69 | 70 | return pmap_bound 71 | 72 | 73 | def logged(f): 74 | if hasattr(f, 'call_log'): 75 | return f 76 | 77 | @wraps(f) 78 | def wrapped_f(*args, **kwargs): 79 | config = {k: v for k, v in kwargs.items() if v is not None and k != ''} 80 | for key, value in config.items(): 81 | if isinstance(value, str): 82 | if re.match('^\\d+$', value) is not None: 83 | config[key] = int(value) 84 | elif value == 'True' or value == 'False': 85 | config[key] = bool(value) 86 | 87 | value = f(*args, **config) 88 | wrapped_f.call_log.insert(value['function_value'], start=value['start'], end=value['end'], **config) 89 | return value['function_value'] 90 | 91 | wrapped_f.call_log = CallLog() 92 | return wrapped_f 93 | 94 | 95 | class OptunityAdapter(BaseAdapter): 96 | 97 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 98 | super().__init__(n_jobs, time_limit, iterations, 
seed) 99 | self.benchmark = None 100 | 101 | if self.iterations is None: 102 | raise NotImplementedError('Timeout not supported yet') 103 | 104 | def optimize(self, benchmark: AbstractBenchmark, **kwargs) -> OptimizationStatistic: 105 | start = time.time() 106 | self.benchmark = benchmark 107 | statistics = OptimizationStatistic('Optunity', start) 108 | 109 | # noinspection PyArgumentList 110 | conf = benchmark.get_configuration_space(OptunityConverter()) 111 | 112 | tree = search_spaces.SearchTree(conf) 113 | box = tree.to_box() 114 | 115 | f = logged(self.objective_function) 116 | f = tree.wrap_decoder(f) 117 | f = _wrap_hard_box_constraints(f, box, 1) 118 | 119 | suggestion = suggest_solver(self.iterations, "particle swarm", **box) 120 | solver = make_solver(**suggestion) 121 | 122 | solution, details = optimize(solver, f, maximize=False, max_evals=self.iterations, decoder=tree.decode, 123 | # pmap=map) 124 | pmap=create_pmap(self.n_jobs)) 125 | 126 | ls = [] 127 | for meta, value in f.call_log.data.items(): 128 | d = meta._asdict() 129 | start = d.pop('start', None) 130 | end = d.pop('end', None) 131 | ls.append(EvaluationResult(start, end, value, d)) 132 | 133 | # Optunity sometimes does not use self.iterations but a little less. Fix number for plotting 134 | while len(ls) < self.iterations: 135 | ls.append(ls[-1]) 136 | 137 | del f.call_log 138 | 139 | statistics.add_result(ls) 140 | statistics.stop_optimisation() 141 | 142 | return statistics 143 | 144 | def objective_function(self, **kwargs): 145 | return self.benchmark.objective_function(kwargs) 146 | -------------------------------------------------------------------------------- /adapter/btb_adapter.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import json 4 | import os 5 | import time 6 | from typing import List, Union 7 | 8 | import numpy as np 9 | from atm.method import Method, HyperPartition 10 | from btb.selection.selector import Selector 11 | from btb.tuning import GP, GPEi, Uniform 12 | from hpolib.abstract_benchmark import AbstractBenchmark 13 | from scipy.stats import norm 14 | from sklearn.gaussian_process import GaussianProcessRegressor 15 | 16 | from adapter.base import BaseAdapter, OptimizationStatistic, EvaluationResult 17 | from config import BtbConverter 18 | 19 | 20 | class BtbAdapter(BaseAdapter): 21 | 22 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, seed: int = None): 23 | super().__init__(n_jobs, time_limit, iterations, seed) 24 | 25 | # noinspection PyMethodOverriding 26 | def optimize(self, benchmark: AbstractBenchmark) -> OptimizationStatistic: 27 | start = time.time() 28 | statistics = OptimizationStatistic('BTB', start) 29 | 30 | # noinspection PyArgumentList,PyTypeChecker 31 | methods = self._create_method(benchmark.get_configuration_space(BtbConverter())) 32 | 33 | hyperpartitions = [] 34 | for method in methods: 35 | if len(method.name) > 0: 36 | for hp in method.get_hyperpartitions(): 37 | hp.categoricals.append(('algorithm', method.name)) 38 | hyperpartitions.append(hp) 39 | else: 40 | hyperpartitions += method.get_hyperpartitions() 41 | 42 | tuners = [FixedGP(hp.tunables, r_minimum=1) for hp in hyperpartitions] 43 | scores = {idx: tuner.y for idx, tuner in enumerate(tuners)} 44 | 45 | ls = [] 46 | selector = FixedSelector(scores.keys()) 47 | for i in range(self.iterations): 48 | idx = selector.select(scores) 49 | 50 | params = tuners[idx].propose() 51 | params = 
self.__get_configuration_dict(hyperpartitions[idx], params) 52 | 53 | for key, value in params.items(): 54 | if isinstance(value, np.int64): 55 | params[key] = int(value) 56 | 57 | res = benchmark.objective_function(params) 58 | score = -1 * res['function_value'] 59 | tuners[idx].add(params, score) 60 | scores[idx] = tuners[idx].y 61 | 62 | res['config'] = params 63 | # print(res) 64 | ls.append(EvaluationResult.from_dict(res, params)) 65 | 66 | statistics.add_result(ls) 67 | statistics.stop_optimisation() 68 | 69 | return statistics 70 | 71 | @staticmethod 72 | def __get_configuration_dict(hyperpartition: HyperPartition, tunables: dict) -> dict: 73 | tunables.update(dict(hyperpartition.constants)) 74 | tunables.update(dict(hyperpartition.categoricals)) 75 | return tunables 76 | 77 | @staticmethod 78 | def _create_method(conf: Union[dict, list]) -> List[Method]: 79 | if isinstance(conf, dict): 80 | conf = [conf] 81 | 82 | ls = [] 83 | for c in conf: 84 | name = '/tmp/{}'.format(time.time()) 85 | with open(name, 'w') as f: 86 | json.dump(c, f) 87 | ls.append(Method(name)) 88 | os.remove(name) 89 | 90 | return ls 91 | 92 | 93 | class FixedGP(GPEi): 94 | # predict and _acquire can be removed if inherited from GP 95 | 96 | def predict(self, X): 97 | if self.X.shape[0] < self.r_minimum: 98 | y = Uniform(self.tunables).predict(X) 99 | stdev = np.ones(len(y), dtype=np.float64) 100 | else: 101 | y, stdev = self.gp.predict(X, return_std=True) 102 | return np.array(list(zip(y, stdev))) 103 | 104 | def _acquire(self, predictions): 105 | Phi = norm.cdf 106 | N = norm.pdf 107 | 108 | mu, sigma = predictions.T 109 | if len(self.y) == 0: 110 | y_best = max(mu) 111 | else: 112 | y_best = max(self.y) 113 | 114 | # because we are maximizing the scores, we do mu-y_best rather than the inverse, as is 115 | # shown in most reference materials 116 | z = ((mu - y_best) / sigma).astype(np.float64) 117 | 118 | ei = sigma * (z * Phi(z) + N(z)) 119 | 120 | return np.argmax(ei) 121 | 122 | def fit(self, X, y): 123 | """ Use X and y to train a Gaussian process. 
""" 124 | super(GP, self).fit(X, y) 125 | 126 | # skip training the process if there aren't enough samples 127 | if X.shape[0] < self.r_minimum: 128 | return 129 | 130 | if X.ndim == 1: 131 | X = X.reshape(-1, 1) 132 | 133 | self.gp = GaussianProcessRegressor(normalize_y=True) 134 | self.gp.fit(X, y) 135 | 136 | 137 | class FixedSelector(Selector): 138 | 139 | def bandit(self, choice_rewards): 140 | def non_nan(x): 141 | if len(x) == 0: 142 | return 1000000 143 | else: 144 | return np.mean(x) 145 | 146 | # keys = choice_rewards.keys() 147 | # values = np.array([non_nan(x) for x in choice_rewards.values()]) 148 | # 149 | # idx = np.random.choice(np.flatnonzero(values == values.max())) 150 | # return keys[idx] 151 | 152 | return max(choice_rewards, key=lambda a: non_nan(choice_rewards[a])) 153 | -------------------------------------------------------------------------------- /adapter/run_h2o.py: -------------------------------------------------------------------------------- 1 | import random 2 | import shutil 3 | import tempfile 4 | from typing import List, Optional 5 | 6 | import h2o 7 | import numpy as np 8 | import pandas as pd 9 | import sklearn 10 | from h2o.automl import H2OAutoML 11 | from h2o.estimators import H2OXGBoostEstimator, H2OGeneralizedLinearEstimator, H2OGradientBoostingEstimator, \ 12 | H2ODeepLearningEstimator, H2ORandomForestEstimator 13 | 14 | from benchmark import OpenMLBenchmark 15 | 16 | 17 | def skip(id: int) -> bool: 18 | failed = [167125] 19 | return id in failed 20 | 21 | 22 | def setup(): 23 | pass 24 | 25 | 26 | def main(fold, bm: OpenMLBenchmark, timeout: int, run_timeout: int, jobs: int, score: bool = True) -> float: 27 | try: 28 | log_dir = tempfile.mkdtemp() 29 | 30 | setup() 31 | X_train, y_train, X_test, y_test = fold 32 | 33 | h2o.init(nthreads=jobs, max_mem_size=4 * jobs, port=str(60000 + random.randrange(0, 5000)), ice_root=log_dir) 34 | h2o.no_progress() 35 | 36 | train = _createFrame(X_train, y_train) 37 | test = _createFrame(X_test) 38 | 39 | for i in range(len(bm.categorical)): 40 | if bm.categorical[i]: 41 | train[i] = train[i].asfactor() 42 | test[i] = test[i].asfactor() 43 | train['class'] = train['class'].asfactor() 44 | 45 | aml = H2OAutoML(max_runtime_secs=timeout, 46 | max_runtime_secs_per_model=run_timeout) 47 | aml.train(y='class', training_frame=train) 48 | 49 | params = aml.leader.get_params() 50 | del params['model_id'] 51 | del params['training_frame'] 52 | del params['validation_frame'] 53 | 54 | for key in params.keys(): 55 | params[key] = params[key]['actual_value'] 56 | 57 | print(aml.leader.algo, '(', params, ')') 58 | if score: 59 | predictions = aml.leader.predict(test) 60 | return 1 - sklearn.metrics.accuracy_score(y_test, predictions['predict'].as_data_frame()) 61 | else: 62 | predictions = aml.leader.predict(test) 63 | return sklearn.metrics.roc_auc_score(y_test, predictions['predict'].as_data_frame()), aml.leader 64 | finally: 65 | if score: 66 | _cleanup(log_dir) 67 | 68 | 69 | def _cleanup(log_dir: Optional[str]): 70 | h2o.cluster().shutdown() 71 | if log_dir is not None: 72 | shutil.rmtree(log_dir) 73 | 74 | 75 | def _createFrame(x, y=None): 76 | if y is not None: 77 | data = np.append(x, np.atleast_2d(y).T, axis=1) 78 | columns = ['f' + str(i) for i in range(data.shape[1] - 1)] + ['class'] 79 | else: 80 | data = x 81 | columns = ['f' + str(i) for i in range(data.shape[1])] 82 | 83 | df = pd.DataFrame(data=data[0:, 0:], 84 | index=[i for i in range(data.shape[0])], 85 | columns=columns) 86 | return h2o.H2OFrame(df) 87 | 
88 | 89 | def load_model(input: str): 90 | def _map_algo(algo: str, args): 91 | if algo == 'xgboost': 92 | return H2OXGBoostEstimator(**args) 93 | elif algo == 'gbm': 94 | return H2OGradientBoostingEstimator(**args) 95 | elif algo == 'glm': 96 | return H2OGeneralizedLinearEstimator(**args) 97 | elif algo == 'deeplearning': 98 | return H2ODeepLearningEstimator(**args) 99 | elif algo == 'drf' or algo == 'xrt': 100 | return H2ORandomForestEstimator(**args) 101 | else: 102 | raise ValueError(algo) 103 | 104 | algo = input.split(' ')[0] 105 | if algo == 'stackedensemble': 106 | # TODO ignore for now 107 | return None 108 | 109 | args = eval(input[len(algo) + 2:-2]) 110 | del args['response_column'] 111 | return _map_algo(algo, args) 112 | 113 | 114 | def load_pipeline(input: str) -> List[List[str]]: 115 | def _map_algo(algo: str): 116 | if algo == 'xgboost': 117 | return H2OXGBoostEstimator() 118 | elif algo == 'gbm': 119 | return H2OGradientBoostingEstimator() 120 | elif algo == 'glm': 121 | return H2OGeneralizedLinearEstimator() 122 | elif algo == 'deeplearning': 123 | return H2ODeepLearningEstimator() 124 | elif algo == 'drf' or algo == 'xrt': 125 | return H2ORandomForestEstimator() 126 | else: 127 | raise ValueError(algo) 128 | 129 | res = [] 130 | 131 | prefix = input.split(' ')[0] 132 | if prefix == 'stackedensemble': 133 | models = eval(input[len(prefix) + 2:-2])['base_models'] 134 | for idx, m in enumerate(models): 135 | n = m['name'].split('_')[0].lower() 136 | res.append([type(_map_algo(n)).__name__]) 137 | else: 138 | res.append([type(_map_algo(prefix)).__name__]) 139 | 140 | for j in range(len(res)): 141 | for i in range(len(res[j])): 142 | n = res[j][i] 143 | 144 | if n == 'H2ORandomForestEstimator': 145 | n = 'RandomForestClassifier' 146 | if n == 'H2ODeepLearningEstimator': 147 | n = 'DeepLearningClassifier' 148 | if n == 'H2OXGBoostEstimator': 149 | n = 'XGBClassifier' 150 | if n == 'H2OGeneralizedLinearEstimator': 151 | n = 'GeneralizedLinearClassifier' 152 | if n == 'H2OGradientBoostingEstimator': 153 | n = 'GradientBoostingClassifier' 154 | 155 | res[j][i] = n 156 | 157 | return sorted(res) 158 | -------------------------------------------------------------------------------- /run_framework.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import itertools 3 | import multiprocessing 4 | import sys 5 | import time 6 | import traceback 7 | import warnings 8 | 9 | import humanfriendly 10 | 11 | from benchmark import OpenMLBenchmark 12 | 13 | timeout = 3600 # in seconds 14 | run_timeout = 600 # in seconds 15 | jobs = 4 16 | 17 | 18 | def run(task: int, conn) -> None: 19 | try: 20 | print('##\nIteration {} at {}\n##'.format(i, datetime.datetime.now().time())) 21 | bm = OpenMLBenchmark(task) 22 | 23 | with warnings.catch_warnings(): 24 | warnings.simplefilter("ignore", category=RuntimeWarning) 25 | avg_score = 0 26 | for fold in bm.folds: 27 | if algorithm == 'atm': 28 | from adapter import run_atm 29 | if run_atm.skip(task): 30 | avg_score += -1 31 | else: 32 | avg_score += run_atm.main(fold, bm, timeout, jobs) 33 | elif algorithm == 'random': 34 | from adapter import run_auto_sklearn 35 | if run_auto_sklearn.skip(task): 36 | avg_score += -1 37 | else: 38 | avg_score += run_auto_sklearn.main(fold, bm, timeout, run_timeout, jobs, random=True) 39 | elif algorithm == 'auto-sklearn': 40 | from adapter import run_auto_sklearn 41 | if run_auto_sklearn.skip(task): 42 | avg_score += -1 43 | else: 44 | avg_score += 
run_auto_sklearn.main(fold, bm, timeout, run_timeout, jobs, random=False) 45 | elif algorithm == 'dummy': 46 | from adapter import run_baseline 47 | if run_baseline.skip(task): 48 | avg_score += -1 49 | else: 50 | avg_score += run_baseline.main(fold, dummy=True) 51 | elif algorithm == 'rf': 52 | from adapter import run_baseline 53 | if run_baseline.skip(task): 54 | avg_score += -1 55 | else: 56 | avg_score += run_baseline.main(fold, dummy=False) 57 | elif algorithm == 'h2o': 58 | from adapter import run_h2o 59 | if run_h2o.skip(task): 60 | avg_score += -1 61 | else: 62 | avg_score += run_h2o.main(fold, bm, timeout, run_timeout, jobs) 63 | elif algorithm == 'hpsklearn': 64 | from adapter import run_hpsklearn 65 | if run_hpsklearn.skip(task): 66 | avg_score += -1 67 | else: 68 | avg_score += run_hpsklearn.main(fold, timeout, run_timeout) 69 | elif algorithm == 'tpot': 70 | from adapter import run_tpot 71 | if run_tpot.skip(task): 72 | avg_score += -1 73 | else: 74 | avg_score += run_tpot.main(fold, timeout, run_timeout, jobs) 75 | else: 76 | raise ValueError('Unknown algorithm {}'.format(algorithm)) 77 | conn.send(avg_score / len(bm.folds)) 78 | except Exception: 79 | traceback.print_exc() 80 | conn.send(1) 81 | 82 | 83 | if __name__ == '__main__': 84 | algorithm = sys.argv[1] 85 | idx = int(sys.argv[2]) if len(sys.argv) > 2 else None 86 | 87 | print('Algorithm: ', algorithm) 88 | print('Timeout: ', timeout) 89 | print('Run Timeout: ', run_timeout) 90 | 91 | task_ids = [ 92 | [3, 12, 31, 53, 3917, 7593, 9952, 9977, 9981, 10101], 93 | [14965, 34539, 146195, 146212, 146818, 146821, 146822, 146825, 167119, 167120], 94 | [167121, 167124, 168329, 168330, 168331, 168332, 168335, 168337, 168338], 95 | [168868, 168908, 168909, 168910, 168911, 168912, 189354, 189355, 189356], 96 | ] 97 | 98 | if idx is not None: 99 | print('Using chunk {}/{}'.format(idx, len(task_ids))) 100 | task_ids = task_ids[idx] 101 | else: 102 | print('Using all tasks') 103 | task_ids = list(itertools.chain.from_iterable(task_ids)) 104 | 105 | recv_end, send_end = multiprocessing.Pipe(False) 106 | res = [] 107 | for task in task_ids: 108 | print('#######\nStarting task {}\n#######'.format(task)) 109 | res.append([]) 110 | for i in range(5): 111 | try: 112 | start = time.time() 113 | 114 | p = multiprocessing.Process(target=run, args=(task, send_end)) 115 | p.start() 116 | 117 | p.join(timeout * 1.5) 118 | 119 | if p.is_alive(): 120 | print('Grace period exceed. 
Stopping benchmark.') 121 | p.terminate() 122 | p.join() 123 | score = 1 124 | else: 125 | score = recv_end.recv() 126 | 127 | if score != -1: 128 | res[-1].append(score) 129 | print('Misclassification Rate', score) 130 | print('Duration', humanfriendly.format_timespan(time.time() - start)) 131 | except Exception as e: 132 | if isinstance(e, KeyboardInterrupt): 133 | print(res) 134 | raise e 135 | traceback.print_exc() 136 | print('Misclassification rate', 1) 137 | print(res[-1]) 138 | 139 | for i in range(len(res)): 140 | print(' {}, # {}'.format(res[i], task_ids[i])) 141 | 142 | print(res) 143 | -------------------------------------------------------------------------------- /run_auto_sklearn.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import multiprocessing 3 | import shutil 4 | import time 5 | import warnings 6 | 7 | import sklearn.datasets 8 | import sklearn.metrics 9 | import sklearn.model_selection 10 | from autosklearn.classification import AutoSklearnClassifier 11 | from autosklearn.constants import * 12 | from autosklearn.metrics import accuracy 13 | from smac.facade.roar_facade import ROAR 14 | from smac.scenario.scenario import Scenario 15 | 16 | from benchmark import OpenMLBenchmark 17 | 18 | timeout = 3600 # in seconds 19 | run_timeout = 360 # in seconds 20 | jobs = 4 21 | random = True 22 | 23 | ensemble_size = 1 if random else 20 24 | 25 | 26 | def get_random_search_object_callback(scenario_dict, seed, ta, backend, metalearning_configurations, runhistory): 27 | """Random search.""" 28 | scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob() 29 | scenario_dict['minR'] = len(scenario_dict['instances']) 30 | scenario_dict['initial_incumbent'] = 'RANDOM' 31 | scenario = Scenario(scenario_dict) 32 | return ROAR( 33 | scenario=scenario, 34 | rng=seed, 35 | tae_runner=ta, 36 | runhistory=runhistory, 37 | run_id=seed 38 | ) 39 | 40 | 41 | def get_spawn_classifier(X_train, y_train, tmp_folder, output_folder, seed0): 42 | def spawn_classifier(seed, dataset_name): 43 | # Use the initial configurations from meta-learning only in one out of 44 | # the processes spawned. This prevents auto-sklearn from evaluating the 45 | # same configurations in all processes. 46 | if seed == seed0 and not random: 47 | initial_configurations_via_metalearning = 25 48 | smac_scenario_args = {} 49 | else: 50 | initial_configurations_via_metalearning = 0 51 | smac_scenario_args = {'initial_incumbent': 'RANDOM'} 52 | 53 | callback = None 54 | if random: 55 | callback = get_random_search_object_callback 56 | 57 | # Arguments which are different to other runs of auto-sklearn: 58 | # 1. all classifiers write to the same output directory 59 | # 2. shared_mode is set to True, this enables sharing of data between 60 | # models. 61 | # 3. all instances of the AutoSklearnClassifier must have a different seed! 
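        # (With shared_mode=True, all workers read and write the same SMAC
        # output directories; the distinct seeds keep their run artifacts
        # apart.)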
62 | automl = AutoSklearnClassifier( 63 | time_left_for_this_task=timeout, 64 | per_run_time_limit=run_timeout, 65 | shared_mode=True, 66 | tmp_folder=tmp_folder, 67 | output_folder=output_folder, 68 | delete_tmp_folder_after_terminate=False, 69 | ensemble_size=0, 70 | initial_configurations_via_metalearning=initial_configurations_via_metalearning, 71 | seed=seed, 72 | smac_scenario_args=smac_scenario_args, 73 | get_smac_object_callback=callback 74 | ) 75 | automl.fit(X_train, y_train, dataset_name=dataset_name) 76 | print(automl.sprint_statistics()) 77 | 78 | return spawn_classifier 79 | 80 | 81 | def main(bm: OpenMLBenchmark): 82 | name = bm.get_meta_information()['name'] 83 | 84 | X_train = bm.X_train 85 | y_train = bm.y_train 86 | X_test = bm.X_test 87 | y_test = bm.y_test 88 | 89 | tmp_folder = '/tmp/autosklearn/{}/tmp'.format(name) 90 | output_folder = '/tmp/autosklearn/{}/out'.format(name) 91 | 92 | seed = int(time.time()) 93 | 94 | processes = [] 95 | spawn_classifier = get_spawn_classifier(X_train, y_train, tmp_folder, output_folder, seed) 96 | for i in range(jobs): 97 | p = multiprocessing.Process(target=spawn_classifier, args=(seed + i, name)) 98 | p.start() 99 | processes.append(p) 100 | 101 | start = time.time() 102 | while time.time() - start <= 1.5 * timeout: 103 | if any(p.is_alive() for p in processes): 104 | time.sleep(10) 105 | else: 106 | break 107 | else: 108 | print('Grace period exceed. Killing workers.') 109 | for p in processes: 110 | p.terminate() 111 | p.join() 112 | 113 | print('Starting to build an ensemble!') 114 | automl = AutoSklearnClassifier( 115 | time_left_for_this_task=3600, 116 | per_run_time_limit=run_timeout, 117 | shared_mode=True, 118 | ensemble_size=ensemble_size, 119 | tmp_folder=tmp_folder, 120 | output_folder=output_folder, 121 | initial_configurations_via_metalearning=0, 122 | seed=seed, 123 | ) 124 | automl.fit_ensemble( 125 | y_train, 126 | task=MULTICLASS_CLASSIFICATION, 127 | metric=accuracy, 128 | precision='32', 129 | dataset_name=name, 130 | ensemble_size=ensemble_size 131 | ) 132 | 133 | predictions = automl.predict(X_test) 134 | # print(automl.show_models()) 135 | print('Misclassification rate', 1 - sklearn.metrics.accuracy_score(y_test, predictions)) 136 | 137 | 138 | if __name__ == '__main__': 139 | for i in range(4): 140 | print('#######\nIteration {}\n#######'.format(i)) 141 | 142 | try: 143 | shutil.rmtree('/tmp/autosklearn/') 144 | except OSError as e: 145 | pass 146 | 147 | print('Timeout: ', timeout) 148 | print('Run Timeout: ', run_timeout) 149 | print('Random Search: ', random) 150 | 151 | task_ids = [15, 23, 24, 29, 3021, 41, 2079, 3543, 3560, 3561, 152 | 3904, 3946, 9955, 9985, 7592, 14969, 14968, 14967, 125920, 146606] 153 | for task in task_ids: 154 | print('Starting task {} at {}'.format(task, datetime.datetime.now().time())) 155 | bm = OpenMLBenchmark(task) 156 | 157 | with warnings.catch_warnings(): 158 | warnings.simplefilter("ignore", category=RuntimeWarning) 159 | main(bm) 160 | -------------------------------------------------------------------------------- /config/base.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | from typing import Dict, Union, List 4 | 5 | import networkx as nx 6 | import numpy as np 7 | 8 | CATEGORICAL = "categorical" 9 | UNI_FLOAT = "uniform_float" 10 | UNI_INT = "uniform_int" 11 | PARENT = "parent" 12 | VALUE = "value" 13 | 14 | 15 | class MetaConfigCollection: 16 | 17 | def __init__(self, d: Dict = None): 18 | 
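        # Maps algorithm name -> MetaConfig. sort_keys() orders each config
        # topologically, so conditional hyperparameters always appear after
        # the features they depend on.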
self.algos: Dict[str, MetaConfig] = {} 19 | if (d is not None): 20 | for key, value in d.items(): 21 | conf = MetaConfig(value) 22 | conf.sort_keys() 23 | self.algos[key] = conf 24 | 25 | def keys(self): 26 | return self.algos.keys() 27 | 28 | def items(self): 29 | return self.algos.items() 30 | 31 | @staticmethod 32 | def from_json(file: str, validate: bool = True) -> 'MetaConfigCollection': 33 | with open(file) as f: 34 | d = json.load(f, object_pairs_hook=collections.OrderedDict) 35 | return MetaConfigCollection(d) 36 | 37 | @staticmethod 38 | def __validate(json_file: json) -> bool: 39 | # with open("config\schema.json") as schema: # TODO import schema 40 | # schema = json.load(schema) 41 | # validate(json_file, schema) # TODO improve schema 42 | return True 43 | 44 | 45 | class MetaConfig: 46 | 47 | def __init__(self, d: Dict = None): 48 | self.dict: Dict[str, ConfigFeature] = collections.OrderedDict() 49 | 50 | if (d is not None): 51 | for key, value in d.items(): 52 | self.dict[key] = ConfigFeature(value) 53 | 54 | def add_feature(self, name: str, definition: Union[Dict, str]): 55 | self.dict[name] = ConfigFeature(definition) 56 | 57 | def sort_keys(self) -> None: 58 | """ 59 | Process all previously stored ~MetaConfigFeature and order them depending on their dependencies. 60 | :return: 61 | """ 62 | graph = ConfigInheritanceGraph(self) 63 | if (len(graph.simple_cycles()) > 0): 64 | raise ValueError('Encountered circular dependencies while sorting config features. ' 65 | 'Please check your configuration file') 66 | 67 | d = collections.OrderedDict() 68 | nodes = graph.bfs_tree(graph.ROOT) 69 | for key in nodes: 70 | if key in self.dict.keys(): 71 | d[key] = self.dict[key] 72 | self.dict = d 73 | 74 | def items(self): 75 | return self.dict.items() 76 | 77 | @staticmethod 78 | def continuous_from_bounds(bounds: np.ndarray): 79 | res = MetaConfig() 80 | for i in range(bounds.shape[0]): 81 | res.add_feature(f"x{i}", { 82 | "type": "uniform_float", 83 | "lower": bounds[i][0], 84 | "upper": bounds[i][1], 85 | }) 86 | return res 87 | 88 | 89 | class ConfigFeature(collections.MutableMapping): 90 | TYPE = "type" 91 | 92 | def __init__(self, *args, **kwargs): 93 | self.store = dict() 94 | self.update(dict(*args, **kwargs)) # use the free update to set keys 95 | 96 | def __getitem__(self, key): 97 | return self.store[key] 98 | 99 | def __setitem__(self, key, value): 100 | self.store[key] = value 101 | 102 | def __delitem__(self, key): 103 | del self.store[key] 104 | 105 | def __iter__(self): 106 | return iter(self.store) 107 | 108 | def __len__(self): 109 | return len(self.store) 110 | 111 | @property 112 | def type(self): 113 | return self.store[self.TYPE].lower() 114 | 115 | @property 116 | def choices(self): 117 | return self.store["choices"] 118 | 119 | @property 120 | def lower(self): 121 | return self.store["lower"] 122 | 123 | @property 124 | def upper(self): 125 | return self.store["upper"] 126 | 127 | @property 128 | def default(self): 129 | return self.store.get('default_value') 130 | 131 | @property 132 | def log(self): 133 | return self.store.get("log", False) 134 | 135 | @property 136 | def condition(self): 137 | return self.store.get("condition") 138 | 139 | def has_condition(self): 140 | return self.condition is not None 141 | 142 | 143 | class ConfigInheritanceGraph: 144 | ROOT = 'root' 145 | CONFIG = 'config' 146 | 147 | def __init__(self, config: MetaConfig, ignore_options: bool = False): 148 | self.G = nx.DiGraph() 149 | self.G.add_node(self.ROOT) 150 | 151 | 
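        # Build the dependency graph: every feature becomes a node, categorical
        # choices become child nodes of their feature, conditional features are
        # attached to the values that enable them, and unconditional features
        # hang off the synthetic ROOT node.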
self.__add_nodes(config, ignore_options) 152 | self.__add_edges(config) 153 | 154 | def __add_nodes(self, algo: MetaConfig, ignore_options=False) -> None: 155 | for key, value in algo.items(): 156 | self.G.add_node(key, config=value) 157 | if value.type == CATEGORICAL and not ignore_options: 158 | for choice in value.choices: 159 | if not self.G.has_node(choice): 160 | self.G.add_node(choice, config=None) 161 | self.G.add_edge(key, choice) 162 | 163 | def __add_edges(self, config: MetaConfig) -> None: 164 | for key, config in config.items(): 165 | if config.has_condition(): 166 | for value in config.condition[VALUE]: 167 | self.G.add_edge(value, key) 168 | else: 169 | self.G.add_edge(self.ROOT, key) 170 | 171 | def get_config(self) -> dict: 172 | return nx.get_node_attributes(self.G, self.CONFIG) 173 | 174 | def successors(self, node: str) -> nx.DiGraph: 175 | return self.G.successors(node) 176 | 177 | def edge_dfs(self, source: str) -> List[str]: 178 | return list(nx.edge_dfs(self.G, source)) 179 | 180 | def bfs_tree(self, source: str) -> List[str]: 181 | return list(nx.bfs_tree(self.G, source)) 182 | 183 | def simple_cycles(self) -> List: 184 | return list(nx.simple_cycles(self.G)) 185 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import warnings 4 | from argparse import Namespace 5 | 6 | from hpolib.abstract_benchmark import AbstractBenchmark 7 | 8 | import benchmark 9 | import util.logger 10 | from adapter.base import BenchmarkResult 11 | from evaluation.base import MongoPersistence 12 | 13 | 14 | def run(persistence: MongoPersistence, b: AbstractBenchmark): 15 | # db.Branin.drop() 16 | # db.Branin.find({}, {'solvers.incumbents': 0}).pretty() 17 | # db.Branin.count() 18 | 19 | config_dict = { 20 | 'n_jobs': 1, 21 | 'timeout': None, 22 | 'iterations': 500, 23 | 'seed': int(time.time()), 24 | 25 | 'random_search': True, 26 | 'grid_search': False, 27 | 'smac': False, 28 | 'hyperopt': False, # Only single threaded 29 | 'bohb': False, 30 | 'robo': False, # Only single threaded 31 | 'optunity': False, 32 | 'btb': False # Only single threaded 33 | } 34 | config = Namespace(**config_dict) 35 | 36 | benchmark_result = BenchmarkResult(b, config.n_jobs, config.seed) 37 | persistence.store_new_run(benchmark_result) 38 | 39 | objective_time = 1 40 | 41 | # Random Search 42 | if config.random_search: 43 | from adapter.random_search import ObjectiveRandomSearch 44 | logger.info('Start random search') 45 | rs = ObjectiveRandomSearch(config.n_jobs, config.timeout, config.iterations, config.seed) 46 | stats = rs.optimize(b) 47 | benchmark_result.add_result(stats) 48 | persistence.store_results(benchmark_result, stats) 49 | 50 | # Estimate of objective time. 
Used to select iterations for fixed iterations procedures 51 | objective_time = stats.runtime['objective_function'][0] 52 | 53 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 54 | logger.info(stats) 55 | 56 | # Grid Search 57 | if config.grid_search: 58 | from adapter.grid_search import ObjectiveGridSearch 59 | logger.info('Start grid search') 60 | gs = ObjectiveGridSearch(config.n_jobs, config.timeout, config.iterations) 61 | n = gs.estimate_grid_size(len(b.get_meta_information().get('bounds', [])), objective_time) 62 | logger.info('Using grid size of {}'.format(n)) 63 | stats = gs.optimize(b, n) 64 | benchmark_result.add_result(stats) 65 | persistence.store_results(benchmark_result, stats) 66 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 67 | logger.info(stats) 68 | 69 | # SMAC 70 | if config.smac: 71 | from adapter.smac import SmacAdapter 72 | logger.info('Start SMAC') 73 | smac = SmacAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 74 | stats = smac.optimize(b, objective_time) 75 | benchmark_result.add_result(stats) 76 | persistence.store_results(benchmark_result, stats) 77 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 78 | logger.info(stats) 79 | 80 | # hyperopt 81 | if config.hyperopt: 82 | from adapter.hyperopt_adapter import HyperoptAdapter 83 | logger.info('Start hyperopt') 84 | hyperopt = HyperoptAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 85 | stats = hyperopt.optimize(b) 86 | benchmark_result.add_result(stats) 87 | persistence.store_results(benchmark_result, stats) 88 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 89 | logger.info(stats) 90 | 91 | # bohb 92 | if config.bohb: 93 | from adapter.bohb import BohbAdapter 94 | logger.info('Start bohb') 95 | bohb = BohbAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 96 | stats = bohb.optimize(b) 97 | benchmark_result.add_result(stats) 98 | persistence.store_results(benchmark_result, stats) 99 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 100 | logger.info(stats) 101 | 102 | # RoBo 103 | if config.robo: 104 | from adapter.robo import RoBoAdapter 105 | logger.info('Start robo') 106 | robo = RoBoAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 107 | stats = robo.optimize(b, model_type='gp') 108 | benchmark_result.add_result(stats) 109 | persistence.store_results(benchmark_result, stats) 110 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 111 | logger.info(stats) 112 | 113 | # Optunity 114 | if config.optunity: 115 | from adapter.optunity_adapter import OptunityAdapter 116 | logger.info('Start optunity') 117 | optunity = OptunityAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 118 | stats = optunity.optimize(b) 119 | benchmark_result.add_result(stats) 120 | persistence.store_results(benchmark_result, stats) 121 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 122 | logger.info(stats) 123 | 124 | # BTB 125 | if config.btb: 126 | from adapter.btb_adapter import BtbAdapter 127 | logger.info('Start btb') 128 | btb = BtbAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 129 | stats = btb.optimize(b) 130 | benchmark_result.add_result(stats) 131 | persistence.store_results(benchmark_result, stats) 132 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 133 | logger.info(stats) 134 | 135 | 136 | if __name__ == '__main__': 137 | parser = 
argparse.ArgumentParser() 138 | parser.add_argument('--database', type=str, default='localhost') 139 | parser.add_argument('--chunk', type=int, default=None) 140 | args = parser.parse_args() 141 | 142 | util.logger.setup(args.chunk) 143 | logger = util.logger.get() 144 | 145 | warnings.simplefilter(action='ignore', category=FutureWarning) 146 | 147 | logger.info('Main start') 148 | try: 149 | persistence = MongoPersistence(args.database, read_only=False) 150 | b = benchmark.Rosenbrock20D() 151 | for i in range(20): 152 | run(persistence, b) 153 | 154 | # for b in benchmark.OpenML100Suite().load(chunk=args.chunk): 155 | # logger.info('Starting OpenML benchmark {}'.format(b.task_id)) 156 | # for i in range(1): 157 | # run(persistence, b) 158 | except (SystemExit, KeyboardInterrupt, Exception) as e: 159 | logger.error(e, exc_info=True) 160 | 161 | logger.info('Main finished') 162 | -------------------------------------------------------------------------------- /adapter/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import time 3 | import traceback 4 | from typing import List, Union, Optional 5 | 6 | import numpy as np 7 | from hpolib.abstract_benchmark import AbstractBenchmark 8 | 9 | import util.logger 10 | 11 | OBJECTIVE_TIME_FACTOR = 1.5 12 | 13 | logger = util.logger.get() 14 | 15 | 16 | class EvaluationResult: 17 | 18 | def __init__(self, start: float, end: float, score: float, config: dict): 19 | self.start = start 20 | self.end = end 21 | self.score = score 22 | self.config = config 23 | 24 | def __str__(self): 25 | return str(self.as_dict()) 26 | 27 | def __repr__(self): 28 | return str(self) 29 | 30 | def as_dict(self): 31 | return { 32 | 'start': self.start, 33 | 'end': self.end, 34 | 'score': self.score, 35 | 'config': self.config 36 | } 37 | 38 | @staticmethod 39 | def from_dict(d: dict, conf: dict) -> 'EvaluationResult': 40 | return EvaluationResult(d['start'], d['end'], d['function_value'], conf) 41 | 42 | 43 | class OptimizationStatistic: 44 | 45 | def __init__(self, algorithm: str, start: float): 46 | self.algorithm = algorithm 47 | 48 | self.start = start 49 | self.end = None 50 | 51 | self.iterations = 0 52 | self.score = None 53 | self.best = None 54 | self.runtime = {} 55 | 56 | self.evaluations: List[EvaluationResult] = [] 57 | 58 | def add_result(self, result: List[EvaluationResult]): 59 | self.evaluations.extend(result) 60 | 61 | def stop_optimisation(self): 62 | self.end = time.time() 63 | self.evaluations = sorted(self.evaluations, key=lambda ev: ev.end) 64 | self.iterations = len(self.evaluations) 65 | 66 | best_score = float("inf") 67 | best = None 68 | for ev in self.evaluations: 69 | if ev.score < best_score: 70 | best_score = ev.score 71 | best = ev.config 72 | self.score = best_score 73 | self.best = best 74 | 75 | total = self.end - self.start 76 | objective_function = np.array([ev.end - ev.start for ev in self.evaluations]) 77 | 78 | overhead = [] 79 | previous = self.start 80 | for ev in self.evaluations: 81 | overhead.append(ev.start - previous) 82 | previous = ev.end 83 | overhead.append(self.end - previous) 84 | overhead = np.array(overhead) 85 | 86 | self.runtime = { 87 | 'total': total, 88 | 'objective_function': [objective_function.mean(), objective_function.var(), objective_function.sum()], 89 | 'overhead': [overhead.mean(), overhead.var(), overhead.sum()] 90 | } 91 | 92 | @property 93 | def incumbents(self): 94 | ls = [] 95 | 96 | current_best = float("inf") 97 | for ev in 
self.evaluations: 98 | if ev.score < current_best: 99 | ls.append(ev) 100 | current_best = ev.score 101 | return ls 102 | 103 | def as_numpy(self, incumbent: bool = True, x_axis: str = 'iterations'): 104 | """ 105 | Returns the evaluations as two one-dimensional numpy arrays. 106 | :param incumbent: Only include improvements and not all runs 107 | :param x_axis: Specifies the type of the x-axis. Can be either 'iterations' or 'time' 108 | :return: x, y array 109 | """ 110 | x = [] 111 | y = [] 112 | 113 | if x_axis == 'time': 114 | ls = self.incumbents if incumbent else self.evaluations 115 | for ev in ls: 116 | x.append(ev.end - self.start) 117 | y.append(ev.score) 118 | elif x_axis == 'iterations': 119 | current_best = float("inf") 120 | for idx, ev in enumerate(self.evaluations): 121 | if not incumbent or ev.score < current_best: 122 | current_best = ev.score 123 | x.append(idx) 124 | y.append(current_best) 125 | else: 126 | raise ValueError('Unknown x_axis {}'.format(x_axis)) 127 | 128 | return np.array(x), np.array(y) 129 | 130 | def as_dict(self, include_evaluations=False): 131 | ls = self.evaluations if include_evaluations else [] 132 | return { 133 | 'algorithm': self.algorithm, 134 | 135 | 'start': self.start, 136 | 'end': self.end, 137 | 138 | 'iterations': self.iterations, 139 | 'score': self.score, 140 | 'best': self.best, 141 | 'runtime': self.runtime, 142 | 'incumbents': [ev.as_dict() for ev in ls] 143 | } 144 | 145 | @staticmethod 146 | def from_dict(d: dict) -> 'OptimizationStatistic': 147 | instance = OptimizationStatistic(d['algorithm'], d['start']) 148 | instance.end = d['end'] 149 | instance.iterations = d['iterations'] 150 | instance.score = d['score'] 151 | instance.best = d['best'] 152 | instance.runtime = d['runtime'] 153 | instance.evaluations = [EvaluationResult(**f) for f in d['incumbents']] 154 | return instance 155 | 156 | def __str__(self): 157 | return str(self.as_dict(include_evaluations=False)) 158 | 159 | 160 | class BenchmarkResult: 161 | solvers: List[OptimizationStatistic] 162 | 163 | def __init__(self, benchmark: Union[None, AbstractBenchmark], n_jobs: int, seed: int): 164 | self.benchmark = benchmark 165 | self.n_jobs = n_jobs 166 | self.seed = seed 167 | self.solvers = [] 168 | 169 | def add_result(self, stats: OptimizationStatistic): 170 | self.solvers.append(stats) 171 | 172 | def get_result(self, algorithm: str) -> Optional[OptimizationStatistic]: 173 | for solver in self.solvers: 174 | if solver.algorithm == algorithm: 175 | return solver 176 | return None 177 | 178 | @property 179 | def name(self): 180 | return self.benchmark.get_meta_information()['name'] 181 | 182 | def as_dict(self): 183 | return { 184 | 'name': self.benchmark.get_meta_information()['name'], 185 | 'seed': self.seed, 186 | 'n_jobs': self.n_jobs, 187 | 'solvers': [] 188 | } 189 | 190 | @staticmethod 191 | def from_dict(d: dict) -> 'BenchmarkResult': 192 | instance = BenchmarkResult(None, d['n_jobs'], d['seed']) 193 | instance.solvers = [OptimizationStatistic.from_dict(f) for f in d['solvers']] 194 | return instance 195 | 196 | 197 | class BaseAdapter(abc.ABC): 198 | 199 | @staticmethod 200 | def log_async_error(ex: Exception): 201 | msg = traceback.format_exception(type(ex), ex, None) 202 | logger.error('Encountered error in adapter execution: {}'.format(''.join(msg))) 203 | 204 | def __init__(self, n_jobs: int, time_limit: float = None, iterations: int = None, 205 | seed: Union[None, int] = None): 206 | self.n_jobs = n_jobs 207 | self.time_limit = time_limit 208 | 
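        # At least one stopping criterion is required: a wall-clock budget
        # (time_limit), a fixed number of evaluations (iterations), or both.
        # This is enforced by the check below.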
self.iterations = iterations 209 | self.seed = seed 210 | 211 | if time_limit is None and iterations is None: 212 | raise ValueError('Expecting limited runtime or limited number of iterations') 213 | 214 | @abc.abstractmethod 215 | def optimize(self, benchmark: AbstractBenchmark, **kwargs) -> OptimizationStatistic: 216 | pass 217 | -------------------------------------------------------------------------------- /run_cash.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import warnings 4 | from argparse import Namespace 5 | 6 | from hpolib.abstract_benchmark import AbstractBenchmark 7 | 8 | import benchmark 9 | import util.logger 10 | from adapter.base import BenchmarkResult 11 | from evaluation.base import Persistence, MongoPersistence 12 | 13 | 14 | def run(persistence: Persistence, b: AbstractBenchmark, idx: int): 15 | # db.Branin.drop() 16 | # db.Branin.find({}, {'solvers.incumbents': 0}).pretty() 17 | # db.Branin.count() 18 | # { $where: "this.solvers.length == 0" } 19 | # db.getCollectionNames().forEach(function(collname) { db[collname].deleteMany({ $where: "this.solvers.length == 0" }) }) 20 | 21 | config_dict = { 22 | 'n_jobs': 3, 23 | 'timeout': None, 24 | 'iterations': 325, 25 | 'seed': idx, 26 | 27 | 'random_search': True, 28 | 'grid_search': True, 29 | 'smac': True, 30 | 'hyperopt': True, # Only single threaded 31 | 'bohb': True, 32 | 'robo': True, # Only single threaded 33 | 'optunity': True, 34 | 'btb': True # Only single threaded 35 | } 36 | config = Namespace(**config_dict) 37 | 38 | benchmark_result = BenchmarkResult(b, config.n_jobs, config.seed) 39 | persistence.store_new_run(benchmark_result) 40 | 41 | objective_time = 1 42 | 43 | for old in persistence.load_all(b): 44 | if old.seed == idx: 45 | old_results = old 46 | break 47 | else: 48 | old_results = BenchmarkResult(None, -1, -1) 49 | 50 | # Random Search 51 | if config.random_search: 52 | from adapter.random_search import ObjectiveRandomSearch 53 | logger.info('Start random search') 54 | old = old_results.get_result('Random Search') 55 | if old is not None: 56 | logger.info('Reusing old score of {}'.format(old.score)) 57 | else: 58 | rs = ObjectiveRandomSearch(config.n_jobs, config.timeout, config.iterations, config.seed) 59 | stats = rs.optimize(b) 60 | benchmark_result.add_result(stats) 61 | persistence.store_results(benchmark_result, stats) 62 | 63 | # Estimate of objective time. 
Used to select iterations for fixed iterations procedures 64 | objective_time = stats.runtime['objective_function'][0] 65 | 66 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 67 | logger.info(stats) 68 | 69 | # Grid Search 70 | if config.grid_search: 71 | from adapter.grid_search import ObjectiveGridSearch 72 | logger.info('Start grid search') 73 | old = old_results.get_result('Grid Search') 74 | if old is not None: 75 | logger.info('Reusing old score of {}'.format(old.score)) 76 | else: 77 | gs = ObjectiveGridSearch(config.n_jobs, config.timeout, config.iterations) 78 | n = gs.estimate_grid_size(len(b.get_meta_information().get('bounds', [])), objective_time) 79 | logger.info('Using grid size of {}'.format(n)) 80 | stats = gs.optimize(b, n) 81 | benchmark_result.add_result(stats) 82 | persistence.store_results(benchmark_result, stats) 83 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 84 | logger.info(stats) 85 | 86 | # SMAC 87 | if config.smac: 88 | from adapter.smac import SmacAdapter 89 | logger.info('Start SMAC') 90 | old = old_results.get_result('SMAC') 91 | if old is not None: 92 | logger.info('Reusing old score of {}'.format(old.score)) 93 | else: 94 | smac = SmacAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 95 | stats = smac.optimize(b, objective_time) 96 | benchmark_result.add_result(stats) 97 | persistence.store_results(benchmark_result, stats) 98 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 99 | logger.info(stats) 100 | 101 | # hyperopt 102 | if config.hyperopt: 103 | from adapter.hyperopt_adapter import HyperoptAdapter 104 | logger.info('Start hyperopt') 105 | old = old_results.get_result('hyperopt') 106 | if old is not None: 107 | logger.info('Reusing old score of {}'.format(old.score)) 108 | else: 109 | hyperopt = HyperoptAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 110 | stats = hyperopt.optimize(b) 111 | benchmark_result.add_result(stats) 112 | persistence.store_results(benchmark_result, stats) 113 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 114 | logger.info(stats) 115 | 116 | # bohb 117 | if config.bohb: 118 | from adapter.bohb import BohbAdapter 119 | logger.info('Start bohb') 120 | old = old_results.get_result('BOHB') 121 | if old is not None: 122 | logger.info('Reusing old score of {}'.format(old.score)) 123 | else: 124 | bohb = BohbAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 125 | stats = bohb.optimize(b) 126 | benchmark_result.add_result(stats) 127 | persistence.store_results(benchmark_result, stats) 128 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 129 | logger.info(stats) 130 | 131 | # RoBo 132 | if config.robo: 133 | from adapter.robo import RoBoAdapter 134 | logger.info('Start robo') 135 | old = old_results.get_result('RoBo gp') 136 | if old is not None: 137 | logger.info('Reusing old score of {}'.format(old.score)) 138 | else: 139 | robo = RoBoAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 140 | stats = robo.optimize(b, model_type='gp') 141 | benchmark_result.add_result(stats) 142 | persistence.store_results(benchmark_result, stats) 143 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 144 | logger.info(stats) 145 | 146 | # Optunity 147 | if config.optunity: 148 | from adapter.optunity_adapter import OptunityAdapter 149 | logger.info('Start optunity') 150 | old = old_results.get_result('Optunity') 151 | if old is not None: 152 | 
logger.info('Reusing old score of {}'.format(old.score)) 153 | else: 154 | optunity = OptunityAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 155 | stats = optunity.optimize(b) 156 | benchmark_result.add_result(stats) 157 | persistence.store_results(benchmark_result, stats) 158 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 159 | logger.info(stats) 160 | 161 | # BTB 162 | if config.btb: 163 | from adapter.btb_adapter import BtbAdapter 164 | logger.info('Start btb') 165 | old = old_results.get_result('BTB') 166 | if old is not None: 167 | logger.info('Reusing old score of {}'.format(old.score)) 168 | else: 169 | btb = BtbAdapter(config.n_jobs, config.timeout, config.iterations, config.seed) 170 | stats = btb.optimize(b) 171 | benchmark_result.add_result(stats) 172 | persistence.store_results(benchmark_result, stats) 173 | logger.info('Finished after {}s'.format(stats.end - stats.start)) 174 | logger.info(stats) 175 | 176 | 177 | if __name__ == '__main__': 178 | parser = argparse.ArgumentParser() 179 | parser.add_argument('--database', type=str, default='localhost') 180 | parser.add_argument('--chunk', type=int, default=None) 181 | args = parser.parse_args() 182 | 183 | util.logger.setup(args.chunk) 184 | logger = util.logger.get() 185 | 186 | warnings.simplefilter(action='ignore', category=FutureWarning) 187 | 188 | logger.info('Main start') 189 | try: 190 | persistence = MongoPersistence(url=args.database, db='tmp')  # honor the --database argument instead of hard-coding 'localhost' 191 | 192 | task_ids = [9910, 14952, 146817, 146819, 146820, 146824, 167121, 167124, 167125, 167140, 167141] 193 | for task in task_ids: 194 | logger.info('#######\nStarting task {}\n#######'.format(task)) 195 | for i in range(10): 196 | logger.info('##\nIteration {} at {}\n##'.format(i, datetime.datetime.now().time())) 197 | bm = benchmark.OpenMLBenchmark(task, test_size=None) 198 | for fold in range(len(bm.folds)): 199 | bm.fold = fold 200 | run(persistence, bm, i) 201 | except (SystemExit, KeyboardInterrupt, Exception) as e: 202 | logger.error(e, exc_info=True) 203 | 204 | logger.info('Main finished') 205 | -------------------------------------------------------------------------------- /adapter/run_auto_sklearn.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import shutil 3 | import time 4 | from typing import List, Union  # Union is required by load_pipeline/load_model below 5 | 6 | import sklearn.datasets 7 | import sklearn.metrics 8 | import sklearn.model_selection 9 | from autosklearn.classification import AutoSklearnClassifier 10 | from autosklearn.constants import * 11 | from autosklearn.metrics import accuracy 12 | from smac.facade.roar_facade import ROAR 13 | from smac.scenario.scenario import Scenario 14 | 15 | from benchmark import OpenMLBenchmark 16 | 17 | 18 | def get_random_search_object_callback(scenario_dict, seed, ta, backend, metalearning_configurations, runhistory): 19 | """Random search.""" 20 | scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob() 21 | scenario_dict['minR'] = len(scenario_dict['instances']) 22 | scenario_dict['initial_incumbent'] = 'RANDOM' 23 | scenario = Scenario(scenario_dict) 24 | return ROAR( 25 | scenario=scenario, 26 | rng=seed, 27 | tae_runner=ta, 28 | runhistory=runhistory, 29 | run_id=seed 30 | ) 31 | 32 | 33 | def skip(id: int) -> bool: 34 | failed = [167124] 35 | return id in failed 36 | 37 | 38 | def setup(): 39 | try: 40 | shutil.rmtree('/tmp/autosklearn/') 41 | except OSError: 42 | pass 43 | 44 |
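# Annotation, not part of the original script: a minimal sketch of how main() below might be
# driven, assuming an OpenMLBenchmark as defined in benchmark/open_ml.py; the task id and
# budget values are illustrative only:
#
#     bm = OpenMLBenchmark(31)  # hypothetical task id
#     error = main(bm.folds[0], bm, timeout=3600, run_timeout=360, jobs=4, random=False)
#     print('Misclassification rate: {}'.format(error))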
45 | def main(fold, bm: OpenMLBenchmark, timeout: int, run_timeout: int, jobs: int, random: bool, 46 | score: bool = True) -> float: 47 | def get_spawn_classifier(X_train, y_train, tmp_folder, output_folder, seed0): 48 | def spawn_classifier(seed, dataset_name): 49 | # Use the initial configurations from meta-learning only in one out of 50 | # the processes spawned. This prevents auto-sklearn from evaluating the 51 | # same configurations in all processes. 52 | if seed == seed0 and not random: 53 | initial_configurations_via_metalearning = 25 54 | smac_scenario_args = {} 55 | else: 56 | initial_configurations_via_metalearning = 0 57 | smac_scenario_args = {'initial_incumbent': 'RANDOM'} 58 | 59 | callback = None 60 | if random: 61 | callback = get_random_search_object_callback 62 | 63 | # Arguments that differ from other runs of auto-sklearn: 64 | # 1. all classifiers write to the same output directory 65 | # 2. shared_mode is set to True; this enables sharing of data between 66 | # models. 67 | # 3. all instances of the AutoSklearnClassifier must have a different seed! 68 | automl = AutoSklearnClassifier( 69 | time_left_for_this_task=timeout, 70 | per_run_time_limit=run_timeout, 71 | shared_mode=True, 72 | tmp_folder=tmp_folder, 73 | output_folder=output_folder, 74 | delete_tmp_folder_after_terminate=False, 75 | ensemble_size=0, 76 | initial_configurations_via_metalearning=initial_configurations_via_metalearning, 77 | seed=seed, 78 | smac_scenario_args=smac_scenario_args, 79 | get_smac_object_callback=callback, 80 | ml_memory_limit=4096 81 | ) 82 | automl.fit(X_train, y_train, dataset_name=dataset_name) 83 | print(automl.sprint_statistics()) 84 | 85 | return spawn_classifier 86 | 87 | name = bm.task_id 88 | 89 | setup() 90 | X_train, y_train, X_test, y_test = fold 91 | 92 | tmp_folder = '/tmp/autosklearn/{}/tmp'.format(name) 93 | output_folder = '/tmp/autosklearn/{}/out'.format(name) 94 | 95 | seed = int(time.time()) 96 | ensemble_size = 1 if random else 20 97 | 98 | processes = [] 99 | spawn_classifier = get_spawn_classifier(X_train, y_train, tmp_folder, output_folder, seed) 100 | for i in range(jobs): 101 | p = multiprocessing.Process(target=spawn_classifier, args=(seed + i, name)) 102 | p.start() 103 | processes.append(p) 104 | 105 | start = time.time() 106 | while time.time() - start <= 1.05 * timeout: 107 | if any(p.is_alive() for p in processes): 108 | time.sleep(10) 109 | else: 110 | break 111 | else: 112 | print('Grace period exceeded.
Killing workers.') 113 | for p in processes: 114 | p.terminate() 115 | p.join() 116 | 117 | print('Starting to build an ensemble!') 118 | automl = AutoSklearnClassifier( 119 | time_left_for_this_task=3600, 120 | per_run_time_limit=run_timeout, 121 | shared_mode=True, 122 | ensemble_size=ensemble_size, 123 | tmp_folder=tmp_folder, 124 | output_folder=output_folder, 125 | initial_configurations_via_metalearning=0, 126 | seed=seed, 127 | ml_memory_limit=4096 128 | ) 129 | automl.fit_ensemble( 130 | y_train, 131 | task=MULTICLASS_CLASSIFICATION, 132 | metric=accuracy, 133 | precision='32', 134 | dataset_name=name, 135 | ensemble_size=ensemble_size 136 | ) 137 | 138 | print(automl.show_models()) 139 | if score: 140 | predictions = automl.predict(X_test) 141 | return 1 - sklearn.metrics.accuracy_score(y_test, predictions) 142 | else: 143 | automl.target_type = 'multilabel-indicator' 144 | predictions = automl.predict_proba(X_test) 145 | return sklearn.metrics.roc_auc_score(y_test, predictions[:, 1]), automl 146 | 147 | 148 | # noinspection PyUnresolvedReferences 149 | def load_pipeline(input: str) -> List[List[str]]: 150 | from autosklearn.evaluation.abstract_evaluator import MyDummyClassifier 151 | from autosklearn.pipeline.classification import SimpleClassificationPipeline 152 | from autosklearn.pipeline.components.classification import ClassifierChoice 153 | from autosklearn.pipeline.components.data_preprocessing.rescaling import RescalingChoice 154 | from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice 155 | from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing 156 | from autosklearn.pipeline.components.data_preprocessing.rescaling.none import NoRescalingComponent 157 | from autosklearn.pipeline.components.data_preprocessing.one_hot_encoding import OHEChoice 158 | from autosklearn.pipeline.components.data_preprocessing.one_hot_encoding.no_encoding import NoEncoding 159 | 160 | res = [] 161 | try: 162 | pipelines: List[Union[SimpleClassificationPipeline, MyDummyClassifier]] = load_model(input) 163 | for pipeline in pipelines: 164 | res.append([]) 165 | if isinstance(pipeline[1], MyDummyClassifier): 166 | res[-1].append(type(pipeline[1]).__name__) 167 | else: 168 | for s in pipeline[1].steps: 169 | if isinstance(s[1], FeaturePreprocessorChoice) or isinstance(s[1], ClassifierChoice) or \ 170 | isinstance(s[1], RescalingChoice) or isinstance(s[1], OHEChoice): 171 | choice = s[1].choice 172 | if isinstance(choice, NoPreprocessing) or isinstance(choice, NoRescalingComponent) or \ 173 | isinstance(choice, NoEncoding): 174 | continue 175 | res[-1].append(type(s[1].choice).__name__) 176 | else: 177 | res[-1].append(type(s[1]).__name__) 178 | 179 | for i in range(len(res[-1])): 180 | n = res[-1][i] 181 | 182 | if n.endswith('Component'): 183 | n = n[:-len('Component')] 184 | if n == 'LibLinear_SVC': 185 | n = 'LinearSVC' 186 | if n == 'LibSVM_SVC': 187 | n = 'SVC' 188 | if n == 'KNearestNeighborsClassifier': 189 | n = 'KNeighborsClassifier' 190 | if n == 'RandomForest': 191 | n = 'RandomForestClassifier' 192 | if n == 'SelectPercentileClassification': 193 | n = 'SelectPercentile' 194 | if n == 'ExtraTreesPreprocessorClassification': 195 | n = 'SelectFromModel' 196 | 197 | res[-1][i] = n 198 | except Exception: 199 | print(input) 200 | raise 201 | return res 202 | 203 | 204 | # noinspection PyUnresolvedReferences 205 | def load_model(input: str) -> List[List[str]]: 206 | from autosklearn.evaluation.abstract_evaluator 
import MyDummyClassifier 207 | from autosklearn.pipeline.classification import SimpleClassificationPipeline 208 | from autosklearn.pipeline.components.classification import ClassifierChoice 209 | from autosklearn.pipeline.components.data_preprocessing.rescaling import RescalingChoice 210 | from autosklearn.pipeline.components.feature_preprocessing import FeaturePreprocessorChoice 211 | from autosklearn.pipeline.components.feature_preprocessing.no_preprocessing import NoPreprocessing 212 | from autosklearn.pipeline.components.data_preprocessing.rescaling.none import NoRescalingComponent 213 | from autosklearn.pipeline.components.data_preprocessing.one_hot_encoding import OHEChoice 214 | from autosklearn.pipeline.components.data_preprocessing.one_hot_encoding.no_encoding import NoEncoding 215 | 216 | pipelines: List[Union[SimpleClassificationPipeline, MyDummyClassifier]] = eval(input) 217 | return pipelines 218 | -------------------------------------------------------------------------------- /assets/classifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "sklearn.naive_bayes.BernoulliNB": { 3 | "alpha": { 4 | "type": "uniform_float", 5 | "lower": 1e-2, 6 | "upper": 100, 7 | "default_value": 1 8 | }, 9 | "fit_prior": { 10 | "type": "categorical", 11 | "choices": [ 12 | true, 13 | false 14 | ], 15 | "default_value": true 16 | } 17 | }, 18 | "sklearn.naive_bayes.MultinomialNB": { 19 | "alpha": { 20 | "type": "uniform_float", 21 | "lower": 1e-2, 22 | "upper": 100, 23 | "default_value": 1, 24 | "log": true 25 | }, 26 | "fit_prior": { 27 | "type": "categorical", 28 | "choices": [ 29 | true, 30 | false 31 | ], 32 | "default_value": true 33 | } 34 | }, 35 | "sklearn.tree.DecisionTreeClassifier": { 36 | "criterion": { 37 | "type": "categorical", 38 | "choices": [ 39 | "gini", 40 | "entropy" 41 | ], 42 | "default_value": "gini" 43 | }, 44 | "max_depth": { 45 | "type": "uniform_float", 46 | "lower": 0.1, 47 | "upper": 2.0, 48 | "default_value": 0.5 49 | }, 50 | "min_samples_split": { 51 | "type": "uniform_int", 52 | "lower": 2, 53 | "upper": 20, 54 | "default_value": 2 55 | }, 56 | "min_samples_leaf": { 57 | "type": "uniform_int", 58 | "lower": 1, 59 | "upper": 20, 60 | "default_value": 1 61 | } 62 | }, 63 | "sklearn.ensemble.ExtraTreesClassifier": { 64 | "criterion": { 65 | "type": "categorical", 66 | "choices": [ 67 | "gini", 68 | "entropy" 69 | ], 70 | "default_value": "gini" 71 | }, 72 | "max_features": { 73 | "type": "uniform_float", 74 | "lower": 0.0, 75 | "upper": 1.0, 76 | "default_value": 0.5 77 | }, 78 | "min_samples_split": { 79 | "type": "uniform_int", 80 | "lower": 2, 81 | "upper": 20, 82 | "default_value": 2 83 | }, 84 | "min_samples_leaf": { 85 | "type": "uniform_int", 86 | "lower": 1, 87 | "upper": 20, 88 | "default_value": 1 89 | }, 90 | "bootstrap": { 91 | "type": "categorical", 92 | "choices": [ 93 | true, 94 | false 95 | ], 96 | "default_value": false 97 | } 98 | }, 99 | "sklearn.ensemble.GradientBoostingClassifier": { 100 | "n_estimators": { 101 | "type": "uniform_int", 102 | "lower": 50, 103 | "upper": 500, 104 | "default_value": 100 105 | }, 106 | "learning_rate": { 107 | "type": "uniform_float", 108 | "lower": 0.01, 109 | "upper": 1.0, 110 | "default_value": 0.1, 111 | "log": true 112 | }, 113 | "max_depth": { 114 | "type": "uniform_int", 115 | "lower": 1, 116 | "upper": 10, 117 | "default_value": 3 118 | }, 119 | "criterion": { 120 | "type": "categorical", 121 | "choices": [ 122 | "friedman_mse", 123 | "mse", 124 | "mae" 125 
| ], 126 | "default_value": "mse" 127 | }, 128 | "min_samples_split": { 129 | "type": "uniform_int", 130 | "lower": 2, 131 | "upper": 20, 132 | "default_value": 2 133 | }, 134 | "min_samples_leaf": { 135 | "type": "uniform_int", 136 | "lower": 1, 137 | "upper": 20, 138 | "default_value": 1 139 | } 140 | }, 141 | "sklearn.ensemble.RandomForestClassifier": { 142 | "criterion": { 143 | "type": "categorical", 144 | "choices": [ 145 | "gini", 146 | "entropy" 147 | ], 148 | "default_value": "gini" 149 | }, 150 | "max_features": { 151 | "type": "uniform_float", 152 | "lower": 0.0, 153 | "upper": 1.0, 154 | "default_value": 0.5 155 | }, 156 | "n_estimators": { 157 | "type": "uniform_int", 158 | "lower": 2, 159 | "upper": 100, 160 | "default_value": 10 161 | }, 162 | "min_samples_split": { 163 | "type": "uniform_int", 164 | "lower": 2, 165 | "upper": 20, 166 | "default_value": 2 167 | }, 168 | "min_samples_leaf": { 169 | "type": "uniform_int", 170 | "lower": 1, 171 | "upper": 20, 172 | "default_value": 1 173 | }, 174 | "bootstrap": { 175 | "type": "categorical", 176 | "choices": [ 177 | true, 178 | false 179 | ], 180 | "default_value": true 181 | } 182 | }, 183 | "sklearn.neighbors.KNeighborsClassifier": { 184 | "n_neighbors": { 185 | "type": "uniform_int", 186 | "lower": 1, 187 | "upper": 100, 188 | "default_value": 1, 189 | "log": true 190 | }, 191 | "weights": { 192 | "type": "categorical", 193 | "choices": [ 194 | "uniform", 195 | "distance" 196 | ], 197 | "default_value": "uniform" 198 | }, 199 | "p": { 200 | "type": "categorical", 201 | "choices": [ 202 | 1, 203 | 2 204 | ], 205 | "default_value": 2 206 | } 207 | }, 208 | "sklearn.discriminant_analysis.LinearDiscriminantAnalysis": { 209 | "solver": { 210 | "type": "categorical", 211 | "choices": [ 212 | "svd", 213 | "lsqr", 214 | "eigen" 215 | ], 216 | "default_value": "svd" 217 | }, 218 | "shrinkage": { 219 | "type": "uniform_float", 220 | "lower": 0.0, 221 | "upper": 1.0, 222 | "default_value": 0.5, 223 | "condition": { 224 | "parent": "solver", 225 | "value": [ 226 | "lsqr", 227 | "eigen" 228 | ] 229 | } 230 | }, 231 | "n_components": { 232 | "type": "uniform_int", 233 | "lower": 1, 234 | "upper": 250, 235 | "default_value": 10 236 | }, 237 | "tol": { 238 | "type": "uniform_float", 239 | "lower": 1e-5, 240 | "upper": 1e-1, 241 | "default_value": 1e-4, 242 | "log": true 243 | } 244 | }, 245 | "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis": { 246 | "reg_param": { 247 | "type": "uniform_float", 248 | "lower": 0.0, 249 | "upper": 1.0, 250 | "default_value": 0.0 251 | } 252 | }, 253 | "sklearn.svm.LinearSVC": { 254 | "penalty": { 255 | "type": "categorical", 256 | "choices": [ 257 | "l1", 258 | "l2" 259 | ], 260 | "default_value": "l2" 261 | }, 262 | "loss": { 263 | "type": "categorical", 264 | "choices": [ 265 | "hinge", 266 | "squared_hinge" 267 | ], 268 | "default_value": "squared_hinge" 269 | }, 270 | "tol": { 271 | "type": "uniform_float", 272 | "lower": 1e-5, 273 | "upper": 1e-1, 274 | "default_value": 1e-4, 275 | "log": true 276 | }, 277 | "C": { 278 | "type": "uniform_float", 279 | "lower": 0.03125, 280 | "upper": 32768, 281 | "default_value": 1.0, 282 | "log": true 283 | } 284 | }, 285 | "sklearn.svm.SVC": { 286 | "C": { 287 | "type": "uniform_float", 288 | "lower": 0.03125, 289 | "upper": 32768, 290 | "default_value": 1.0, 291 | "log": true 292 | }, 293 | "kernel": { 294 | "type": "categorical", 295 | "choices": [ 296 | "rbf", 297 | "poly", 298 | "sigmoid" 299 | ], 300 | "default_value": "rbf" 301 | }, 302 | "degree": { 
303 | "type": "uniform_int", 304 | "lower": 2, 305 | "upper": 5, 306 | "default_value": 3, 307 | "condition": { 308 | "parent": "kernel", 309 | "value": [ 310 | "poly" 311 | ] 312 | } 313 | }, 314 | "gamma": { 315 | "type": "uniform_float", 316 | "lower": 3.0517578125e-05, 317 | "upper": 8, 318 | "default_value": 0.1, 319 | "log": true 320 | }, 321 | "coef0": { 322 | "type": "uniform_float", 323 | "lower": -1.0, 324 | "upper": 1.0, 325 | "default_value": 0.0, 326 | "condition": { 327 | "parent": "kernel", 328 | "value": [ 329 | "poly", 330 | "sigmoid" 331 | ] 332 | } 333 | }, 334 | "shrinking": { 335 | "type": "categorical", 336 | "choices": [ 337 | true, 338 | false 339 | ], 340 | "default_value": true 341 | }, 342 | "tol": { 343 | "type": "uniform_float", 344 | "lower": 1e-5, 345 | "upper": 1e-1, 346 | "default_value": 1e-3, 347 | "log": true 348 | } 349 | }, 350 | "sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier": { 351 | "C": { 352 | "type": "uniform_float", 353 | "lower": 1e-5, 354 | "upper": 10, 355 | "default_value": 1.0, 356 | "log": true 357 | }, 358 | "loss": { 359 | "type": "categorical", 360 | "choices": [ 361 | "hinge", 362 | "squared_hinge" 363 | ], 364 | "default_value": "hinge" 365 | }, 366 | "tol": { 367 | "type": "uniform_float", 368 | "lower": 1e-5, 369 | "upper": 1e-1, 370 | "default_value": 1e-4, 371 | "log": true 372 | }, 373 | "average": { 374 | "type": "categorical", 375 | "choices": [ 376 | false, 377 | true 378 | ], 379 | "default_value": false 380 | } 381 | }, 382 | "sklearn.linear_model.stochastic_gradient.SGDClassifier": { 383 | "loss": { 384 | "type": "categorical", 385 | "choices": [ 386 | "hinge", 387 | "log", 388 | "modified_huber", 389 | "squared_hinge", 390 | "perceptron" 391 | ], 392 | "default_value": "log" 393 | }, 394 | "penalty": { 395 | "type": "categorical", 396 | "choices": [ 397 | "l1", 398 | "l2", 399 | "elasticnet" 400 | ], 401 | "default_value": "l2" 402 | }, 403 | "alpha": { 404 | "type": "uniform_float", 405 | "lower": 1e-7, 406 | "upper": 1e-1, 407 | "default_value": 0.0001, 408 | "log": true 409 | }, 410 | "l1_ratio": { 411 | "type": "uniform_float", 412 | "lower": 1e-9, 413 | "upper": 1, 414 | "default_value": 0.15, 415 | "log": true 416 | }, 417 | "tol": { 418 | "type": "uniform_float", 419 | "lower": 1e-5, 420 | "upper": 1e-1, 421 | "default_value": 1e-4, 422 | "log": true 423 | }, 424 | "epsilon": { 425 | "type": "uniform_float", 426 | "lower": 1e-5, 427 | "upper": 1e-1, 428 | "default_value": 1e-4, 429 | "log": true 430 | }, 431 | "learning_rate": { 432 | "type": "categorical", 433 | "choices": [ 434 | "optimal", 435 | "invscaling", 436 | "constant" 437 | ], 438 | "default_value": "invscaling" 439 | }, 440 | "eta0": { 441 | "type": "uniform_float", 442 | "lower": 1e-7, 443 | "upper": 1e-1, 444 | "default_value": 0.01, 445 | "log": true 446 | }, 447 | "power_t": { 448 | "type": "uniform_float", 449 | "lower": 1e-5, 450 | "upper": 1.0, 451 | "default_value": 0.5 452 | }, 453 | "average": { 454 | "type": "categorical", 455 | "choices": [ 456 | false, 457 | true 458 | ], 459 | "default_value": false 460 | } 461 | } 462 | } -------------------------------------------------------------------------------- /benchmark/synthetic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hpolib.abstract_benchmark import AbstractBenchmark 3 | from hpolib.benchmarks import synthetic_functions 4 | 5 | from benchmark.base import _dict_as_array, meta_information 6 | from 
config import BaseConverter, MetaConfig, NoopConverter 7 | 8 | 9 | class Bohachevsky(synthetic_functions.Bohachevsky): 10 | 11 | @_dict_as_array 12 | @AbstractBenchmark._configuration_as_array 13 | @meta_information 14 | def objective_function(self, x, **kwargs): 15 | y = 0.7 + x[0] ** 2 + 2.0 * x[1] ** 2 16 | y -= 0.3 * np.cos(3.0 * np.pi * x[0]) 17 | y -= 0.4 * np.cos(4.0 * np.pi * x[1]) 18 | 19 | return {'function_value': y} 20 | 21 | @staticmethod 22 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 23 | cs = { 24 | "x0": { 25 | "type": "uniform_float", 26 | "lower": -100, 27 | "upper": 100 28 | }, 29 | "x1": { 30 | "type": "uniform_float", 31 | "lower": -100, 32 | "upper": 100 33 | } 34 | } 35 | return converter.convert_single(MetaConfig(cs)) 36 | 37 | 38 | class Branin(synthetic_functions.Branin): 39 | 40 | @_dict_as_array 41 | @AbstractBenchmark._configuration_as_array 42 | @meta_information 43 | def objective_function(self, x, **kwargs): 44 | y = (x[1] - (5.1 / (4 * np.pi ** 2)) * x[0] ** 2 + 5 * x[0] / np.pi - 6) ** 2 45 | y += 10 * (1 - 1 / (8 * np.pi)) * np.cos(x[0]) + 10 46 | 47 | return {'function_value': y} 48 | 49 | def objective_function_test(self, x, **kwargs): 50 | return self.objective_function(x) 51 | 52 | @staticmethod 53 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 54 | cs = { 55 | "x0": { 56 | "type": "uniform_float", 57 | "lower": -5, 58 | "upper": 10 59 | }, 60 | "x1": { 61 | "type": "uniform_float", 62 | "lower": 0, 63 | "upper": 15 64 | } 65 | } 66 | return converter.convert_single(MetaConfig(cs)) 67 | 68 | 69 | class Camelback(synthetic_functions.Camelback): 70 | 71 | @_dict_as_array 72 | @AbstractBenchmark._configuration_as_array 73 | @meta_information 74 | def objective_function(self, x, **kwargs): 75 | y = (4 - 2.1 * (x[0] ** 2) + ((x[0] ** 4) / 3)) * (x[0] ** 2) + x[0] * x[1] + (-4 + 4 * (x[1] ** 2)) * \ 76 | (x[1] ** 2) 77 | return {'function_value': y} 78 | 79 | @staticmethod 80 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 81 | cs = { 82 | "x0": { 83 | "type": "uniform_float", 84 | "lower": -5, 85 | "upper": 5 86 | }, 87 | "x1": { 88 | "type": "uniform_float", 89 | "lower": -5, 90 | "upper": 5 91 | } 92 | } 93 | return converter.convert_single(MetaConfig(cs)) 94 | 95 | 96 | class Forrester(synthetic_functions.Forrester): 97 | 98 | @_dict_as_array 99 | @AbstractBenchmark._configuration_as_array 100 | @meta_information 101 | def objective_function(self, x, fidelity=1, **kwargs): 102 | x = x[0] 103 | y1 = np.power(6 * x - 2, 2) * np.sin(12 * x - 4) 104 | 105 | # best least-squared fit with cubic polynomial 106 | y2 = 131.09227753 * (x ** 3) - 164.50286816 * (x ** 2) + 50.7228373 * x - 2.84345244 107 | return {'function_value': fidelity * y1 + (1 - fidelity) * y2, 'cost': fidelity ** 2} 108 | 109 | @staticmethod 110 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 111 | cs = { 112 | "x": { 113 | "type": "uniform_float", 114 | "lower": 0, 115 | "upper": 1 116 | } 117 | } 118 | return converter.convert_single(MetaConfig(cs)) 119 | 120 | 121 | class GoldsteinPrice(synthetic_functions.GoldsteinPrice): 122 | 123 | @_dict_as_array 124 | @AbstractBenchmark._configuration_as_array 125 | @meta_information 126 | def objective_function(self, x, **kwargs): 127 | y = (1 + (x[0] + x[1] + 1) ** 2 * ( 128 | 19 - 14 * x[0] + 3 * x[0] ** 2 - 14 * x[1] + 6 * x[0] * x[1] + 3 * x[1] ** 2)) \ 129 | * (30 + (2 * x[0] - 3 * x[1]) ** 2 * ( 130 | 18 - 32 * x[0] + 12 * x[0] ** 
2 + 48 * x[1] - 36 * x[0] * x[1] + 27 * x[1] ** 2)) 131 | 132 | return {'function_value': y} 133 | 134 | @staticmethod 135 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 136 | cs = { 137 | "x0": { 138 | "type": "uniform_float", 139 | "lower": -2, 140 | "upper": 2 141 | }, 142 | "x1": { 143 | "type": "uniform_float", 144 | "lower": -2, 145 | "upper": 2 146 | } 147 | } 148 | return converter.convert_single(MetaConfig(cs)) 149 | 150 | 151 | class Hartmann3(synthetic_functions.Hartmann3): 152 | 153 | @_dict_as_array 154 | @AbstractBenchmark._configuration_as_array 155 | @meta_information 156 | def objective_function(self, x, **kwargs): 157 | external_sum = 0 158 | for i in range(4): 159 | internal_sum = 0 160 | for j in range(3): 161 | internal_sum += self.A[i, j] * (x[j] - self.P[i, j]) ** 2 162 | external_sum += self.alpha[i] * np.exp(-internal_sum) 163 | 164 | return {'function_value': -external_sum} 165 | 166 | @staticmethod 167 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 168 | cs = {} 169 | for i in range(3): 170 | cs[f"x{i}"] = { 171 | "type": "uniform_float", 172 | "lower": 0, 173 | "upper": 1 174 | } 175 | return converter.convert_single(MetaConfig(cs)) 176 | 177 | 178 | class Hartmann6(synthetic_functions.Hartmann6): 179 | 180 | @_dict_as_array 181 | @AbstractBenchmark._configuration_as_array 182 | @meta_information 183 | def objective_function(self, x, **kwargs): 184 | """6d Hartmann test function 185 | input bounds: 0 <= xi <= 1, i = 1..6 186 | global optimum: (0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573), 187 | min function value = -3.32237 188 | """ 189 | 190 | external_sum = 0 191 | for i in range(4): 192 | internal_sum = 0 193 | for j in range(6): 194 | internal_sum += self.A[i, j] * (x[j] - self.P[i, j]) ** 2 195 | external_sum += self.alpha[i] * np.exp(-internal_sum) 196 | 197 | return {'function_value': -external_sum} 198 | 199 | @staticmethod 200 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 201 | cs = {} 202 | for i in range(6): 203 | cs[f"x{i}"] = { 204 | "type": "uniform_float", 205 | "lower": 0, 206 | "upper": 1 207 | } 208 | return converter.convert_single(MetaConfig(cs)) 209 | 210 | 211 | class Levy(synthetic_functions.Levy): 212 | 213 | @_dict_as_array 214 | @AbstractBenchmark._configuration_as_array 215 | @meta_information 216 | def objective_function(self, x, **kwargs): 217 | z = 1 + ((x[0] - 1.) / 4.) 
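# Annotation: this is the one-dimensional Levy function,
# f(x) = sin^2(pi * z) + (z - 1)^2 * (1 + sin^2(2 * pi * z)) with z = 1 + (x - 1) / 4;
# its global minimum is f(1) = 0.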
218 | s = np.power((np.sin(np.pi * z)), 2) 219 | y = (s + ((z - 1) ** 2) * (1 + np.power((np.sin(2 * np.pi * z)), 2))) 220 | 221 | return {'function_value': y} 222 | 223 | @staticmethod 224 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 225 | cs = { 226 | "x": { 227 | "type": "uniform_float", 228 | "lower": -15, 229 | "upper": 10 230 | } 231 | } 232 | return converter.convert_single(MetaConfig(cs)) 233 | 234 | 235 | # Rosenbrock2D and Rosenbrock5D are not implemented 236 | 237 | class Rosenbrock10D(synthetic_functions.rosenbrock.Rosenbrock10D): 238 | 239 | @_dict_as_array 240 | @AbstractBenchmark._configuration_as_array 241 | @meta_information 242 | def objective_function(self, x, **kwargs): 243 | y = 0 244 | d = 10 245 | for i in range(d - 1): 246 | y += 100 * (x[i + 1] - x[i] ** 2) ** 2 247 | y += (x[i] - 1) ** 2 248 | 249 | return {'function_value': y} 250 | 251 | @staticmethod 252 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 253 | cs = {} 254 | for i in range(10): 255 | cs[f"x{i}"] = { 256 | "type": "uniform_float", 257 | "lower": 0, 258 | "upper": 1 259 | } 260 | return converter.convert_single(MetaConfig(cs)) 261 | 262 | 263 | class Rosenbrock20D(synthetic_functions.rosenbrock.Rosenbrock20D): 264 | 265 | @_dict_as_array 266 | @AbstractBenchmark._configuration_as_array 267 | @meta_information 268 | def objective_function(self, x, **kwargs): 269 | y = 0 270 | d = 20 271 | for i in range(d - 1): 272 | y += 100 * (x[i + 1] - x[i] ** 2) ** 2 273 | y += (x[i] - 1) ** 2 274 | 275 | return {'function_value': y} 276 | 277 | @staticmethod 278 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 279 | cs = {} 280 | for i in range(20): 281 | cs[f"x{i}"] = { 282 | "type": "uniform_float", 283 | "lower": 0, 284 | "upper": 1 285 | } 286 | return converter.convert_single(MetaConfig(cs)) 287 | 288 | 289 | class SinOne(synthetic_functions.SinOne): 290 | 291 | @_dict_as_array 292 | @AbstractBenchmark._configuration_as_array 293 | @meta_information 294 | def objective_function(self, x, **kwargs): 295 | y = 0.5 * np.sin(13 * x[0]) * np.sin(27 * x[0]) + 0.5 296 | 297 | return {'function_value': y} 298 | 299 | @staticmethod 300 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 301 | cs = { 302 | "x": { 303 | "type": "uniform_float", 304 | "lower": 0, 305 | "upper": 1 306 | } 307 | } 308 | return converter.convert_single(MetaConfig(cs)) 309 | 310 | 311 | class SinTwo(synthetic_functions.SinTwo): 312 | 313 | @_dict_as_array 314 | @AbstractBenchmark._configuration_as_array 315 | @meta_information 316 | def objective_function(self, x, **kwargs): 317 | y = (0.5 * np.sin(13 * x[0]) * np.sin(27 * x[0]) + 0.5) * (0.5 * np.sin(13 * x[1]) * np.sin(27 * x[1]) + 0.5) 318 | 319 | return {'function_value': y} 320 | 321 | @staticmethod 322 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 323 | cs = { 324 | "x0": { 325 | "type": "uniform_float", 326 | "lower": 0, 327 | "upper": 1 328 | }, 329 | "x1": { 330 | "type": "uniform_float", 331 | "lower": 0, 332 | "upper": 1 333 | } 334 | } 335 | return converter.convert_single(MetaConfig(cs)) 336 | -------------------------------------------------------------------------------- /test/test_converter.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import numpy as np 5 | import scipy.stats 6 | from ConfigSpace.conditions import InCondition 7 | from 
ConfigSpace.hyperparameters import CategoricalHyperparameter, UniformFloatHyperparameter, \ 8 | UniformIntegerHyperparameter 9 | from hyperopt import hp 10 | from hyperopt.pyll import scope 11 | 12 | from config import MetaConfigCollection, ConfigurationSpace, TpotConverter, HyperoptConverter, RandomSearchConverter, \ 13 | GridSearchConverter, OptunityConverter, BtbConverter 14 | from config.converter import ConfigSpaceConverter 15 | 16 | 17 | class SaneEqualityArray(np.ndarray): 18 | def __eq__(self, other): 19 | return (self.shape == other.shape and 20 | np.allclose(self, other)) 21 | 22 | 23 | class SaneEqualityDist(scipy.stats.distributions.uniform_gen): 24 | def __eq__(self, other): 25 | return (self.a == other.kwds['loc'] and self.b == other.kwds['scale']) 26 | 27 | 28 | class TestConfigSpaceConverter(TestCase): 29 | 30 | def setUp(self): 31 | self.config = MetaConfigCollection.from_json(os.path.join(os.path.dirname(__file__), 'config.json')) 32 | 33 | def test_cs_convert(self): 34 | instance = ConfigSpaceConverter() 35 | 36 | actual = instance.convert(self.config) 37 | expected = self.__get_expected_cs() 38 | self.assertEqual(expected, actual) 39 | 40 | def test_tpot_convert(self): 41 | instance = TpotConverter() 42 | 43 | actual = instance.convert(self.config) 44 | expected = self.__get_expected_tpot() 45 | self.assertEqual(expected, actual) 46 | 47 | def test_hyperopt_convert(self): 48 | instance = HyperoptConverter(as_scope=False) 49 | 50 | actual = instance.convert(self.config) 51 | expected = self.__get_expected_hp() 52 | 53 | self.assertEqual(str(expected), str(actual)) 54 | 55 | def test_random_search_convert(self): 56 | instance = RandomSearchConverter() 57 | 58 | actual = instance.convert(self.config) 59 | expected = self.__get_expected_random_search() 60 | self.assertEqual(expected, actual) 61 | 62 | def test_grid_search_convert(self): 63 | instance = GridSearchConverter() 64 | 65 | actual = instance.convert(self.config) 66 | expected = self.__get_expected_grid_search() 67 | self.assertEqual(expected, actual) 68 | 69 | def test_optunity_convert(self): 70 | instance = OptunityConverter() 71 | 72 | actual = instance.convert(self.config) 73 | expected = self.__get_expected_optunity() 74 | self.assertEqual(expected, actual) 75 | 76 | def test_btb_convert(self): 77 | instance = BtbConverter() 78 | 79 | actual = instance.convert(self.config) 80 | expected = self.__get_expected_btb() 81 | self.assertEqual(expected, actual) 82 | 83 | @staticmethod 84 | def __get_expected_cs(): 85 | svc = ConfigurationSpace() 86 | kernel = CategoricalHyperparameter('kernel', ['linear', 'rbf', 'poly', 'sigmoid'], default_value='poly') 87 | svc.add_hyperparameter(kernel) 88 | C = UniformFloatHyperparameter('C', 0.001, 1000.0, default_value=1.0) 89 | shrinking = CategoricalHyperparameter('shrinking', [True, False], default_value=True) 90 | svc.add_hyperparameters([C, shrinking]) 91 | degree = UniformIntegerHyperparameter('degree', 1, 5, default_value=3) # Only used by kernel poly 92 | coef0 = UniformFloatHyperparameter('coef0', 0.0, 10.0, default_value=0.0) # poly, sigmoid 93 | svc.add_hyperparameters([degree, coef0]) 94 | use_degree = InCondition(child=degree, parent=kernel, values=['poly']) 95 | use_coef0 = InCondition(child=coef0, parent=kernel, values=['poly', 'sigmoid']) 96 | svc.add_conditions([use_degree, use_coef0]) 97 | gamma = UniformFloatHyperparameter('gamma', 0.0001, 8, default_value=1) 98 | svc.add_hyperparameters([gamma]) 99 | svc.add_condition(InCondition(child=gamma, parent=kernel, 
values=['rbf', 'poly', 'sigmoid'])) 100 | 101 | cs = ConfigurationSpace() 102 | estimator = CategoricalHyperparameter('__choice__', ['sklearn.svm.SVC'], default_value='sklearn.svm.SVC') 103 | cs.add_hyperparameter(estimator) 104 | parent_hyperparameter = {'parent': estimator, 'value': 'sklearn.svm.SVC'} 105 | cs.add_configuration_space('sklearn.svm.SVC', svc, parent_hyperparameter=parent_hyperparameter) 106 | 107 | return cs 108 | 109 | @staticmethod 110 | def __get_expected_tpot(): 111 | SaneEqualityArray((2,), buffer=np.array([10.0, 1.0])) 112 | 113 | return { 114 | 'sklearn.svm.SVC': { 115 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 116 | 'C': SaneEqualityArray((10,), buffer=np.array( 117 | [1.000000e-03, 1.000009e+02, 2.000008e+02, 3.000007e+02, 4.000006e+02, 5.000005e+02, 6.000004e+02, 118 | 7.000003e+02, 8.000002e+02, 9.000001e+02])), 119 | 'shrinking': [True, False], 120 | 'degree': range(1, 5), 121 | 'gamma': SaneEqualityArray((10,), buffer=np.array( 122 | [1.00000e-04, 8.00090e-01, 1.60008e+00, 2.40007e+00, 3.20006e+00, 4.00005e+00, 123 | 4.80004e+00, 5.60003e+00, 6.40002e+00, 7.20001e+00])), 124 | 125 | 'coef0': SaneEqualityArray((10,), buffer=np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])) 126 | } 127 | } 128 | 129 | @staticmethod 130 | def __get_expected_hp(): 131 | expected_hp_space = hp.choice('estimator_type', [ 132 | hp.choice('custom_sklearn.svm.SVC', [ 133 | { 134 | 'algorithm': 'sklearn.svm.SVC', 135 | 'kernel': 'linear', 136 | 'C': hp.uniform('custom_sklearn.svm.SVC_linear_C', 0.001, 1000.0), 137 | 'shrinking': hp.choice('custom_sklearn.svm.SVC_linear_shrinking', [True, False]) 138 | }, 139 | { 140 | 'algorithm': 'sklearn.svm.SVC', 141 | 'kernel': 'rbf', 142 | 'C': hp.uniform('custom_sklearn.svm.SVC_rbf_C', 0.001, 1000.0), 143 | 'gamma': hp.uniform('custom_sklearn.svm.SVC_rbf_gamma', 0.0001, 8), 144 | 'shrinking': hp.choice('custom_sklearn.svm.SVC_rbf_shrinking', [True, False]) 145 | }, 146 | { 147 | 'algorithm': 'sklearn.svm.SVC', 148 | 'kernel': 'poly', 149 | 'C': hp.uniform('custom_sklearn.svm.SVC_poly_C', 0.001, 1000.0), 150 | 'gamma': hp.uniform('custom_sklearn.svm.SVC_poly_gamma', 0.0001, 8), 151 | 'degree': scope.int(hp.quniform('custom_sklearn.svm.SVC_poly_degree', 1, 5, 1)), 152 | 'coef0': hp.uniform('custom_sklearn.svm.SVC_poly_coef0', 0.0, 10.0), 153 | 'shrinking': hp.choice('custom_sklearn.svm.SVC_poly_shrinking', [True, False]) 154 | }, 155 | { 156 | 'algorithm': 'sklearn.svm.SVC', 157 | 'kernel': 'sigmoid', 158 | 'C': hp.uniform('custom_sklearn.svm.SVC_sigmoid_C', 0.001, 1000.0), 159 | 'gamma': hp.uniform('custom_sklearn.svm.SVC_sigmoid_gamma', 0.0001, 8), 160 | 'coef0': hp.uniform('custom_sklearn.svm.SVC_sigmoid_coef0', 0.0, 10.0), 161 | 'shrinking': hp.choice('custom_sklearn.svm.SVC_sigmoid_shrinking', [True, False]) 162 | } 163 | ]) 164 | ]) 165 | 166 | return expected_hp_space 167 | 168 | @staticmethod 169 | def __get_expected_random_search(): 170 | return { 171 | 'sklearn.svm.SVC': { 172 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 173 | 'C': SaneEqualityDist(a=0.001, b=999.999), 174 | 'shrinking': [True, False], 175 | 'degree': range(1, 5), 176 | 'gamma': SaneEqualityDist(a=0.0001, b=7.9999), 177 | 'coef0': SaneEqualityDist(a=0.0, b=10) 178 | } 179 | } 180 | 181 | @staticmethod 182 | def __get_expected_grid_search(): 183 | return { 184 | 'sklearn.svm.SVC': { 185 | 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 186 | 'C': SaneEqualityArray((10,), buffer=np.array( 187 | [1.00000e-03, 1.11112e+02, 2.22223e+02, 3.33334e+02, 4.44445e+02, 
5.55556e+02, 6.66667e+02, 188 | 7.77778e+02, 8.88889e+02, 1.00000e+03])), 189 | 'shrinking': [True, False], 190 | 'degree': SaneEqualityArray((4,), buffer=np.array([1., 2., 3., 4.])), 191 | 'gamma': SaneEqualityArray((10,), buffer=np.array( 192 | [0.0001, 0.8889777777777778, 1.7778555555555555, 2.6667333333333336, 3.5556111111111113, 193 | 4.4444888888888885, 5.333366666666667, 6.222244444444444, 7.111122222222222, 8.0])), 194 | 'coef0': SaneEqualityArray((10,), buffer=np.array( 195 | [0., 1.11111111, 2.22222222, 3.33333333, 4.44444444, 5.55555556, 6.66666667, 7.77777778, 8.88888889, 196 | 10.])) 197 | } 198 | } 199 | 200 | @staticmethod 201 | def __get_expected_optunity(): 202 | return { 203 | 'algorithm': { 204 | 'sklearn.svm.SVC': { 205 | 'kernel': { 206 | 'linear': { 207 | 'C': [0.001, 1000.0], 208 | 'shrinking': {'True': None, 'False': None} 209 | }, 210 | 'rbf': { 211 | 'C': [0.001, 1000.0], 212 | 'gamma': [0.0001, 8], 213 | 'shrinking': {'True': None, 'False': None} 214 | }, 215 | 'poly': { 216 | 'C': [0.001, 1000.0], 217 | 'gamma': [0.0001, 8], 218 | 'degree': {'1': None, '2': None, '3': None, '4': None}, 219 | 'coef0': [0, 10], 220 | 'shrinking': {'True': None, 'False': None} 221 | }, 222 | 'sigmoid': { 223 | 'C': [0.001, 1000.0], 224 | 'gamma': [0.0001, 8], 225 | 'coef0': [0, 10], 226 | 'shrinking': {'True': None, 'False': None} 227 | } 228 | } 229 | } 230 | } 231 | } 232 | 233 | @staticmethod 234 | def __get_expected_btb(): 235 | return [ 236 | { 237 | 'name': 'sklearn.svm.SVC', 238 | 'class': 'sklearn.svm.SVC', 239 | 'hyperparameters': { 240 | 'C': { 241 | 'type': 'float', 242 | 'range': [0.001, 1000.0] 243 | }, 244 | 'gamma': { 245 | 'type': 'float', 246 | 'range': [0.0001, 8], 247 | }, 248 | 'kernel': { 249 | 'type': 'string', 250 | 'values': ['linear', 'rbf', 'poly', 'sigmoid'] 251 | }, 252 | 'degree': { 253 | 'type': 'int', 254 | 'range': [1, 5] 255 | }, 256 | 'coef0': { 257 | 'type': 'float', 258 | 'range': [0.0, 10.0] 259 | }, 260 | 'shrinking': { 261 | 'type': 'bool', 262 | 'values': [True, False] 263 | } 264 | }, 265 | 'root_hyperparameters': ['kernel', 'C', 'shrinking'], 266 | 'conditional_hyperparameters': { 267 | 'kernel': { 268 | 'rbf': ['gamma'], 269 | 'sigmoid': ['gamma', 'coef0'], 270 | 'poly': ['gamma', 'degree', 'coef0'] 271 | } 272 | } 273 | } 274 | ] 275 | -------------------------------------------------------------------------------- /benchmark/open_ml.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import time 4 | from typing import Generator, Optional 5 | 6 | import math 7 | import numpy as np 8 | import openml 9 | import pandas as pd 10 | from sklearn.model_selection import train_test_split, KFold 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | import util.logger 14 | from benchmark import AbstractBenchmark, create_estimator 15 | from config import BaseConverter, NoopConverter, MetaConfigCollection 16 | 17 | logger = util.logger.get() 18 | 19 | 20 | class OpenMLDataManager(): 21 | def __init__(self, openml_task_id: int, rng=None): 22 | self.X = None 23 | self.y = None 24 | self.categorical = None 25 | self.names = None 26 | self.folds = [] 27 | 28 | self.save_to = os.path.expanduser('~/OpenML') 29 | self.task_id = openml_task_id 30 | 31 | if rng is None: 32 | self.rng = np.random.RandomState() 33 | else: 34 | self.rng = rng 35 | 36 | if not os.path.isdir(self.save_to): 37 | logger.debug('Create directory {}'.format(self.save_to)) 38 | os.makedirs(self.save_to) 
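# Annotation: everything fetched from OpenML is cached below ~/OpenML (see
# set_cache_directory below), so repeated benchmark runs reuse the local copies
# instead of re-downloading data sets.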
39 | 40 | openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de' 41 | openml.config.set_cache_directory(self.save_to) 42 | 43 | def load(self, shuffle: bool = False) -> 'OpenMLDataManager': 44 | ''' 45 | Loads the dataset for the configured task from the local OpenML cache. 46 | Downloads the data first if necessary. 47 | 48 | Returns 49 | ------- 50 | self : OpenMLDataManager 51 | The manager itself, with X, y, categorical and names populated; 52 | the OpenMLHoldoutDataManager and OpenMLCVDataManager subclasses 53 | additionally create train/test folds. 54 | ''' 55 | 56 | task = openml.tasks.get_task(self.task_id) 57 | 58 | dataset = openml.datasets.get_dataset(dataset_id=task.dataset_id) 59 | X, y, categorical, self.names = dataset.get_data( 60 | target=dataset.default_target_attribute 61 | ) 62 | 63 | for name, cat in zip(self.names, categorical): 64 | if cat: 65 | enc = LabelEncoder() 66 | missing = np.any(pd.isna(X[name])) 67 | 68 | missing_vec = pd.isna(X[name]) 69 | 70 | x_tmp = X[name].cat.add_categories(['']).fillna('') 71 | X[name] = enc.fit_transform(x_tmp) 72 | 73 | if missing: 74 | idx = enc.transform([''])[0] 75 | X[name][X[name] == idx] = np.nan 76 | assert pd.isna(X[name]).equals(missing_vec) 77 | 78 | X = X.values 79 | y = y.values.__array__() 80 | self.y = LabelEncoder().fit_transform(y) 81 | self.X = X.astype(np.float64) 82 | 83 | if shuffle: 84 | perm = self.rng.permutation(X.shape[0])  # renamed from 'shuffle' to avoid shadowing the argument 85 | self.X = self.X[perm] 86 | self.y = self.y[perm] 87 | 88 | self.categorical = categorical 89 | return self 90 | 91 | 92 | class OpenMLHoldoutDataManager(OpenMLDataManager): 93 | 94 | def load(self, test_size: float = 0.3) -> 'OpenMLHoldoutDataManager': 95 | super().load() 96 | X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=test_size) 97 | ls = [X_train, y_train, X_test, y_test] 98 | self.folds.append(ls) 99 | return self 100 | 101 | 102 | class OpenMLCVDataManager(OpenMLDataManager): 103 | 104 | def load(self, n_splits: int = 4) -> 'OpenMLCVDataManager': 105 | super().load() 106 | kf = KFold(n_splits=n_splits) 107 | for train_index, test_index in kf.split(self.X): 108 | X_train, X_test = self.X[train_index], self.X[test_index] 109 | y_train, y_test = self.y[train_index], self.y[test_index] 110 | 111 | ls = [X_train, y_train, X_test, y_test] 112 | self.folds.append(ls) 113 | return self 114 | 115 | 116 | class OpenMLBenchmark(AbstractBenchmark): 117 | 118 | def __init__(self, task_id: int, test_size: Optional[float] = 0.3, n_folds: Optional[int] = 4, load: bool = True): 119 | super().__init__() 120 | self.task_id = task_id 121 | self.fold = None 122 | 123 | if load: 124 | if test_size is not None: 125 | data = OpenMLHoldoutDataManager(task_id).load(test_size) 126 | else: 127 | data = OpenMLCVDataManager(task_id).load(n_folds) 128 | self.folds = data.folds 129 | self.categorical = data.categorical 130 | self.column_names = data.names 131 |
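#     Annotation, not part of the original source: objective_function below returns the
#     mean misclassification rate over all folds; each fold is fitted in a subprocess with
#     a hard 30 second budget. A hedged usage sketch, where the configuration format is
#     whatever create_estimator in benchmark/base.py expects (not shown in this file):
#
#         bm = OpenMLBenchmark(31)  # hypothetical task id
#         res = bm.objective_function(configuration)
#         print(res['function_value'], res['cost'])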
132 | def objective_function(self, configuration, timeout: int = 300, budget=1, seed=None): 133 | start_time = time.time() 134 | manager = multiprocessing.Manager() 135 | score = manager.Value('d', 1.0) 136 | avg_score = 0 137 | 138 | # logger.debug('Testing configuration {}'.format(configuration)) 139 | for idx, fold in enumerate(self.folds): 140 | X_train, y_train, X_test, y_test = fold 141 | 142 | size = int(budget * X_train.shape[0]) 143 | X_train = X_train[:size] 144 | y_train = y_train[:size] 145 | 146 | p = multiprocessing.Process(target=self._fit_and_score, 147 | args=(configuration, X_train, y_train, X_test, y_test, score)) 148 | p.start() 149 | p.join(30)  # hard 30 second fitting budget per fold; the timeout argument is not used here 150 | 151 | if p.is_alive(): 152 | logger.debug('Abort fitting after timeout') 153 | p.terminate() 154 | p.join() 155 | avg_score += score.value 156 | 157 | c = time.time() - start_time 158 | return {'function_value': avg_score / len(self.folds), 'cost': c, 'start': start_time, 'end': start_time + c} 159 | 160 | def _fit_and_score(self, configuration, X_train, y_train, X_test, y_test, score): 161 | try: 162 | clf = create_estimator(configuration) 163 | clf = clf.fit(X_train, y_train) 164 | score.value = 1 - clf.score(X_test, y_test) 165 | except Exception as ex: 166 | logger.error('Uncaught exception {} for {}'.format(ex, configuration)) 167 | 168 | def objective_function_test(self, configuration, **kwargs): 169 | pass 170 | 171 | @staticmethod 172 | def get_configuration_space(converter: BaseConverter = NoopConverter()): 173 | return converter.convert(MetaConfigCollection.from_json('assets/classifier.json')) 174 | 175 | def get_meta_information(self): 176 | return {'name': 'OpenML_Task_{}'.format(self.task_id), 'cash': True} 177 | 178 | 179 | def fix_no_tags(result_dict, tag): 180 | v = result_dict.get(tag, []) 181 | if isinstance(v, list): 182 | return v 183 | elif isinstance(v, dict): 184 | return [v] 185 | else: 186 | raise TypeError() 187 | 188 | 189 | class OpenMLCSVBenchmark(OpenMLBenchmark): 190 | 191 | def __init__(self, train_file: str, target_column: str, test_file: str, n_splits: int = 4): 192 | super().__init__(-1, load=False) 193 | self.rng = np.random.RandomState()  # not provided by the base class, but required for the shuffle below 194 | X_train = pd.read_csv(train_file) 195 | y_train = X_train[target_column] 196 | X_train.drop(target_column, axis=1, inplace=True) 197 | 198 | X_test = pd.read_csv(test_file) 199 | self.names = X_test.columns 200 | self.column_names = list(self.names) 201 | self.categorical = [False] * X_test.shape[1]  # one flag per column, not per row 202 | self.X_test = X_test.values 203 | 204 | X = X_train.values 205 | y = y_train.values.__array__() 206 | self.y = LabelEncoder().fit_transform(y) 207 | self.X = X.astype(np.float64) 208 | 209 | perm = self.rng.permutation(X.shape[0]) 210 | self.X = self.X[perm] 211 | self.y = self.y[perm] 212 | 213 | self.folds = [] 214 | # kf = KFold(n_splits=n_splits) 215 | # for train_index, test_index in kf.split(self.X): 216 | # X_train, X_test = self.X[train_index], self.X[test_index] 217 | # y_train, y_test = self.y[train_index], self.y[test_index] 218 | # 219 | # ls = [X_train, y_train, X_test, y_test] 220 | # self.folds.append(ls) 221 | X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3) 222 | ls = [X_train, y_train, X_test, y_test] 223 | self.folds.append(ls) 224 | 225 | def format_output(self, predictions, algorithm: str, fold: int): 226 | pass 227 | 228 | 229 | class OttoBenchmark(OpenMLCSVBenchmark): 230 | 231 | def __init__(self, n_splits: int = 4): 232 | super().__init__('assets/otto/train.csv', 'target', 'assets/otto/test.csv', n_splits) 233 | 234 | # Remove index column 235 | for i in range(len(self.folds)): 236 | self.folds[i][0] = self.folds[i][0][:, 1:] 237 | self.folds[i][2] = self.folds[i][2][:, 1:] 238 | 239 | self.names = self.names[1:] 240 | self.column_names = self.column_names[1:] 241 | 242 | self.indices = self.X_test[:, 0] 243 | self.X_test = self.X_test[:, 1:] 244 | 245 | def format_output(self, predictions, algorithm: str, fold: int): 246 | res = np.zeros((len(predictions), 10)) 247 | res[:, 0] = self.indices 248 | 249 | if predictions.ndim == 1: 250 | for row, column in enumerate(predictions): 251 | res[row, column + 1] = 1 252 | else: 253 | res[:, 1:] = predictions 254 | 255 | df = pd.DataFrame( 256 | columns=['id', 'Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 257 | 'Class_9'], 258 | data=res) 259 | df['id'] = df['id'].astype(int) 260 | 261 | file = 'assets/otto/{}/{}_{}.csv'.format(algorithm, algorithm, fold) 262 | df.to_csv(file, index=False) 263 | 264 | print(predictions) 265 | 266 |
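# Annotation, not part of the original source: OttoBenchmark above and SantanderBenchmark
# below appear to wrap the Kaggle 'Otto Group Product Classification Challenge' and
# 'Santander Customer Satisfaction' data sets; their format_output methods write one
# submission-style CSV per algorithm and fold under assets/otto/ and assets/santander/.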
267 | class SantanderBenchmark(OpenMLCSVBenchmark): 268 | 269 | def __init__(self, n_splits: int = 4): 270 | super().__init__('assets/santander/train.csv', 'TARGET', 'assets/santander/test.csv', n_splits) 271 | 272 | # Remove index column 273 | for i in range(len(self.folds)): 274 | self.folds[i][0] = self.folds[i][0][:, 1:] 275 | self.folds[i][2] = self.folds[i][2][:, 1:] 276 | 277 | self.names = self.names[1:] 278 | self.column_names = self.column_names[1:] 279 | 280 | self.indices = self.X_test[:, 0] 281 | self.X_test = self.X_test[:, 1:] 282 | 283 | def format_output(self, predictions, algorithm: str, fold: int): 284 | res = np.zeros((len(predictions), 2)) 285 | res[:, 0] = self.indices 286 | res[:, 1] = predictions[:, 1] 287 | 288 | df = pd.DataFrame( 289 | columns=['ID', 'TARGET'], 290 | data=res) 291 | df['ID'] = df['ID'].astype(int) 292 | 293 | file = 'assets/santander/{}/{}_{}.csv'.format(algorithm, algorithm, fold) 294 | df.to_csv(file, index=False) 295 | 296 | print(predictions) 297 | 298 | 299 | openml.study.functions._multitag_to_list = fix_no_tags 300 | 301 | 302 | class OpenML100Suite: 303 | 304 | def __init__(self): 305 | self.save_to = os.path.expanduser('~/OpenML') 306 | 307 | if not os.path.isdir(self.save_to): 308 | logger.info('Create directory {}'.format(self.save_to)) 309 | os.makedirs(self.save_to) 310 | 311 | openml.config.apikey = '610344db6388d9ba34f6db45a3cf71de' 312 | openml.config.set_cache_directory(self.save_to) 313 | 314 | @staticmethod 315 | def load(chunk: int = None, total_chunks: int = 8) -> Generator[OpenMLBenchmark, None, None]: 316 | benchmark_suite = openml.study.get_study('OpenML100', 'tasks') 317 | chunk_size = int(math.ceil(len(benchmark_suite.tasks) / total_chunks)) 318 | 319 | for i, task_id in enumerate(benchmark_suite.tasks): 320 | if chunk is not None and (i < chunk * chunk_size or i >= (chunk + 1) * chunk_size): 321 | continue 322 | 323 | if task_id in [34536]: 324 | logger.info('Skipping broken OpenML benchmark {}'.format(task_id)) 325 | else: 326 | logger.debug('Loading OpenML benchmark {}'.format(task_id)) 327 | yield OpenMLBenchmark(task_id) 328 | 329 | @staticmethod 330 | def tasks(): 331 | benchmark_suite = openml.study.get_study('OpenML100', 'tasks') 332 | return benchmark_suite.tasks 333 | 334 | 335 | if __name__ == '__main__': 336 | util.logger.setup() 337 | 338 | suite = OpenML100Suite() 339 | ls = [] 340 | for benchmark in suite.load(): 341 | print(len(benchmark.folds[0][0]) + len(benchmark.folds[0][2]))  # train + test size of the first fold; the benchmark has no X_train/X_test attributes 342 | ls.append(benchmark) 343 | logger.info(len(ls)) 344 | -------------------------------------------------------------------------------- /util/mean_shift.py: -------------------------------------------------------------------------------- 1 | """Mean shift clustering algorithm. 2 | 3 | Mean shift clustering aims to discover *blobs* in a smooth density of 4 | samples. It is a centroid based algorithm, which works by updating candidates 5 | for centroids to be the mean of the points within a given region. These 6 | candidates are then filtered in a post-processing stage to eliminate 7 | near-duplicates to form the final set of centroids. 8 | 9 | Seeding is performed using a binning technique for scalability.
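The implementation below is adapted from scikit-learn's mean shift module; the
main difference is that neighbour searches use the Gower distance defined in
this file instead of the Euclidean metric, so that mixed numerical/categorical
vectors can be clustered.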
10 | """ 11 | 12 | # Authors: Conrad Lee 13 | # Alexandre Gramfort 14 | # Gael Varoquaux 15 | # Martino Sorbaro 16 | 17 | import numpy as np 18 | 19 | from sklearn.base import BaseEstimator, ClusterMixin 20 | from sklearn.cluster import get_bin_seeds 21 | from sklearn.cluster.mean_shift_ import _mean_shift_single_seed 22 | from sklearn.metrics import pairwise_distances_argmin 23 | from sklearn.neighbors import NearestNeighbors 24 | from sklearn.utils import Parallel, delayed, check_array, check_random_state, gen_batches 25 | from sklearn.utils.validation import check_is_fitted 26 | 27 | 28 | def gower_distances(X, Y): 29 | """Compute the distances between the observations in X and Y, 30 | that may contain mixed types of data, using an implementation 31 | of Gower formula. 32 | Parameters 33 | ---------- 34 | X : array-like, or pandas.DataFrame, shape (n_samples, n_features) 35 | Y : array-like, or pandas.DataFrame, optional, 36 | shape (n_samples, n_features) 37 | categorical_features : array-like, optional, shape (n_features) 38 | Indicates with True/False whether a column is a categorical attribute. 39 | This is useful when categorical atributes are represented as integer 40 | values. Categorical ordinal attributes are treated as numeric, and 41 | must be marked as false. 42 | Alternatively, the categorical_features array can be represented only 43 | with the numerical indexes of the categorical attribtes. 44 | If the categorical_features array is not provided, by default all 45 | non-numeric columns are considered categorical. 46 | scale : boolean, list or array, optional (default=True) 47 | Indicates if the numerical columns will be scaled between 0 and 1. 48 | If false, it is assumed the numerical columns are already scaled. 49 | If a list or array, it must countain the ranges of values from 50 | numerical columns. 51 | Returns 52 | ------- 53 | similarities : ndarray, shape (n_samples_X, n_samples_Y) 54 | References 55 | ---------- 56 | Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its 57 | Properties. 58 | Notes 59 | ----- 60 | The numeric feature ranges are determined from both X and Y. 61 | Current implementation does not support sparse matrices. 62 | All the non-numerical types (e.g., str), are treated as categorical 63 | features. 64 | This implementation modifies the Gower's original similarity measure in 65 | the folowing aspects: 66 | * The values in the original similarity S range between 0 and 1. To 67 | guarantee this, it is assumed the numerical features of X and Y are 68 | scaled between 0 and 1. 69 | * Different from the original similarity S, this implementation 70 | returns 1-S. 71 | """ 72 | cat_mask = np.logical_or(X < 0, Y < 0) 73 | num_mask = ~ cat_mask 74 | 75 | # Calculates the similarities for categorical columns 76 | cat_dists = (X[cat_mask] != Y[cat_mask]) 77 | # Calculates the Manhattan distances for numerical columns 78 | num_dists = abs(X[num_mask] - Y[num_mask]) 79 | 80 | # Calculates the number of non missing columns 81 | non_missing = X.shape[0] 82 | 83 | # Gets the final results 84 | total = np.sum(cat_dists) + np.sum(num_dists) 85 | 86 | results = total / non_missing 87 | 88 | return results 89 | 90 | 91 | def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0, 92 | n_jobs=None): 93 | """Estimate the bandwidth to use with the mean-shift algorithm. 94 | 95 | That this function takes time at least quadratic in n_samples. For large 96 | datasets, it's wise to set that parameter to a small value. 
97 | 
98 |     Parameters
99 |     ----------
100 |     X : array-like, shape=[n_samples, n_features]
101 |         Input points.
102 | 
103 |     quantile : float, default 0.3
104 |         Should be between [0, 1];
105 |         0.5 means that the median of all pairwise distances is used.
106 | 
107 |     n_samples : int, optional
108 |         The number of samples to use. If not given, all samples are used.
109 | 
110 |     random_state : int, RandomState instance or None (default)
111 |         The generator used to randomly select the samples from input points
112 |         for bandwidth estimation. Use an int to make the randomness
113 |         deterministic.
114 |         See :term:`Glossary <random_state>`.
115 | 
116 |     n_jobs : int or None, optional (default=None)
117 |         The number of parallel jobs to run for neighbors search.
118 |         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
119 |         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
120 |         for more details.
121 | 
122 |     Returns
123 |     -------
124 |     bandwidth : float
125 |         The bandwidth parameter.
126 |     """
127 |     X = check_array(X)
128 | 
129 |     random_state = check_random_state(random_state)
130 |     if n_samples is not None:
131 |         idx = random_state.permutation(X.shape[0])[:n_samples]
132 |         X = X[idx]
133 |     n_neighbors = int(X.shape[0] * quantile)
134 |     if n_neighbors < 1:  # cannot fit NearestNeighbors with n_neighbors = 0
135 |         n_neighbors = 1
136 |     nbrs = NearestNeighbors(n_neighbors=n_neighbors,
137 |                             n_jobs=n_jobs,
138 |                             metric=gower_distances)
139 |     nbrs.fit(X)
140 | 
141 |     bandwidth = 0.
142 |     for batch in gen_batches(len(X), 500):
143 |         d, _ = nbrs.kneighbors(X[batch, :], return_distance=True)
144 |         bandwidth += np.max(d, axis=1).sum()
145 | 
146 |     return bandwidth / X.shape[0]
147 | 
148 | 
149 | def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False,
150 |                min_bin_freq=1, cluster_all=True, max_iter=300,
151 |                n_jobs=None):
152 |     """Perform mean shift clustering of data using a flat kernel.
153 | 
154 |     Read more in the :ref:`User Guide <mean_shift>`.
155 | 
156 |     Parameters
157 |     ----------
158 | 
159 |     X : array-like, shape=[n_samples, n_features]
160 |         Input data.
161 | 
162 |     bandwidth : float, optional
163 |         Kernel bandwidth.
164 | 
165 |         If bandwidth is not given, it is estimated with the Gower-based
166 |         estimate_bandwidth function defined above. This takes time at least
167 |         quadratic in the number of samples, so for large datasets it is
168 |         wise to estimate the bandwidth once and pass it in explicitly.
169 | 
170 |     seeds : array-like, shape=[n_seeds, n_features] or None
171 |         Points used as initial kernel locations. If None and bin_seeding=False,
172 |         each data point is used as a seed. If None and bin_seeding=True,
173 |         see bin_seeding.
174 | 
175 |     bin_seeding : boolean, default=False
176 |         If true, initial kernel locations are not locations of all
177 |         points, but rather the location of the discretized version of
178 |         points, where points are binned onto a grid whose coarseness
179 |         corresponds to the bandwidth. Setting this option to True will speed
180 |         up the algorithm because fewer seeds will be initialized.
181 |         Ignored if seeds argument is not None.
182 | 
183 |     min_bin_freq : int, default=1
184 |         To speed up the algorithm, accept only those bins with at least
185 |         min_bin_freq points as seeds.
186 | 
187 |     cluster_all : boolean, default True
188 |         If true, then all points are clustered, even those orphans that are
189 |         not within any kernel. Orphans are assigned to the nearest kernel.
190 |         If false, then orphans are given cluster label -1.
191 | 
192 |     max_iter : int, default 300
193 |         Maximum number of iterations per seed point before the clustering
194 |         operation terminates (for that seed point), if it has not converged yet.
195 | 
196 |     n_jobs : int or None, optional (default=None)
197 |         The number of jobs to use for the computation. This works by
198 |         computing the run for each seed in parallel.
199 | 
200 |         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
201 |         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
202 |         for more details.
203 | 
204 |     .. versionadded:: 0.17
205 |        Parallel Execution using *n_jobs*.
206 | 
207 |     Returns
208 |     -------
209 | 
210 |     cluster_centers : array, shape=[n_clusters, n_features]
211 |         Coordinates of cluster centers.
212 | 
213 |     labels : array, shape=[n_samples]
214 |         Cluster labels for each point.
215 | 
216 |     Notes
217 |     -----
218 |     For an example, see :ref:`examples/cluster/plot_mean_shift.py
219 |     <sphx_glr_auto_examples_cluster_plot_mean_shift.py>`.
220 | 
221 |     """
222 | 
223 |     if bandwidth is None:
224 |         bandwidth = estimate_bandwidth(X, n_jobs=n_jobs)
225 |         print(bandwidth)  # print the estimated bandwidth (debug output)
226 |     elif bandwidth <= 0:
227 |         raise ValueError("bandwidth needs to be greater than zero or None,"
228 |                          " got %f" % bandwidth)
229 |     if seeds is None:
230 |         if bin_seeding:
231 |             seeds = get_bin_seeds(X, bandwidth, min_bin_freq)
232 |         else:
233 |             seeds = X
234 |     n_samples, n_features = X.shape
235 |     center_intensity_dict = {}
236 | 
237 |     # We use n_jobs=1 because this will be used in nested calls under
238 |     # parallel calls to _mean_shift_single_seed so there is no need
239 |     # for further parallelism.
240 |     nbrs = NearestNeighbors(radius=bandwidth, n_jobs=1, metric=gower_distances).fit(X)
241 | 
242 |     # execute iterations on all seeds in parallel
243 |     all_res = Parallel(n_jobs=n_jobs)(
244 |         delayed(_mean_shift_single_seed)
245 |         (seed, X, nbrs, max_iter) for seed in seeds)
246 |     # copy the results into a dictionary
247 |     for i in range(len(seeds)):
248 |         if all_res[i] is not None:
249 |             center_intensity_dict[all_res[i][0]] = all_res[i][1]
250 | 
251 |     if not center_intensity_dict:
252 |         # nothing near seeds
253 |         raise ValueError("No point was within bandwidth=%f of any seed."
254 |                          " Try a different seeding strategy"
255 |                          " or increase the bandwidth."
256 |                          % bandwidth)
257 | 
258 |     # POST PROCESSING: remove near duplicate points
259 |     # If the distance between two kernels is less than the bandwidth,
260 |     # then we have to remove one because it is a duplicate. Remove the
261 |     # one with fewer points.
262 | 
263 |     sorted_by_intensity = sorted(center_intensity_dict.items(),
264 |                                  key=lambda tup: (tup[1], tup[0]),
265 |                                  reverse=True)
266 |     sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])
267 |     unique = np.ones(len(sorted_centers), dtype=bool)
268 |     nbrs = NearestNeighbors(radius=bandwidth,
269 |                             n_jobs=n_jobs,
270 |                             metric=gower_distances).fit(sorted_centers)
271 |     for i, center in enumerate(sorted_centers):
272 |         if unique[i]:
273 |             neighbor_idxs = nbrs.radius_neighbors([center],
274 |                                                   return_distance=False)[0]
275 |             unique[neighbor_idxs] = 0
276 |             unique[i] = 1  # leave the current point as unique
277 |     cluster_centers = sorted_centers[unique]
278 | 
279 |     # ASSIGN LABELS: a point belongs to the cluster that it is closest to
280 |     nbrs = NearestNeighbors(n_neighbors=1, n_jobs=n_jobs, metric=gower_distances).fit(cluster_centers)
281 |     labels = np.zeros(n_samples, dtype=int)
282 |     distances, idxs = nbrs.kneighbors(X)
283 |     if cluster_all:
284 |         labels = idxs.flatten()
285 |     else:
286 |         labels.fill(-1)
287 |         bool_selector = distances.flatten() <= bandwidth
288 |         labels[bool_selector] = idxs.flatten()[bool_selector]
289 |     return cluster_centers, labels
290 | 
291 | 
292 | class CustomMeanShift(BaseEstimator, ClusterMixin):
293 |     """Mean shift clustering using a flat kernel.
294 | 
295 |     Mean shift clustering aims to discover "blobs" in a smooth density of
296 |     samples. It is a centroid-based algorithm, which works by updating
297 |     candidates for centroids to be the mean of the points within a given
298 |     region. These candidates are then filtered in a post-processing stage to
299 |     eliminate near-duplicates to form the final set of centroids.
300 | 
301 |     Seeding is performed using a binning technique for scalability.
302 | 
303 |     Read more in the :ref:`User Guide <mean_shift>`.
304 | 
305 |     Parameters
306 |     ----------
307 |     bandwidth : float, optional
308 |         Bandwidth used in the RBF kernel.
309 | 
310 |         If not given, the bandwidth is estimated using the Gower-based
311 |         estimate_bandwidth function in this module; see the documentation
312 |         for that function for hints on scalability (see also the Notes, below).
313 | 
314 |     seeds : array, shape=[n_samples, n_features], optional
315 |         Seeds used to initialize kernels. If not set,
316 |         the seeds are calculated by clustering.get_bin_seeds
317 |         with bandwidth as the grid size and default values for
318 |         other parameters.
319 | 
320 |     bin_seeding : boolean, optional
321 |         If true, initial kernel locations are not locations of all
322 |         points, but rather the location of the discretized version of
323 |         points, where points are binned onto a grid whose coarseness
324 |         corresponds to the bandwidth. Setting this option to True will speed
325 |         up the algorithm because fewer seeds will be initialized.
326 |         default value: False
327 |         Ignored if seeds argument is not None.
328 | 
329 |     min_bin_freq : int, optional
330 |         To speed up the algorithm, accept only those bins with at least
331 |         min_bin_freq points as seeds. If not defined, set to 1.
332 | 
333 |     cluster_all : boolean, default True
334 |         If true, then all points are clustered, even those orphans that are
335 |         not within any kernel. Orphans are assigned to the nearest kernel.
336 |         If false, then orphans are given cluster label -1.
337 | 
338 |     n_jobs : int or None, optional (default=None)
339 |         The number of jobs to use for the computation. This works by
340 |         computing the run for each seed in parallel.
341 | 
342 |         ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
343 |         ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
344 |         for more details.
345 | 
346 |     Attributes
347 |     ----------
348 |     cluster_centers_ : array, [n_clusters, n_features]
349 |         Coordinates of cluster centers.
350 | 
351 |     labels_ :
352 |         Labels of each point.
353 | 
354 |     Examples
355 |     --------
356 |     >>> from sklearn.cluster import MeanShift
357 |     >>> import numpy as np
358 |     >>> X = np.array([[1, 1], [2, 1], [1, 0],
359 |     ...               [4, 7], [3, 5], [3, 6]])
360 |     >>> clustering = MeanShift(bandwidth=2).fit(X)
361 |     >>> clustering.labels_
362 |     array([1, 1, 1, 0, 0, 0])
363 |     >>> clustering.predict([[0, 0], [5, 5]])
364 |     array([1, 0])
365 |     >>> clustering # doctest: +NORMALIZE_WHITESPACE
366 |     MeanShift(bandwidth=2, bin_seeding=False, cluster_all=True, min_bin_freq=1,
367 |               n_jobs=None, seeds=None)
368 | 
369 |     Notes
370 |     -----
371 | 
372 |     Scalability:
373 | 
374 |     Because this implementation uses a flat kernel and
375 |     a Ball Tree to look up members of each kernel, the complexity will tend
376 |     towards O(T*n*log(n)) in lower dimensions, with n the number of samples
377 |     and T the number of points. In higher dimensions the complexity will
378 |     tend towards O(T*n^2).
379 | 
380 |     Scalability can be boosted by using fewer seeds, for example by using
381 |     a higher value of min_bin_freq in the get_bin_seeds function.
382 | 
383 |     Note that the estimate_bandwidth function is much less scalable than the
384 |     mean shift algorithm and will be the bottleneck if it is used.
385 | 
386 |     References
387 |     ----------
388 | 
389 |     Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward
390 |     feature space analysis". IEEE Transactions on Pattern Analysis and
391 |     Machine Intelligence. 2002. pp. 603-619.
392 | 
393 |     """
394 | 
395 |     def __init__(self, bandwidth=None, seeds=None, bin_seeding=False,
396 |                  min_bin_freq=1, cluster_all=True, n_jobs=None):
397 |         self.bandwidth = bandwidth
398 |         self.seeds = seeds
399 |         self.bin_seeding = bin_seeding
400 |         self.cluster_all = cluster_all
401 |         self.min_bin_freq = min_bin_freq
402 |         self.n_jobs = n_jobs
403 | 
404 |     def fit(self, X, y=None):
405 |         """Perform clustering.
406 | 
407 |         Parameters
408 |         ----------
409 |         X : array-like, shape=[n_samples, n_features]
410 |             Samples to cluster.
411 | 
412 |         y : Ignored
413 | 
414 |         """
415 |         X = check_array(X)
416 |         self.cluster_centers_, self.labels_ = \
417 |             mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds,
418 |                        min_bin_freq=self.min_bin_freq,
419 |                        bin_seeding=self.bin_seeding,
420 |                        cluster_all=self.cluster_all, n_jobs=self.n_jobs)
421 |         return self
422 | 
423 |     def predict(self, X):
424 |         """Predict the closest cluster each sample in X belongs to.
425 | 
426 |         Parameters
427 |         ----------
428 |         X : {array-like, sparse matrix}, shape=[n_samples, n_features]
429 |             New data to predict.
430 | 
431 |         Returns
432 |         -------
433 |         labels : array, shape [n_samples,]
434 |             Index of the cluster each sample belongs to.
435 |         """
436 |         check_is_fitted(self, "cluster_centers_")
437 | 
438 |         # Use the same Gower metric as fit, so predictions match the labels
439 |         return pairwise_distances_argmin(X, self.cluster_centers_, metric=gower_distances)
--------------------------------------------------------------------------------
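A minimal usage sketch (not a file of the repository): the doctest in the CustomMeanShift docstring above demonstrates scikit-learn's reference MeanShift, so the snippet below shows how the Gower-based variant itself would be driven. It assumes the pinned, older scikit-learn environment from requirements.txt (which still ships sklearn.cluster.mean_shift_), that the repository root is on the Python path so the module imports as util.mean_shift, and that the input follows the convention documented above: categorical features encoded as negative values, numeric features scaled to [0, 1]. The toy data and the hand-picked bandwidth are invented for illustration.

import numpy as np

from util.mean_shift import CustomMeanShift, gower_distances

# Hypothetical toy data: column 0 is numeric and scaled to [0, 1],
# column 1 is categorical, encoded as negative integers.
X = np.array([[0.10, -1.],
              [0.15, -1.],
              [0.12, -1.],
              [0.90, -2.],
              [0.85, -2.],
              [0.95, -2.]])

# Gower distance of a mixed pair: the mean of the numeric Manhattan
# distance and the categorical mismatch indicator, here
# (|0.10 - 0.90| + 1) / 2 = 0.9.
print(gower_distances(X[0], X[3]))

# A hand-picked bandwidth: any value between the within-group distances
# (at most ~0.05) and the between-group distances (at least ~0.85)
# separates the two blobs. Passing bandwidth=None would instead trigger
# the Gower-based estimate_bandwidth heuristic, which takes at least
# quadratic time in the number of samples.
clustering = CustomMeanShift(bandwidth=0.3).fit(X)
print(clustering.cluster_centers_)
print(clustering.labels_)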