├── models ├── __init__.py ├── power_law.py ├── gaussian_processor.py ├── feed_forward_nn.py ├── preact_resnet.py ├── conditioned_power_law.py ├── conditioned_power_law_v1.py ├── conditioned_janoschek.py ├── conditioned_mmf.py ├── conditioned_power_law_v2.py ├── conditioned_log_power.py └── breaking_power_law.py ├── test ├── __init__.py └── benchmark │ ├── __init__.py │ └── test_lcbench.py ├── benchmarks ├── __init__.py ├── benchmark.py ├── hyperbo.py ├── lcbench.py └── taskset.py ├── data_loader ├── __init__.py └── tabular_data_loader.py ├── dataset ├── __init__.py └── tabular_dataset.py ├── surrogate_models ├── __init__.py ├── dehb │ ├── __init__.py │ ├── dehb │ │ ├── README.md │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── bracket_manager.py │ │ ├── optimizers │ │ │ └── __init__.py │ │ └── __init__.py │ ├── requirements.txt │ ├── examples │ │ ├── imgs │ │ │ └── black-gray-box.png │ │ ├── 03_pytorch_mnist_hpo.py │ │ └── 00_interfacing_DEHB.ipynb │ ├── utils │ │ ├── run_dask_setup.sh │ │ ├── dask_scheduler.sh │ │ └── dask_workers.sh │ └── README.md ├── random_search.py ├── dragonfly.py └── asha.py ├── bash_scripts ├── nasbench_dataset_names.txt ├── submit.sh ├── lcbench_dataset_names.txt ├── pd1_dataset_names.txt ├── taskset_dataset_names.txt ├── experiment_run_array.sh └── experiment_array.moab ├── main_experiment.py ├── CITATION.cff ├── .gitignore ├── README.md ├── python_scripts ├── cifar10.py └── download_task_set_data.py ├── requirements.txt ├── framework.py ├── LICENSE └── plots └── debugging.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_loader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /surrogate_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /surrogate_models/dehb/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /surrogate_models/dehb/dehb/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bash_scripts/nasbench_dataset_names.txt: -------------------------------------------------------------------------------- 1 | cifar10 cifar100 ImageNet16-120 2 | -------------------------------------------------------------------------------- /surrogate_models/dehb/dehb/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .bracket_manager import SHBracketManager -------------------------------------------------------------------------------- /surrogate_models/dehb/dehb/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .de 
import DE, AsyncDE 2 | from .dehb import DEHB, DEHBBase 3 | -------------------------------------------------------------------------------- /bash_scripts/submit.sh: -------------------------------------------------------------------------------- 1 | ./experiment_run_array.sh "taskset" "/work/ws/nemo/fr_ak1206-learning_curve-0/power_law" 2 | 3 | -------------------------------------------------------------------------------- /surrogate_models/dehb/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.18.2 2 | loguru>=0.5.3 3 | dask>=2.27.0 4 | distributed>=2.27.0 5 | ConfigSpace>=0.4.16 -------------------------------------------------------------------------------- /surrogate_models/dehb/dehb/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizers import DE, AsyncDE 2 | from .optimizers import DEHB 3 | from .utils import SHBracketManager 4 | -------------------------------------------------------------------------------- /surrogate_models/dehb/examples/imgs/black-gray-box.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/machinelearningnuremberg/DPL/HEAD/surrogate_models/dehb/examples/imgs/black-gray-box.png -------------------------------------------------------------------------------- /bash_scripts/lcbench_dataset_names.txt: -------------------------------------------------------------------------------- 1 | blood-transfusion-service-center vehicle jungle_chess_2pcs_raw_endgame_complete kr-vs-kp cnae-9 numerai28.6 helena fabert kc1 KDDCup09_appetency car MiniBooNE Australian sylvine jannis albert phoneme mfeat-factors nomao jasmine credit-g dionis Amazon_employee_access christine APSFailure Fashion-MNIST volkert higgs bank-marketing shuttle adult connect-4 segment airlines covertype 2 | -------------------------------------------------------------------------------- 
class TabularDataset(Dataset):
    """Map-style dataset that serves four parallel containers in lockstep.

    Item ``idx`` is the tuple ``(X[idx], y[idx], budgets[idx], curves[idx])``,
    so all four containers must be indexable with the same indices.

    Args:
        X: Per-example inputs (presumably the hyperparameter configurations;
            confirm against the caller).
        y: Per-example targets; its length defines the dataset length.
        budgets: Per-example budgets.
        curves: Per-example learning curves.
    """

    def __init__(self, X, y, budgets, curves):
        self.X = X
        self.y = y
        self.budgets = budgets
        self.curves = curves

    def __len__(self):
        """Return the number of examples, i.e. the length of ``y``."""
        # `len()` counts entries along the first axis for lists, NumPy arrays
        # and torch tensors alike. `.size` is only an int on NumPy arrays (and
        # then counts *all* elements); on a tensor it is a bound method, which
        # makes `len(dataset)` raise a TypeError.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(X[idx], y[idx], budgets[idx], curves[idx])``."""
        return self.X[idx], self.y[idx], self.budgets[idx], self.curves[idx]
17 | sbatch utils/dask_workers.sh -f $filename -e $envname -w worker$i 18 | sleep 2 19 | done 20 | -------------------------------------------------------------------------------- /bash_scripts/pd1_dataset_names.txt: -------------------------------------------------------------------------------- 1 | imagenet_resnet_batch_size_512 uniref50_transformer_batch_size_128 translate_wmt_xformer_translate_batch_size_64 lm1b_transformer_batch_size_2048 imagenet_resnet_batch_size_256 mnist_max_pooling_cnn_tanh_batch_size_256 mnist_max_pooling_cnn_relu_batch_size_256 mnist_simple_cnn_batch_size_256 fashion_mnist_max_pooling_cnn_tanh_batch_size_256 fashion_mnist_max_pooling_cnn_relu_batch_size_256 fashion_mnist_simple_cnn_batch_size_256 svhn_no_extra_wide_resnet_batch_size_1024 svhn_no_extra_wide_resnet_batch_size_256 cifar100_wide_resnet_batch_size_256 cifar10_wide_resnet_batch_size_256 2 | -------------------------------------------------------------------------------- /surrogate_models/dehb/utils/dask_scheduler.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | #SBATCH -p cluster-name 4 | #SBATCH --gres=gpu:0 5 | #SBATCH --mem 0 6 | #SBATCH -c 2 7 | #SBATCH -J scheduler 8 | #SBATCH -t 6-00 9 | 10 | while getopts f:e: flag 11 | do 12 | case "${flag}" in 13 | f) filename=${OPTARG};; # specified as -f 14 | e) envname=${OPTARG};; # specified as -e 15 | esac 16 | done 17 | 18 | # setting up environment 19 | source $HOME/anaconda3/bin/activate $envname 20 | 21 | # Creating a Dask scheduler 22 | PYTHONPATH=$PWD dask-scheduler --scheduler-file $filename 23 | 24 | # for more options: https://docs.dask.org/en/latest/setup/cli.html#dask-scheduler 25 | -------------------------------------------------------------------------------- /bash_scripts/taskset_dataset_names.txt: -------------------------------------------------------------------------------- 1 | FixedTextRNNClassification_imdb_patch32_GRU64_avg_bs128 FixedTextRNNClassification_imdb_patch32_GRU128_bs128 FixedTextRNNClassification_imdb_patch32_IRNN64_relu_avg_bs128 FixedTextRNNClassification_imdb_patch32_IRNN64_relu_last_bs128 FixedTextRNNClassification_imdb_patch32_LSTM128_bs128 FixedTextRNNClassification_imdb_patch32_LSTM128_E128_bs128 FixedTextRNNClassification_imdb_patch32_VRNN64_relu_avg_bs128 FixedTextRNNClassification_imdb_patch32_VRNN64_tanh_avg_bs128 FixedTextRNNClassification_imdb_patch32_VRNN128_tanh_bs128 FixedTextRNNClassification_imdb_patch128_LSTM128_avg_bs64 FixedTextRNNClassification_imdb_patch128_LSTM128_bs64 FixedTextRNNClassification_imdb_patch128_LSTM128_embed128_bs64 2 | -------------------------------------------------------------------------------- /surrogate_models/dehb/utils/dask_workers.sh: -------------------------------------------------------------------------------- 1 | #! 
#!/bin/bash
#
# Submit one 10-task Moab array job per dataset of the selected benchmark.
#
# Usage: ./experiment_run_array.sh <benchmark> <output_dir>
#   benchmark   lcbench | taskset | nasbench201; any other value selects pd1
#   output_dir  exported as $dir for experiment_array.moab
#
# The dataset-name files and experiment_array.moab are resolved relative to
# the current working directory, so run this script from its own folder.

export benchmark="$1"

case "$benchmark" in
    lcbench)     file="lcbench_dataset_names.txt" ;;
    taskset)     file="taskset_dataset_names.txt" ;;
    nasbench201) file="nasbench_dataset_names.txt" ;;
    *)           file="pd1_dataset_names.txt" ;;  # historical default
esac

export dir="$2"

if ! [ -e "$file" ]; then
    echo "$0: $file does not exist" >&2  # report on stderr, naming the script
    exit 1
fi

# Dataset names are whitespace-separated, so word splitting is intentional.
# Globbing is switched off so a name is never expanded as a filename pattern.
set -f
for NAME in $(<"$file"); do
    # msub -V forwards the exported benchmark/dir/dataset variables to the job.
    export dataset="$NAME"
    msub -V -t 1-10 experiment_array.moab
done
set +f
class PowerLaw(torch.nn.Module):
    """Three-parameter power-law curve ``alpha + beta * x ** (-gamma)``.

    Each of the scalar parameters is drawn uniformly from ``[0, 1)`` at
    construction and is passed through a LeakyReLU before it enters the
    formula.
    """

    def __init__(self):
        super().__init__()
        # Registration (and random-draw) order is alpha, beta, gamma.
        for param_name in ('alpha', 'beta', 'gamma'):
            self.register_parameter(
                param_name,
                torch.nn.Parameter(torch.rand(())),
            )

        self.act_func = torch.nn.LeakyReLU()

    def forward(self, x):
        """Evaluate the power law at ``x``.

        Args:
            x: The values at which the curve is evaluated.

        Returns:
            ``act(alpha) + act(beta) * x ** (-act(gamma))``, where ``act`` is
            the module's LeakyReLU.
        """
        offset = self.act_func(self.alpha)
        scale = self.act_func(self.beta)
        decay = self.act_func(self.gamma)

        return offset + scale * torch.pow(x, -decay)
class TestLCBench(unittest.TestCase):
    """Tests for the LCBench benchmark wrapper.

    They need the LCBench data at ``<repo root>/lc_bench/results/data_2k.json``
    and are skipped when that file is not available.
    """

    def setUp(self) -> None:
        """Load the benchmark from the repository-local LCBench data file."""
        # This file lives in <repo root>/test/benchmark/, so the repository
        # root is three directory levels up. Resolving it this way avoids a
        # path that only exists on one developer's machine.
        project_folder = os.path.dirname(
            os.path.dirname(
                os.path.dirname(os.path.abspath(__file__))
            )
        )

        benchmark_data_path = os.path.join(
            project_folder,
            'lc_bench',
            'results',
            'data_2k.json',
        )
        if not os.path.isfile(benchmark_data_path):
            self.skipTest(f'LCBench data not found at {benchmark_data_path}')

        self.lcbench = LCBench(benchmark_data_path)
        self.dataset_name = 'credit-g'

    def test_load_dataset_names(self):
        """The benchmark exposes exactly these dataset names, in this order."""
        dataset_names = [
            'APSFailure', 'Amazon_employee_access', 'Australian',
            'Fashion-MNIST', 'KDDCup09_appetency', 'MiniBooNE',
            'adult', 'airlines', 'albert', 'bank-marketing',
            'blood-transfusion-service-center', 'car', 'christine',
            'cnae-9', 'connect-4', 'covertype', 'credit-g', 'dionis',
            'fabert', 'helena', 'higgs', 'jannis', 'jasmine',
            'jungle_chess_2pcs_raw_endgame_complete', 'kc1', 'kr-vs-kp',
            'mfeat-factors', 'nomao', 'numerai28.6', 'phoneme', 'segment',
            'shuttle', 'sylvine', 'vehicle', 'volkert',
        ]

        self.assertEqual(dataset_names, self.lcbench.dataset_names)

    def test_get_hyperparameter_candidates(self):
        """The candidate matrix has one row per configuration and one column per parameter."""
        hp_configs = self.lcbench.get_hyperparameter_candidates(self.dataset_name)
        self.assertEqual(
            hp_configs.shape,
            (LCBench.nr_hyperparameters, len(LCBench.param_space)),
        )

    def test_get_performance(self):
        """The first configuration scores at least as high at the maximal budget as at budget 1."""
        # NOTE(review): presumably the reported metric is higher-is-better;
        # confirm against LCBench.get_performance.
        hp_index = 0
        self.assertGreaterEqual(
            self.lcbench.get_performance(self.dataset_name, hp_index, LCBench.max_budget),
            self.lcbench.get_performance(self.dataset_name, hp_index, 1),
        )
class NN(nn.Module):
    """Feed-forward network of Linear -> BatchNorm1d -> LeakyReLU blocks.

    Dropout is applied after every hidden block, and a final linear layer
    produces the output. Layers are stored as ``fc1 .. fc{nr_layers + 1}``
    and ``bn1 .. bn{nr_layers}``.
    """

    def __init__(
        self,
        nr_initial_features=10,
        nr_units=200,
        nr_layers=3,
        dropout_fraction=0.2,
        nr_classes=1,
    ):
        """
        Args:
            nr_initial_features: int
                Number of input features per example.
            nr_units: int
                Width of every hidden layer.
            nr_layers: int
                Number of hidden layers.
            dropout_fraction: float
                Dropout probability used between layers.
            nr_classes: int
                Number of output units.
        """
        super().__init__()
        self.nr_layers = nr_layers

        # Input block.
        self.fc1 = nn.Linear(nr_initial_features, nr_units)
        self.bn1 = nn.BatchNorm1d(nr_units)

        # Remaining hidden blocks, registered under numbered names.
        for layer_idx in range(2, nr_layers + 1):
            self.add_module(f'fc{layer_idx}', nn.Linear(nr_units, nr_units))
            self.add_module(f'bn{layer_idx}', nn.BatchNorm1d(nr_units))

        # Output layer.
        self.add_module(f'fc{nr_layers + 1}', nn.Linear(nr_units, nr_classes))

        self.dropout = nn.Dropout(p=dropout_fraction)
        self.last_act_func = torch.nn.LeakyReLU()

    def _hidden_block(self, layer_idx, x):
        """Apply the ``layer_idx``-th Linear -> BatchNorm1d -> LeakyReLU block."""
        linear = getattr(self, f'fc{layer_idx}')
        batch_norm = getattr(self, f'bn{layer_idx}')
        return self.last_act_func(batch_norm(linear(x)))

    def forward(self, x):
        """Flatten each example and run it through the network.

        Args:
            x: Input batch; all non-batch dimensions are flattened.

        Returns:
            The output of the final linear layer.
        """
        x = x.view(-1, self.num_flat_features(x))

        x = self._hidden_block(1, x)
        for layer_idx in range(2, self.nr_layers + 1):
            x = self._hidden_block(layer_idx, self.dropout(x))

        output_layer = getattr(self, f'fc{self.nr_layers + 1}')
        return output_layer(self.dropout(x))

    def num_flat_features(self, x):
        """Return the number of elements per example (batch dimension excluded)."""
        return x.size()[1:].numel()
# Number of repetitions (seeds) available; `--index` selects one of them.
NR_SEEDS = 10


def build_parser():
    """Create the command-line parser for the DPL publication experiments."""
    parser = argparse.ArgumentParser(
        description='DPL publication experiments.',
    )
    parser.add_argument(
        '--index',
        type=int,
        default=1,
        help='The worker index. Every worker runs the same experiment, however, with a different seed.',
    )
    parser.add_argument(
        '--fantasize_step',
        type=int,
        default=1,
        # Adjacent literals are concatenated verbatim, hence the trailing space.
        help='The step used in fantasizing the next learning curve value from the last '
             'observed one for a certain hyperparameter configuration.',
    )
    parser.add_argument(
        '--budget_limit',
        type=int,
        default=1000,
        help='The maximal number of HPO iterations.',
    )
    parser.add_argument(
        '--ensemble_size',
        type=int,
        default=5,
        help='The ensemble size for the DPL surrogate.',
    )
    parser.add_argument(
        '--nr_epochs',
        type=int,
        default=250,
        help='The number of epochs used to train (not refine) the HPO surrogate.',
    )
    parser.add_argument(
        '--dataset_name',
        type=str,
        default='credit-g',
        help='The name of the dataset used in the experiment. '
             'The dataset names must be matched with the benchmark they belong to.',
    )
    parser.add_argument(
        '--benchmark_name',
        type=str,
        default='lcbench',
        help='The name of the benchmark used in the experiment. '
             'Every benchmark offers its own distinctive datasets. '
             'Available options are lcbench, taskset and pd1.',
    )
    parser.add_argument(
        '--surrogate_name',
        type=str,
        default='power_law',
        help='The method that will be run.',
    )
    parser.add_argument(
        '--project_dir',
        type=str,
        default='.',
        help='The directory where the project files are located.',
    )
    parser.add_argument(
        '--output_dir',
        type=str,
        default='./output',
        help='The directory where the project output files will be stored.',
    )

    return parser


def seed_for_index(index):
    """Map a 1-based worker index to its seed.

    Args:
        index: Worker index in ``1 .. NR_SEEDS``.

    Returns:
        The seed ``index - 1``, taken from ``np.arange(NR_SEEDS)``.

    Raises:
        ValueError: If ``index`` is outside ``1 .. NR_SEEDS``. Without this
            check an index of 0 or below would wrap around and silently reuse
            the seed of another worker.
    """
    if not 1 <= index <= NR_SEEDS:
        raise ValueError(
            f'--index must be between 1 and {NR_SEEDS}, got {index}'
        )

    seeds = np.arange(NR_SEEDS)
    return seeds[index - 1]


def main(argv=None):
    """Parse the command line and run a single experiment repetition.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        seed = seed_for_index(args.index)
    except ValueError as err:
        # Report an invalid index as a usage error (exit status 2).
        parser.error(str(err))

    framework = Framework(args, seed)
    framework.run()


if __name__ == '__main__':
    main()
3 | authors: 4 | - family-names: Kadra 5 | given-names: Arlind 6 | orcid: "https://orcid.org/0000-0001-9308-6576" 7 | - family-names: Janowski 8 | given-names: Maciej 9 | - family-names: Wistuba 10 | given-names: Martin 11 | - family-names: Grabocka 12 | given-names: Josif 13 | cff-version: 1.2.0 14 | date-released: "2023-02-01" 15 | identifiers: 16 | - type: url 17 | value: "https://github.com/releaunifreiburg/DPL/" 18 | description: Latest version 19 | keywords: 20 | - scaling laws 21 | - power law 22 | - hyperparameter optimization 23 | - gray-box hyperparameter optimization 24 | - multi-fidelity hyperparameter optimization 25 | license: Apache-2.0 26 | message: If you use DPL in your project, please cite our paper. 27 | repository-code: "https://github.com/releaunifreiburg/DPL/" 28 | preferred-citation: 29 | title: "Scaling Laws for Hyperparameter Optimization" 30 | type: conference-paper 31 | authors: 32 | - family-names: Kadra 33 | given-names: Arlind 34 | - family-names: Janowski 35 | given-names: Maciej 36 | - family-names: Wistuba 37 | given-names: Martin 38 | - family-names: Grabocka 39 | given-names: Josif 40 | collection-title: "Thirty-seventh Conference on Neural Information Processing Systems" # booktitle 41 | collection-type: "proceedings" 42 | year: 2023 43 | url: "https://openreview.net/forum?id=ghzEUGfRMD" 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually 
these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
class RandomOptimizer:
    """Random-search baseline over a fixed pool of hyperparameter configurations.

    Every call to `suggest` draws, without replacement, one configuration
    that has not been suggested before.
    """

    def __init__(
        self,
        hyperparameter_candidates: np.ndarray,
        max_budget: int = 52,
        seed: int = 0,
        max_nr_trials=1000,
        **kwargs,
    ):
        """
        Wrapper for the Random search algorithm.

        Args:
        -----
        hyperparameter_candidates: np.ndarray
            2d array which contains all possible configurations which can be queried.
        max_budget: int
            The number of max epochs used during the HPO optimization.
        seed: int
            Seed used to reproduce the experiments.
        max_nr_trials: int
            The total runtime budget, given as the number of epochs spent during HPO.
        **kwargs:
            Additional arguments; stored in `extra_args` and otherwise unused here.
        """
        self.hyperparameter_candidates = hyperparameter_candidates
        # Private generator: suggestions depend only on `seed`, not on whatever
        # else consumes or reseeds the global NumPy generator.
        self.rng = np.random.RandomState(seed)
        # Kept for backward compatibility with code that relies on the global
        # NumPy generator being seeded by this constructor.
        np.random.seed(seed)
        self.evaluated_configurations = set()
        self.max_budget = max_budget
        self.max_trials = max_nr_trials
        self.extra_args = kwargs

    def suggest(self) -> Tuple[int, int]:
        """
        Get information about the next configuration.

        Returns:
        ________
        next_conf, conf_budget: tuple
            A tuple that contains information about the next
            configuration (index in the hyperparameter_candidates it was
            given) and the budget for the hyperparameter to be evaluated
            on.

        Raises:
        -------
        ValueError
            If every configuration has already been suggested.
        """
        nr_candidates = self.hyperparameter_candidates.shape[0]
        # Built in ascending order so the draw is deterministic for a given seed.
        remaining = [
            index for index in range(nr_candidates)
            if index not in self.evaluated_configurations
        ]
        if not remaining:
            raise ValueError(
                'All hyperparameter configurations have already been suggested.'
            )

        config_index = int(self.rng.choice(remaining))
        self.evaluated_configurations.add(config_index)

        # The fidelity can never exceed the total trial budget.
        budget = min(self.max_budget, self.max_trials)

        return config_index, budget

    def observe(
        self,
        hp_index: int,
        budget: int,
        learning_curve: List[float],
    ):
        """
        Respond regarding the performance of a
        hyperparameter configuration. suggest should
        be called first to retrieve the configuration.

        Random search does not learn from observations, so this is a no-op.

        Args:
        -----
        hp_index: int
            The index of the evaluated hyperparameter configuration.
        budget: int
            The budget for which the hyperparameter configuration was evaluated.
        learning_curve: np.ndarray, list
            validation accuracy curve. The last value is the same as the score.
        """
        pass
In this work, we propose Deep Power Law (DPL), a neural network model conditioned to yield predictions that follow a power-law scaling pattern. Our model dynamically decides which configurations to pause and train incrementally by making use of multi-fidelity estimation. We compare our method against 7 state-of-the-art competitors on 3 benchmarks related to tabular, image, and NLP datasets covering 57 diverse search spaces. Our method achieves the best results across all benchmarks by obtaining the best any-time results compared to all competitors. 4 | 5 | Authors: Arlind Kadra, Maciej Janowski, Martin Wistuba, Josif Grabocka 6 | 7 | 8 | ## Setting up the virtual environment 9 | 10 | ``` 11 | # The following commands assume the user is in the cloned directory 12 | conda create -n dpl python=3.8 13 | conda activate dpl 14 | cat requirements.txt | xargs -n 1 -L 1 pip install 15 | ``` 16 | 17 | ## Add the LCBench code & data 18 | 19 | 20 | Copy the contents of `https://github.com/automl/LCBench` into a folder `lc_bench` in the root DPL repo. 21 | 22 | From `https://figshare.com/projects/LCBench/74151` download `data_2k.zip` and extract the json file into `DPL/lc_bench/results/data_2k.json`. 23 | 24 | ## Running the Deep Power Laws (DPL) code 25 | 26 | The entry script to running the experiment is `main_experiment.py`. The module can be used to start a full HPO search. 27 | 28 | The main arguments for `main_experiment.py` are: 29 | 30 | - `--index`: The worker index. Every worker runs the same experiment, however, with a different seed. 31 | - `--fantasize_step`: The step used in fantasizing the next learning curve value from the last observed one for a certain hyperparameter configuration. 32 | - `--budget_limit`: The maximal number of HPO iterations. 33 | - `--ensemble_size`: The ensemble size for the DPL surrogate. 34 | - `--nr_epochs`: The number of epochs used to train (not refine) the HPO surrogate. 
35 | - `--dataset_name`: The name of the dataset used in the experiment. The dataset names must be matched with the benchmark they belong to. 36 | - `--benchmark_name`: The name of the benchmark used in the experiment. Every benchmark offers its own distinctive datasets. Available options are lcbench, taskset and pd1. 37 | - `--surrogate_name`: The method that will be run. 38 | - `--project_dir`: The directory where the project files are located. 39 | - `--output_dir`: The directory where the project output files will be stored. 40 | 41 | **A minimal example of running DPL**: 42 | 43 | ``` 44 | python main_experiment.py --index 1 --fantasize_step 1 --budget_limit 1000 --ensemble_size 5 --nr_epochs 250 --dataset_name "credit-g" --benchmark_name "lcbench" --surrogate_name "power_law" --project_dir "." --output_dir "." 45 | 46 | ``` 47 | 48 | The example above will run the first repetition (pertaining to the first seed) for an HPO budget of 1000 trials. It will use the dataset credit-g from the lcbench benchmark. 49 | The experiment will run the power law surrogate with an ensemble size of 5 members, where each hyperparameter configuration selected by the acquisition function is run for 1 more step. 50 | In the beginning and every time that the training procedure is restarted, the models will be trained for 250 epochs. The script will consider the current folder as the project folder and it 51 | will save the output files in the current folder. 52 | 53 | ## Plots 54 | 55 | The plots that are included in our paper were generated from the functions in the module `plots/normalized_regret.py`. 
class PreActBlock(nn.Module):
    '''Pre-activation version of the BasicBlock.

    Applies BN -> ReLU -> 3x3 conv -> dropout -> BN -> ReLU -> 3x3 conv and
    adds the shortcut branch to the result.

    Args:
        in_planes: number of input channels.
        planes: number of output channels of both convolutions.
        stride: stride of the first convolution (and of the shortcut conv).
        dropout_rate: probability passed to the Dropout2d layer.
    '''
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, dropout_rate=0.2):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.dropout = nn.Dropout2d(p=dropout_rate)

        # A projection shortcut is only created when the identity cannot be
        # added directly (spatial size or channel count changes).
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        '''Return the block output for the feature map `x`.'''
        out = F.relu(self.bn1(x))
        # The projection consumes the pre-activated tensor; the identity
        # path uses the raw input.
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.dropout(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out += shortcut
        return out


class PreActBottleneck(nn.Module):
    '''Pre-activation version of the original Bottleneck module.

    Applies 1x1 conv -> dropout -> 3x3 conv -> dropout -> 1x1 conv, each
    convolution preceded by BN + ReLU, and adds the shortcut branch.

    Args:
        in_planes: number of input channels.
        planes: number of channels of the inner convolutions; the block
            outputs `expansion * planes` channels.
        stride: stride of the 3x3 convolution (and of the shortcut conv).
        dropout_rate: probability passed to the Dropout2d layer.
    '''
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, dropout_rate=0.2):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False)
        self.dropout = nn.Dropout2d(p=dropout_rate)

        # Projection shortcut only when shapes differ between input and output.
        if stride != 1 or in_planes != self.expansion * planes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False)
            )

    def forward(self, x):
        '''Return the block output for the feature map `x`.'''
        out = F.relu(self.bn1(x))
        shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x
        out = self.conv1(out)
        out = self.dropout(out)
        out = self.conv2(F.relu(self.bn2(out)))
        out = self.dropout(out)
        out = self.conv3(F.relu(self.bn3(out)))
        out += shortcut
        return out


class PreActResNet(nn.Module):
    '''Pre-activation ResNet built from four stages of `block` modules.

    Args:
        block: block class exposing an `expansion` attribute
            (PreActBlock or PreActBottleneck).
        num_blocks: sequence with the number of blocks in each of the
            four stages.
        drop_rate: dropout probability forwarded to every block.
        num_classes: size of the final linear layer's output.
    '''

    def __init__(self, block, num_blocks, drop_rate=0.2, num_classes=10):
        super().__init__()
        self.in_planes = 64
        self.drop_rate = drop_rate
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        '''Stack `num_blocks` blocks; only the first one uses `stride`.'''
        layers = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            layers.append(block(self.in_planes, planes, block_stride, self.drop_rate))
            # The next block consumes what this block produced.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        '''Return the class logits for the image batch `x`.'''
        out = self.conv1(x)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Three stride-2 stages reduce a 32x32 input to 4x4, which this fixed
        # pooling window collapses to 1x1. NOTE(review): other input sizes
        # would not match the linear layer -- confirm callers use 32x32.
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


# All factories share one signature so that the dropout rate and the number
# of classes can be configured for every depth. The defaults equal the
# PreActResNet constructor defaults, so existing calls behave as before.
def PreActResNet18(drop_rate: float = 0.2, num_classes: int = 10):
    '''Build the 18-layer variant (basic blocks, [2, 2, 2, 2]).'''
    return PreActResNet(PreActBlock, [2, 2, 2, 2], drop_rate, num_classes)


def PreActResNet34(drop_rate: float = 0.2, num_classes: int = 10):
    '''Build the 34-layer variant (basic blocks, [3, 4, 6, 3]).'''
    return PreActResNet(PreActBlock, [3, 4, 6, 3], drop_rate, num_classes)


def PreActResNet50(drop_rate: float = 0.2, num_classes: int = 10):
    '''Build the 50-layer variant (bottleneck blocks, [3, 4, 6, 3]).'''
    return PreActResNet(PreActBottleneck, [3, 4, 6, 3], drop_rate, num_classes)


def PreActResNet101(drop_rate: float = 0.2, num_classes: int = 10):
    '''Build the 101-layer variant (bottleneck blocks, [3, 4, 23, 3]).'''
    return PreActResNet(PreActBottleneck, [3, 4, 23, 3], drop_rate, num_classes)


def PreActResNet152(drop_rate: float = 0.2, num_classes: int = 10):
    '''Build the 152-layer variant (bottleneck blocks, [3, 8, 36, 3]).'''
    return PreActResNet(PreActBottleneck, [3, 8, 36, 3], drop_rate, num_classes)
29 | nr_filters: int 30 | The number of filters that are used in the cnn layers. 31 | nr_cnn_layers: int 32 | The number of cnn layers to be used. 33 | """ 34 | super(ConditionedPowerLaw, self).__init__() 35 | 36 | self.use_learning_curve = use_learning_curve 37 | self.kernel_size = kernel_size 38 | self.nr_filters = nr_filters 39 | self.nr_cnn_layers = nr_cnn_layers 40 | 41 | self.act_func = torch.nn.LeakyReLU() 42 | self.last_act_func = torch.nn.GLU() 43 | self.tan_func = torch.nn.Tanh() 44 | self.batch_norm = torch.nn.BatchNorm1d 45 | 46 | layers = [] 47 | # adding one since we concatenate the features with the budget 48 | nr_initial_features = nr_initial_features 49 | if self.use_learning_curve: 50 | nr_initial_features = nr_initial_features + nr_filters 51 | 52 | layers.append(nn.Linear(nr_initial_features, nr_units)) 53 | layers.append(self.act_func) 54 | 55 | for i in range(2, nr_layers + 1): 56 | layers.append(nn.Linear(nr_units, nr_units)) 57 | layers.append(self.act_func) 58 | 59 | last_layer = nn.Linear(nr_units, 3) 60 | layers.append(last_layer) 61 | 62 | self.layers = torch.nn.Sequential(*layers) 63 | 64 | cnn_part = [] 65 | if use_learning_curve: 66 | cnn_part.append( 67 | nn.Conv1d( 68 | in_channels=2, 69 | kernel_size=(self.kernel_size,), 70 | out_channels=self.nr_filters, 71 | ), 72 | ) 73 | for i in range(1, self.nr_cnn_layers): 74 | cnn_part.append(self.act_func) 75 | cnn_part.append( 76 | nn.Conv1d( 77 | in_channels=self.nr_filters, 78 | kernel_size=(self.kernel_size,), 79 | out_channels=self.nr_filters, 80 | ), 81 | ), 82 | cnn_part.append(nn.AdaptiveAvgPool1d(1)) 83 | 84 | self.cnn = nn.Sequential(*cnn_part) 85 | 86 | def forward( 87 | self, 88 | x: torch.Tensor, 89 | predict_budgets: torch.Tensor, 90 | evaluated_budgets: torch.Tensor, 91 | learning_curves: torch.Tensor, 92 | ): 93 | """ 94 | Args: 95 | x: torch.Tensor 96 | The examples. 
class ConditionedPowerLawV1(nn.Module):
    """
    Feed-forward network whose four outputs (a, b, c, d) parameterize the
    curve `a + b * relu(budget + d) ** (-c)`.
    """

    def __init__(
        self,
        nr_initial_features=10,
        nr_units=200,
        nr_layers=3,
        use_learning_curve: bool = True,
        kernel_size: int = 3,
        nr_filters: int = 4,
        nr_cnn_layers: int = 2,
    ):
        """
        Args:
            nr_initial_features: int
                The number of features per example.
            nr_units: int
                The number of units for every layer.
            nr_layers: int
                The number of layers for the neural network.
            use_learning_curve: bool
                If the learning curve should be use in the network.
            kernel_size: int
                The size of the kernel that is applied in the cnn layer.
            nr_filters: int
                The number of filters that are used in the cnn layers.
            nr_cnn_layers: int
                The number of cnn layers to be used.
        """
        # The zero-argument form avoids naming a class: the previous call
        # referenced `ConditionedPowerLaw`, which is not defined in this
        # module and raised a NameError on construction.
        super().__init__()

        self.use_learning_curve = use_learning_curve
        self.kernel_size = kernel_size
        self.nr_filters = nr_filters
        self.nr_cnn_layers = nr_cnn_layers

        self.act_func = torch.nn.LeakyReLU()
        self.relu_func = torch.nn.ReLU()
        # The three attributes below are not used by forward(); they are
        # kept so the module's attribute set stays unchanged.
        self.last_act_func = torch.nn.GLU()
        self.tan_func = torch.nn.Tanh()
        self.batch_norm = torch.nn.BatchNorm1d

        # The learning-curve embedding (one value per cnn filter) is
        # concatenated to the raw features before the first linear layer.
        if self.use_learning_curve:
            nr_initial_features = nr_initial_features + nr_filters

        layers = [nn.Linear(nr_initial_features, nr_units), self.act_func]
        for _ in range(nr_layers - 1):
            layers.append(nn.Linear(nr_units, nr_units))
            layers.append(self.act_func)

        # One output per curve coefficient: a, b, c, d.
        last_layer = nn.Linear(nr_units, 4)
        layers.append(last_layer)
        # Start from fixed coefficient biases instead of the default init.
        with torch.no_grad():
            last_layer.bias.data = torch.Tensor([0.1, 1, 0.2, 0.2])

        self.layers = torch.nn.Sequential(*layers)

        cnn_part = []
        if use_learning_curve:
            cnn_part.append(
                nn.Conv1d(
                    in_channels=2,
                    kernel_size=(self.kernel_size,),
                    out_channels=self.nr_filters,
                ),
            )
            for _ in range(self.nr_cnn_layers - 1):
                cnn_part.append(self.act_func)
                cnn_part.append(
                    nn.Conv1d(
                        in_channels=self.nr_filters,
                        kernel_size=(self.kernel_size,),
                        out_channels=self.nr_filters,
                    ),
                )
            # Collapse the temporal axis to a single value per filter.
            cnn_part.append(nn.AdaptiveAvgPool1d(1))

        self.cnn = nn.Sequential(*cnn_part)

    def forward(
        self,
        x: torch.Tensor,
        predict_budgets: torch.Tensor,
        evaluated_budgets: torch.Tensor,
        learning_curves: torch.Tensor,
    ):
        """
        Args:
            x: torch.Tensor
                The examples.
            predict_budgets: torch.Tensor
                The budgets for which the performance will be predicted for the
                hyperparameter configurations.
            evaluated_budgets: torch.Tensor
                The budgets for which the hyperparameter configurations have been
                evaluated so far. Not used by this method.
            learning_curves: torch.Tensor
                The learning curves for the hyperparameter configurations.
                Only read when use_learning_curve is True; the first cnn
                layer expects 2 input channels.

        Returns:
            torch.Tensor with one prediction per row of x.
        """
        if self.use_learning_curve:
            lc_features = self.cnn(learning_curves)
            # revert the output from the cnn into nr_rows x nr_kernels.
            lc_features = torch.squeeze(lc_features, 2)
            x = torch.cat((x, lc_features), dim=1)

        x = self.layers(x)
        a = x[:, 0]
        b = x[:, 1]
        c = x[:, 2]
        d = x[:, 3]

        # relu keeps the base of the power non-negative.
        # NOTE(review): a base of exactly 0 with c > 0 yields inf -- confirm
        # whether callers rely on budgets + d staying positive.
        shifted_budgets = self.relu_func(torch.add(predict_budgets, d))

        # a + b * (budget + d)^(-c)
        output = torch.add(a, torch.mul(b, torch.pow(shifted_budgets, -c)))

        return output
27 | kernel_size: int 28 | The size of the kernel that is applied in the cnn layer. 29 | nr_filters: int 30 | The number of filters that are used in the cnn layers. 31 | nr_cnn_layers: int 32 | The number of cnn layers to be used. 33 | """ 34 | super(ConditionedJanoschek, self).__init__() 35 | 36 | self.use_learning_curve = use_learning_curve 37 | self.kernel_size = kernel_size 38 | self.nr_filters = nr_filters 39 | self.nr_cnn_layers = nr_cnn_layers 40 | 41 | self.act_func = torch.nn.LeakyReLU() 42 | self.last_act_func = torch.nn.GLU() 43 | self.tan_func = torch.nn.Tanh() 44 | self.batch_norm = torch.nn.BatchNorm1d 45 | 46 | layers = [] 47 | # adding one since we concatenate the features with the budget 48 | nr_initial_features = nr_initial_features 49 | if self.use_learning_curve: 50 | nr_initial_features = nr_initial_features + nr_filters 51 | 52 | layers.append(nn.Linear(nr_initial_features, nr_units)) 53 | layers.append(self.act_func) 54 | 55 | for i in range(2, nr_layers + 1): 56 | layers.append(nn.Linear(nr_units, nr_units)) 57 | layers.append(self.act_func) 58 | 59 | last_layer = nn.Linear(nr_units, 4) 60 | layers.append(last_layer) 61 | 62 | self.layers = torch.nn.Sequential(*layers) 63 | 64 | cnn_part = [] 65 | if use_learning_curve: 66 | cnn_part.append( 67 | nn.Conv1d( 68 | in_channels=2, 69 | kernel_size=(self.kernel_size,), 70 | out_channels=self.nr_filters, 71 | ), 72 | ) 73 | for i in range(1, self.nr_cnn_layers): 74 | cnn_part.append(self.act_func) 75 | cnn_part.append( 76 | nn.Conv1d( 77 | in_channels=self.nr_filters, 78 | kernel_size=(self.kernel_size,), 79 | out_channels=self.nr_filters, 80 | ), 81 | ), 82 | cnn_part.append(nn.AdaptiveAvgPool1d(1)) 83 | 84 | self.cnn = nn.Sequential(*cnn_part) 85 | 86 | def forward( 87 | self, 88 | x: torch.Tensor, 89 | predict_budgets: torch.Tensor, 90 | evaluated_budgets: torch.Tensor, 91 | learning_curves: torch.Tensor, 92 | ): 93 | """ 94 | Args: 95 | x: torch.Tensor 96 | The examples. 
97 | predict_budgets: torch.Tensor 98 | The budgets for which the performance will be predicted for the 99 | hyperparameter configurations. 100 | evaluated_budgets: torch.Tensor 101 | The budgets for which the hyperparameter configurations have been 102 | evaluated so far. 103 | learning_curves: torch.Tensor 104 | The learning curves for the hyperparameter configurations. 105 | """ 106 | #x = torch.cat((x, torch.unsqueeze(evaluated_budgets, 1)), dim=1) 107 | if self.use_learning_curve: 108 | lc_features = self.cnn(learning_curves) 109 | # revert the output from the cnn into nr_rows x nr_kernels. 110 | lc_features = torch.squeeze(lc_features, 2) 111 | x = torch.cat((x, lc_features), dim=1) 112 | 113 | x = self.layers(x) 114 | alpha = x[:, 0] 115 | beta = x[:, 1] 116 | k = x[:, 2] 117 | delta = x[:, 3] 118 | 119 | # alpha - (alpha - beta) * e^-(k * x^delta) 120 | output = torch.sub( 121 | alpha, 122 | torch.mul( 123 | torch.sub( 124 | alpha, 125 | beta, 126 | ), 127 | torch.exp( 128 | torch.mul( 129 | -k, 130 | torch.pow( 131 | predict_budgets, 132 | delta, 133 | ), 134 | ) 135 | ) 136 | 137 | ) 138 | ) 139 | 140 | return output 141 | -------------------------------------------------------------------------------- /models/conditioned_mmf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class ConditionedMMF(nn.Module): 6 | 7 | def __init__( 8 | self, 9 | nr_initial_features=10, 10 | nr_units=200, 11 | nr_layers=3, 12 | use_learning_curve: bool = True, 13 | kernel_size: int = 3, 14 | nr_filters: int = 4, 15 | nr_cnn_layers: int = 2, 16 | ): 17 | """ 18 | Args: 19 | nr_initial_features: int 20 | The number of features per example. 21 | nr_units: int 22 | The number of units for every layer. 23 | nr_layers: int 24 | The number of layers for the neural network. 25 | use_learning_curve: bool 26 | If the learning curve should be use in the network. 
27 | kernel_size: int 28 | The size of the kernel that is applied in the cnn layer. 29 | nr_filters: int 30 | The number of filters that are used in the cnn layers. 31 | nr_cnn_layers: int 32 | The number of cnn layers to be used. 33 | """ 34 | super(ConditionedMMF, self).__init__() 35 | 36 | self.use_learning_curve = use_learning_curve 37 | self.kernel_size = kernel_size 38 | self.nr_filters = nr_filters 39 | self.nr_cnn_layers = nr_cnn_layers 40 | 41 | self.act_func = torch.nn.LeakyReLU() 42 | self.last_act_func = torch.nn.GLU() 43 | self.tan_func = torch.nn.Tanh() 44 | self.batch_norm = torch.nn.BatchNorm1d 45 | 46 | layers = [] 47 | # adding one since we concatenate the features with the budget 48 | nr_initial_features = nr_initial_features 49 | if self.use_learning_curve: 50 | nr_initial_features = nr_initial_features + nr_filters 51 | 52 | layers.append(nn.Linear(nr_initial_features, nr_units)) 53 | layers.append(self.act_func) 54 | 55 | for i in range(2, nr_layers + 1): 56 | layers.append(nn.Linear(nr_units, nr_units)) 57 | layers.append(self.act_func) 58 | 59 | last_layer = nn.Linear(nr_units, 4) 60 | layers.append(last_layer) 61 | 62 | self.layers = torch.nn.Sequential(*layers) 63 | 64 | cnn_part = [] 65 | if use_learning_curve: 66 | cnn_part.append( 67 | nn.Conv1d( 68 | in_channels=2, 69 | kernel_size=(self.kernel_size,), 70 | out_channels=self.nr_filters, 71 | ), 72 | ) 73 | for i in range(1, self.nr_cnn_layers): 74 | cnn_part.append(self.act_func) 75 | cnn_part.append( 76 | nn.Conv1d( 77 | in_channels=self.nr_filters, 78 | kernel_size=(self.kernel_size,), 79 | out_channels=self.nr_filters, 80 | ), 81 | ), 82 | cnn_part.append(nn.AdaptiveAvgPool1d(1)) 83 | 84 | self.cnn = nn.Sequential(*cnn_part) 85 | 86 | def forward( 87 | self, 88 | x: torch.Tensor, 89 | predict_budgets: torch.Tensor, 90 | evaluated_budgets: torch.Tensor, 91 | learning_curves: torch.Tensor, 92 | ): 93 | """ 94 | Args: 95 | x: torch.Tensor 96 | The examples. 
class ConditionedPowerLawV2(nn.Module):
    """
    Feed-forward network whose five outputs (a, b, c, d, e) parameterize the
    curve `a + e * relu(b * budget + d) ** (-c)`.
    """

    def __init__(
        self,
        nr_initial_features=10,
        nr_units=200,
        nr_layers=3,
        use_learning_curve: bool = True,
        kernel_size: int = 3,
        nr_filters: int = 4,
        nr_cnn_layers: int = 2,
    ):
        """
        Args:
            nr_initial_features: int
                The number of features per example.
            nr_units: int
                The number of units for every layer.
            nr_layers: int
                The number of layers for the neural network.
            use_learning_curve: bool
                If the learning curve should be use in the network.
            kernel_size: int
                The size of the kernel that is applied in the cnn layer.
            nr_filters: int
                The number of filters that are used in the cnn layers.
            nr_cnn_layers: int
                The number of cnn layers to be used.
        """
        # The zero-argument form avoids naming a class: the previous call
        # referenced `ConditionedPowerLaw`, which is not defined in this
        # module and raised a NameError on construction.
        super().__init__()

        self.use_learning_curve = use_learning_curve
        self.kernel_size = kernel_size
        self.nr_filters = nr_filters
        self.nr_cnn_layers = nr_cnn_layers

        self.act_func = torch.nn.LeakyReLU()
        self.relu_func = torch.nn.ReLU()
        # The three attributes below are not used by forward(); they are
        # kept so the module's attribute set stays unchanged.
        self.last_act_func = torch.nn.GLU()
        self.tan_func = torch.nn.Tanh()
        self.batch_norm = torch.nn.BatchNorm1d

        # The learning-curve embedding (one value per cnn filter) is
        # concatenated to the raw features before the first linear layer.
        if self.use_learning_curve:
            nr_initial_features = nr_initial_features + nr_filters

        layers = [nn.Linear(nr_initial_features, nr_units), self.act_func]
        for _ in range(nr_layers - 1):
            layers.append(nn.Linear(nr_units, nr_units))
            layers.append(self.act_func)

        # One output per curve coefficient: a, b, c, d, e.
        last_layer = nn.Linear(nr_units, 5)
        layers.append(last_layer)
        # Start from fixed coefficient biases instead of the default init.
        with torch.no_grad():
            last_layer.bias.data = torch.Tensor([0.1, 1, 0.2, 0.2, 0.4])

        self.layers = torch.nn.Sequential(*layers)

        cnn_part = []
        if use_learning_curve:
            cnn_part.append(
                nn.Conv1d(
                    in_channels=2,
                    kernel_size=(self.kernel_size,),
                    out_channels=self.nr_filters,
                ),
            )
            for _ in range(self.nr_cnn_layers - 1):
                cnn_part.append(self.act_func)
                cnn_part.append(
                    nn.Conv1d(
                        in_channels=self.nr_filters,
                        kernel_size=(self.kernel_size,),
                        out_channels=self.nr_filters,
                    ),
                )
            # Collapse the temporal axis to a single value per filter.
            cnn_part.append(nn.AdaptiveAvgPool1d(1))

        self.cnn = nn.Sequential(*cnn_part)

    def forward(
        self,
        x: torch.Tensor,
        predict_budgets: torch.Tensor,
        evaluated_budgets: torch.Tensor,
        learning_curves: torch.Tensor,
    ):
        """
        Args:
            x: torch.Tensor
                The examples.
            predict_budgets: torch.Tensor
                The budgets for which the performance will be predicted for the
                hyperparameter configurations.
            evaluated_budgets: torch.Tensor
                The budgets for which the hyperparameter configurations have been
                evaluated so far. Not used by this method.
            learning_curves: torch.Tensor
                The learning curves for the hyperparameter configurations.
                Only read when use_learning_curve is True; the first cnn
                layer expects 2 input channels.

        Returns:
            torch.Tensor with one prediction per row of x.
        """
        if self.use_learning_curve:
            lc_features = self.cnn(learning_curves)
            # revert the output from the cnn into nr_rows x nr_kernels.
            lc_features = torch.squeeze(lc_features, 2)
            x = torch.cat((x, lc_features), dim=1)

        x = self.layers(x)
        a = x[:, 0]
        b = x[:, 1]
        c = x[:, 2]
        d = x[:, 3]
        e = x[:, 4]

        # relu keeps the base of the power non-negative.
        # NOTE(review): a base of exactly 0 with c > 0 yields inf -- confirm
        # whether callers rely on b * budgets + d staying positive.
        scaled_budgets = self.relu_func(
            torch.add(torch.mul(b, predict_budgets), d)
        )

        # a + e * (b * budget + d)^(-c)
        output = torch.add(a, torch.mul(e, torch.pow(scaled_budgets, -c)))

        return output
25 | use_learning_curve: bool 26 | If the learning curve should be use in the network. 27 | kernel_size: int 28 | The size of the kernel that is applied in the cnn layer. 29 | nr_filters: int 30 | The number of filters that are used in the cnn layers. 31 | nr_cnn_layers: int 32 | The number of cnn layers to be used. 33 | """ 34 | super(ConditionedLogPower, self).__init__() 35 | 36 | self.use_learning_curve = use_learning_curve 37 | self.kernel_size = kernel_size 38 | self.nr_filters = nr_filters 39 | self.nr_cnn_layers = nr_cnn_layers 40 | 41 | self.act_func = torch.nn.LeakyReLU() 42 | self.last_act_func = torch.nn.GLU() 43 | self.tan_func = torch.nn.Tanh() 44 | self.batch_norm = torch.nn.BatchNorm1d 45 | 46 | layers = [] 47 | # adding one since we concatenate the features with the budget 48 | nr_initial_features = nr_initial_features 49 | if self.use_learning_curve: 50 | nr_initial_features = nr_initial_features + nr_filters 51 | 52 | layers.append(nn.Linear(nr_initial_features, nr_units)) 53 | layers.append(self.act_func) 54 | for i in range(2, nr_layers + 1): 55 | layers.append(nn.Linear(nr_units, nr_units)) 56 | layers.append(self.act_func) 57 | 58 | last_layer = nn.Linear(nr_units, 3) 59 | 60 | layers.append(last_layer) 61 | 62 | self.layers = torch.nn.Sequential(*layers) 63 | 64 | cnn_part = [] 65 | if use_learning_curve: 66 | cnn_part.append( 67 | nn.Conv1d( 68 | in_channels=2, 69 | kernel_size=(self.kernel_size,), 70 | out_channels=self.nr_filters, 71 | ), 72 | ) 73 | for i in range(1, self.nr_cnn_layers): 74 | cnn_part.append(self.act_func) 75 | cnn_part.append( 76 | nn.Conv1d( 77 | in_channels=self.nr_filters, 78 | kernel_size=(self.kernel_size,), 79 | out_channels=self.nr_filters, 80 | ), 81 | ), 82 | cnn_part.append(nn.AdaptiveAvgPool1d(1)) 83 | 84 | self.cnn = nn.Sequential(*cnn_part) 85 | 86 | def forward( 87 | self, 88 | x: torch.Tensor, 89 | predict_budgets: torch.Tensor, 90 | evaluated_budgets: torch.Tensor, 91 | learning_curves: torch.Tensor, 92 | 
): 93 | """ 94 | Args: 95 | x: torch.Tensor 96 | The examples. 97 | predict_budgets: torch.Tensor 98 | The budgets for which the performance will be predicted for the 99 | hyperparameter configurations. 100 | evaluated_budgets: torch.Tensor 101 | The budgets for which the hyperparameter configurations have been 102 | evaluated so far. 103 | learning_curves: torch.Tensor 104 | The learning curves for the hyperparameter configurations. 105 | """ 106 | #x = torch.cat((x, torch.unsqueeze(evaluated_budgets, 1)), dim=1) 107 | if self.use_learning_curve: 108 | lc_features = self.cnn(learning_curves) 109 | # revert the output from the cnn into nr_rows x nr_kernels. 110 | lc_features = torch.squeeze(lc_features, 2) 111 | x = torch.cat((x, lc_features), dim=1) 112 | 113 | x = self.layers(x) 114 | a = x[:, 0] 115 | b = x[:, 1] 116 | c = x[:, 2] 117 | 118 | # a divided by (1 + (x / e^b)^c) 119 | output = torch.div( 120 | a, 121 | torch.add( 122 | 1, 123 | torch.pow( 124 | torch.div( 125 | predict_budgets, 126 | torch.exp(b) 127 | ), 128 | c, 129 | ), 130 | ) 131 | ) 132 | 133 | return output 134 | 135 | @staticmethod 136 | def num_flat_features(x): 137 | size = x.size()[1:] # all dimensions except the batch dimension 138 | num_features = 1 139 | for s in size: 140 | num_features *= s 141 | return num_features 142 | 143 | @staticmethod 144 | def init_weights(m): 145 | if isinstance(m, nn.Linear): 146 | torch.nn.init.xavier_normal_(m.weight) 147 | -------------------------------------------------------------------------------- /benchmarks/hyperbo.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import math 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from syne_tune.blackbox_repository import load_blackbox 7 | from syne_tune.config_space import is_log_space 8 | 9 | from benchmarks.benchmark import BaseBenchmark 10 | 11 | 12 | class PD1(BaseBenchmark): 13 | 14 | def __init__(self, 
path_to_json_files: str, dataset_name: str, eta=3, number_of_brackets=3): 15 | 16 | super().__init__(path_to_json_files) 17 | self.dataset_name = dataset_name 18 | self.blackbox = self._load_benchmark() 19 | 20 | self.hp_names = None 21 | self.hp_candidates = self.blackbox[dataset_name].hyperparameters.to_numpy() 22 | 23 | # hyperparameter candidates, seed, fidelity, metrics 24 | # 0 is the validation train error 25 | self.validation_error_rate = self.blackbox[dataset_name].objectives_evaluations[:, :, :, 0] 26 | self.validation_error_rate = np.mean(self.validation_error_rate, axis=1) 27 | 28 | self.max_value = 0.0 29 | self.min_value = 1.0 30 | 31 | self.eta = eta 32 | self.number_of_brackets = number_of_brackets 33 | 34 | filtered_indices = self.filter_curves() 35 | self.max_budget = self.blackbox[dataset_name].num_fidelities 36 | # considering an eta of 3 37 | self.min_budget = int(self.max_budget / math.pow(self.eta, self.number_of_brackets)) 38 | self.min_budget = self.min_budget if self.min_budget > 0 else 1 39 | self.log_indicator = self.get_log_indicator() 40 | self.categorical_indicator = [False] * self.hp_candidates[1] 41 | self.validation_error_rate = self.validation_error_rate[filtered_indices] 42 | self.hp_candidates = self.hp_candidates[filtered_indices] 43 | self.nr_hyperparameters = self.validation_error_rate.shape[0] 44 | 45 | hp_names = list(self.blackbox[dataset_name].hyperparameters.columns) 46 | 47 | self.param_space = OrderedDict( 48 | [ 49 | (hp_names[i], [self.blackbox[dataset_name].hyperparameters[hp_names[i]].min(), self.blackbox[dataset_name].hyperparameters[hp_names[i]].max(), float, self.log_indicator[i]]) for i in range(len(hp_names)) 50 | ] 51 | ) 52 | 53 | def get_worst_performance(self): 54 | 55 | return np.amax(self.validation_error_rate) 56 | 57 | def _load_benchmark(self): 58 | 59 | return load_blackbox('pd1') 60 | 61 | @staticmethod 62 | def load_dataset_names(): 63 | 64 | dataset_names = [] 65 | enough_lc_points = 10 66 | 67 | 
def get_incumbent_config_index(self):
    """Return the index of the configuration with the lowest validation error.

    The incumbent is the row of ``self.validation_error_rate`` whose minimum
    over all fidelities is the smallest; on ties the first such row wins,
    matching a strict ``<`` scan from index 0.

    The curves are expected to be NaN-free (``__init__`` keeps only the rows
    selected by ``filter_curves``, which requires every entry to be non-null).

    Returns:
        int: The row index of the incumbent, or ``-1`` when there are no
        curves at all.
    """
    curves = np.asarray(self.validation_error_rate)
    if curves.shape[0] == 0:
        # Same sentinel the explicit loop produced when it never iterated.
        return -1

    # Best value reached by every configuration, then the first arg-min.
    return int(np.argmin(curves.min(axis=1)))
class BreakingPowerLaw(nn.Module):
    """Network whose six outputs parameterise a broken power law in the budget.

    A feed-forward head maps the input features (optionally concatenated with
    features a small 1-D CNN extracts from the learning curve) to six values
    ``a, b, c0, c1, d1, f1``. ``forward`` then evaluates

        a + glu(b) * budget ** (-glu(c0))
          * (1 + (budget / sigmoid(d1)) ** (1 / sigmoid(f1))) ** (-c1 * sigmoid(f1))

    where ``glu(v)`` is ``torch.nn.GLU`` applied to ``cat((v, v))``, which
    equals ``v * sigmoid(v)``.
    """

    def __init__(
        self,
        nr_initial_features=10,
        nr_units=200,
        nr_layers=3,
        use_learning_curve: bool = True,
        kernel_size: int = 3,
        nr_filters: int = 4,
        nr_cnn_layers: int = 2,
    ):
        """
        Args:
            nr_initial_features: int
                The number of features per example.
            nr_units: int
                The number of units for every layer.
            nr_layers: int
                The number of layers for the neural network.
            use_learning_curve: bool
                If the learning curve should be used in the network.
            kernel_size: int
                The size of the kernel that is applied in the cnn layer.
            nr_filters: int
                The number of filters that are used in the cnn layers.
            nr_cnn_layers: int
                The number of cnn layers to be used.
        """
        super().__init__()

        self.use_learning_curve = use_learning_curve
        self.kernel_size = kernel_size
        self.nr_filters = nr_filters
        self.nr_cnn_layers = nr_cnn_layers

        self.act_func = torch.nn.LeakyReLU()
        self.last_act_func = torch.nn.GLU()
        self.tan_func = torch.nn.Tanh()
        self.sigmoid_func = torch.nn.Sigmoid()
        self.batch_norm = torch.nn.BatchNorm1d

        # The CNN ends in a global average pool, so it contributes exactly one
        # extra input feature per filter.
        input_width = nr_initial_features
        if self.use_learning_curve:
            input_width += nr_filters

        # NOTE: the linear layers are created before the convolutions so that
        # seeded initialisation and state-dict keys stay the same as before.
        layers = [nn.Linear(input_width, nr_units), self.act_func]
        for _ in range(nr_layers - 1):
            layers.append(nn.Linear(nr_units, nr_units))
            layers.append(self.act_func)
        # One output per power-law parameter: a, b, c0, c1, d1, f1.
        layers.append(nn.Linear(nr_units, 6))
        self.layers = torch.nn.Sequential(*layers)

        cnn_part = []
        if use_learning_curve:
            cnn_part.append(
                nn.Conv1d(
                    in_channels=2,
                    kernel_size=(self.kernel_size,),
                    out_channels=self.nr_filters,
                )
            )
            for _ in range(self.nr_cnn_layers - 1):
                cnn_part.append(self.act_func)
                cnn_part.append(
                    nn.Conv1d(
                        in_channels=self.nr_filters,
                        kernel_size=(self.kernel_size,),
                        out_channels=self.nr_filters,
                    )
                )
            cnn_part.append(nn.AdaptiveAvgPool1d(1))

        self.cnn = nn.Sequential(*cnn_part)

    def forward(
        self,
        x: torch.Tensor,
        predict_budgets: torch.Tensor,
        evaluated_budgets: torch.Tensor,
        learning_curves: torch.Tensor,
    ):
        """
        Args:
            x: torch.Tensor
                The examples, one row per hyperparameter configuration.
            predict_budgets: torch.Tensor
                The budgets for which the performance will be predicted for the
                hyperparameter configurations.
            evaluated_budgets: torch.Tensor
                The budgets for which the hyperparameter configurations have been
                evaluated so far. Currently not used by the computation.
            learning_curves: torch.Tensor
                The learning curves for the hyperparameter configurations. Only
                read when ``use_learning_curve`` is set; the first convolution
                expects 2 input channels.

        Returns:
            torch.Tensor: One predicted value per row of ``x``.
        """
        if self.use_learning_curve:
            lc_features = self.cnn(learning_curves)
            # The pooled output is nr_rows x nr_filters x 1; drop the last axis.
            lc_features = torch.squeeze(lc_features, 2)
            x = torch.cat((x, lc_features), dim=1)

        x = self.layers(x)
        a, b, c0, c1, d1, f1 = (x[:, column] for column in range(6))

        # GLU splits its input in half and gates the first half with the
        # sigmoid of the second, so cat((v, v)) yields v * sigmoid(v).
        scale = self.last_act_func(torch.cat((b, b)))
        decay_exponent = torch.mul(self.last_act_func(torch.cat((c0, c0))), -1)
        power_term = torch.pow(predict_budgets, decay_exponent)

        # Break term: (1 + (budget / d) ** (1 / f)) ** (-c1 * f) with d and f
        # squashed into (0, 1) by a sigmoid.
        break_location = self.sigmoid_func(d1)
        break_sharpness = self.sigmoid_func(f1)
        break_term = torch.pow(
            torch.add(
                1,
                torch.pow(
                    torch.div(predict_budgets, break_location),
                    torch.div(1, break_sharpness),
                ),
            ),
            torch.mul(-c1, break_sharpness),
        )

        output = torch.add(a, torch.mul(torch.mul(scale, power_term), break_term))

        return output
def main():
    """Train a PreActResNet18 on CIFAR-10 for one dropout rate and log test accuracy.

    The dropout rate is picked by ``--index`` from
    ``np.arange(0.05, 0.85, 0.05)``. After every epoch the fraction of
    correctly classified test examples is appended to a list, and the whole
    list is rewritten as JSON to ``<output_path>/hp_config<index>.log``. With
    ``--save-model`` the final weights are stored next to it.

    The dataset is read from ``./data`` and is not downloaded. ``--dry-run``
    is accepted but not used anywhere in this function.

    Raises:
        IndexError: If ``--index`` is outside the range of available dropout
            rates.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='CIFAR10 benchmarking')
    parser.add_argument('--train_batch_size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test_batch_size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--nr_epochs', type=int, default=50, metavar='N',
                        help='number of epochs to train (default: 50)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')
    parser.add_argument('--seed', type=int, default=11, metavar='S',
                        help='random seed (default: 11)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--output_path', type=str, default='./benchmarks/cifar',
                        help='Path where the results will be stored.')
    parser.add_argument('--index', type=int, default=1,
                        help='The index of the dropout rate to be used.')

    args = parser.parse_args()
    os.makedirs(args.output_path, exist_ok=True)
    file_path = os.path.join(
        args.output_path,
        f'hp_config{args.index}.log',
    )
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    drop_rate_possible_values = np.arange(0.05, 0.85, 0.05)
    drop_rate = drop_rate_possible_values[args.index]
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

    train_set = datasets.CIFAR10(
        root='./data',
        train=True,
        download=False,
        transform=transform,
    )
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=args.train_batch_size,
        shuffle=True,
        num_workers=1,
    )

    test_set = datasets.CIFAR10(
        root='./data',
        train=False,
        download=False,
        transform=transform,
    )
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=args.test_batch_size,
        shuffle=False,
        num_workers=1,
    )

    criterion = nn.CrossEntropyLoss()
    model = PreActResNet18(drop_rate).to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.nr_epochs)

    # Loss is reported as the mean over this many mini-batches.
    log_interval = 100
    test_epoch_performances = []
    for epoch in range(args.nr_epochs):

        model.train()
        running_loss = 0.0

        for i, (inputs, labels) in enumerate(train_loader):
            # The model lives on `device`, so every batch has to follow it;
            # otherwise the forward pass fails as soon as CUDA is selected.
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % log_interval == log_interval - 1:
                # Average over the number of batches actually accumulated.
                print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / log_interval:.3f}')
                running_loss = 0.0

        model.eval()
        # total number of examples evaluated
        total = 0
        # number of correctly classified examples
        correct = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        fraction_correct_examples = correct / total
        test_epoch_performances.append(fraction_correct_examples)
        # Rewritten every epoch so a partial run still leaves a usable log.
        with open(file_path, 'w') as fp:
            json.dump({drop_rate: test_epoch_performances}, fp)

        # Passing the epoch explicitly is deprecated; the scheduler keeps its
        # own epoch counter.
        scheduler.step()

    if args.save_model:
        torch.save(
            model.state_dict(),
            os.path.join(args.output_path, f'{drop_rate}_cifar10_cnn.pt'),
        )
def load_joint_cache(task, opt_set_name):
    """Loads the learning curves for the given task and opt_set_name.

    Args:
        task: Name of the TaskSet task; used as the directory name below the
            public ``gs://gresearch/task_set_data/`` location.
        opt_set_name: Name of the optimizer family; the file read is
            ``<opt_set_name>_10000_replica5.npz``.

    Returns:
        A tuple with the ``"optimizers"``, ``"xs"`` and ``"ys"`` arrays of
        the archive, in that order.
    """
    base_dir = "gs://gresearch/task_set_data/"
    # This is an object-store URL, not a local path, so it is always joined
    # with "/" instead of the operating system's path separator.
    p = "%s%s/%s_10000_replica5.npz" % (base_dir, task, opt_set_name)

    # Arrays of an .npz archive are read lazily, so they have to be pulled
    # out before the remote file handle and the archive are closed.
    with gfile.GFile(p, "rb") as file_handle, np.load(file_handle) as cc:
        return cc["optimizers"], cc["xs"], cc["ys"]
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so it is imported explicitly before it is used below.
import urllib.request

args = parser.parse_args()

# NOTE(review): --task_id is parsed but never used to select a task; every
# task whose name starts with 'FixedTextRNNClassification' is exported.
task_id = args.task_id
task_names = get_task_names()

# hardcode to only the search space with 8 hyperparameters now
optimizer_name = 'adam8p_wide_grid_1k'
path = "https://raw.githubusercontent.com/google-research/google-research/master/task_set/optimizers/configs/adam8p_wide_grid.json"
# The optimizer configurations are identical for every task, so they are
# downloaded at most once, the first time a matching task is processed.
configs = None

for task_name in task_names:
    if not task_name.startswith('FixedTextRNNClassification'):
        continue

    results = load_tasks([task_name])
    # For each task, there is a dictionary of optimizer families; only the
    # hard-coded one is exported.
    optimizer_names, _, y = results[task_name][optimizer_name]

    nr_optimizations = y.shape[0]

    # y is indexed as [configuration, seed, step, metric]; average every
    # curve over the seed axis. Metric 0/1/2 is exported as train/valid/test.
    seed_mean = np.mean(y, axis=1)
    train_curves = seed_mean[:, :, 0].tolist()
    val_curves = seed_mean[:, :, 1].tolist()
    test_curves = seed_mean[:, :, 2].tolist()

    os.makedirs(args.output_dir, exist_ok=True)

    if configs is None:
        with urllib.request.urlopen(path) as response:
            configs = json.loads(response.read())
    hparam_dicts = [configs[optname.decode("utf8")][0] for optname in optimizer_names]

    all_results = [
        {
            'hp': hp_config,
            'train': {'loss': train_curves[hp_index]},
            'valid': {'loss': val_curves[hp_index]},
            'test': {'loss': test_curves[hp_index]},
        }
        for hp_index, hp_config in enumerate(hparam_dicts)
    ]

    result_path = os.path.join(
        args.output_dir,
        f'{task_name}_0_{nr_optimizations}.json',
    )

    with open(result_path, 'w') as file_handle:
        json.dump(all_results, file_handle)
class LCBench(BaseBenchmark):
    """Benchmark wrapper that serves learning curves from the LCBench API.

    All performance queries use the ``'Train/val_balanced_accuracy'`` tag and
    drop the first entry of the returned curve, so budget ``b`` maps to index
    ``b - 1`` of the remaining values.
    NOTE(review): the dropped entry is presumably the measurement taken before
    training starts; confirm against the LCBench API.
    """

    nr_hyperparameters = 2000
    # Declaring the search space for LCBench. Every entry is
    # [lower bound, upper bound, type, log-scale flag].
    param_space = OrderedDict([
        ('batch_size', [16, 512, int, True]),
        ('learning_rate', [0.0001, 0.1, float, True]),
        ('momentum', [0.1, 0.99, float, False]),
        ('weight_decay', [0.00001, 0.1, float, False]),
        ('num_layers', [1, 5, int, False]),
        ('max_units', [64, 1024, int, True]),
        ('max_dropout', [0.0, 1.0, float, False]),
    ])
    max_budget = 51
    min_budget = 1

    hp_names = list(param_space.keys())

    log_indicator = [True, True, False, False, False, True, False]

    # if the best value corresponds to a lower value
    minimization_metric = False

    # Tag of the curve every performance query reads.
    _VAL_CURVE_TAG = 'Train/val_balanced_accuracy'

    def __init__(self, path_to_json_file: str, dataset_name: str):
        """
        Args:
            path_to_json_file: str
                Location of the LCBench data, forwarded to the base class and
                to the LCBench ``Benchmark`` loader.
            dataset_name: str
                The dataset whose curves are queried.
        """
        super().__init__(path_to_json_file)
        self.benchmark = self._load_benchmark()
        self.dataset_name = dataset_name
        self.dataset_names = self.load_dataset_names()
        self.categorical_indicator = [False] * len(self.param_space)
        self.max_value = 1.0
        self.min_value = 0.0

    def _query_val_curve(self, config_id: int):
        """Return the validation curve of one configuration without its first entry."""
        val_curve = self.benchmark.query(
            dataset_name=self.dataset_name,
            config_id=config_id,
            tag=self._VAL_CURVE_TAG,
        )

        return val_curve[1:]

    def get_worst_performance(self):
        """Return the lowest validation value over all configurations and budgets.

        The search starts from 100 (the metric is an accuracy), so 100 is
        returned when no curve contains a smaller value.
        """
        min_value = 100
        for hp_index in range(LCBench.nr_hyperparameters):
            worst_performance_hp_curve = min(self._query_val_curve(hp_index))
            if worst_performance_hp_curve < min_value:
                min_value = worst_performance_hp_curve

        return min_value

    def _load_benchmark(self):
        """Create the LCBench API object from ``self.path_to_json_file``."""
        bench = Benchmark(
            data_dir=self.path_to_json_file,
        )

        return bench

    def load_dataset_names(self) -> List[str]:
        """Return the dataset names known to the loaded benchmark."""
        return self.benchmark.get_dataset_names()

    def get_hyperparameter_candidates(self) -> np.ndarray:
        """Return all configurations as an array with one row per configuration.

        The columns follow the key order of ``param_space``.
        """
        hp_names = list(LCBench.param_space.keys())
        hp_configs = []
        for config_id in range(LCBench.nr_hyperparameters):
            config = self.benchmark.query(
                dataset_name=self.dataset_name,
                tag='config',
                config_id=config_id,
            )
            hp_configs.append([config[hp_name] for hp_name in hp_names])

        return np.array(hp_configs)

    def get_performance(self, hp_index: int, budget: int) -> float:
        """Return the validation value of configuration ``hp_index`` at ``budget``."""
        val_curve = self._query_val_curve(hp_index)
        budget = int(budget)

        return val_curve[budget - 1]

    def get_curve(self, hp_index: int, budget: int) -> List[float]:
        """Return the first ``budget`` validation values of configuration ``hp_index``."""
        val_curve = self._query_val_curve(hp_index)
        budget = int(budget)

        return val_curve[0:budget]

    def get_incumbent_curve(self):
        """Return the curve ``query_best`` reports for this dataset, without its first entry."""
        inc_curve = self.benchmark.query_best(
            self.dataset_name,
            "Train/val_balanced_accuracy",
            "Train/val_balanced_accuracy",
            0,
        )

        return inc_curve[1:]

    def get_max_value(self):
        """Return the highest value on the incumbent curve."""
        return max(self.get_incumbent_curve())

    def get_incumbent_config_id(self):
        """Return the index of the configuration whose curve peaks highest.

        Ties keep the first configuration; ``-1`` is returned when no curve
        exceeds 0.
        """
        best_value = 0
        best_index = -1
        for index in range(LCBench.nr_hyperparameters):
            curve_peak = max(self._query_val_curve(index))
            if curve_peak > best_value:
                best_value = curve_peak
                best_index = index

        return best_index

    def get_gap_performance(self):
        """Return the difference between the best and the worst validation value."""
        best_value = max(self.get_incumbent_curve())
        worst_value = self.get_worst_performance()

        return best_value - worst_value

    def get_step_cost(self, hp_index: int, budget: int):
        """Return the cost of the step that reaches ``budget``.

        The ``'time'`` curve is treated as cumulative: for budgets above 1 the
        previous entry is subtracted, for budget 1 the entry itself is
        returned.
        """
        time_cost_curve = self.benchmark.query(
            dataset_name=self.dataset_name,
            config_id=hp_index,
            tag='time',
        )
        time_cost_curve = time_cost_curve[1:]
        budget = int(budget)
        if budget > 1:
            step_cost = time_cost_curve[budget - 1] - time_cost_curve[budget - 2]
        else:
            step_cost = time_cost_curve[budget - 1]

        return step_cost

    def set_dataset_name(self, dataset_name: str):
        """Switch the dataset that subsequent queries read from."""
        self.dataset_name = dataset_name
coloredlogs==15.0.1 33 | colorlog==6.7.0 34 | ConfigSpace==0.6.0 35 | contextlib2==21.6.0 36 | cramjam==2.5.0 37 | cycler==0.11.0 38 | Cython==0.29.32 39 | dask==2022.9.1 40 | debugpy==1.6.3 41 | decorator==5.1.1 42 | defusedxml==0.7.1 43 | dill==0.3.5.1 44 | distlib==0.3.6 45 | distributed==2022.9.1 46 | docutils==0.17.1 47 | dragonfly-opt==0.1.6 48 | entrypoints==0.4 49 | etils==0.6.0 50 | executing==0.10.0 51 | fastjsonschema==2.16.1 52 | fastparquet==0.8.3 53 | filelock==3.8.0 54 | flake8==5.0.4 55 | flatbuffers==1.12 56 | fonttools==4.29.1 57 | frozenlist==1.3.1 58 | fsspec==2022.8.2 59 | future==0.18.2 60 | gast==0.4.0 61 | google-auth==2.6.2 62 | google-auth-oauthlib==0.4.6 63 | google-pasta==0.2.0 64 | googleapis-common-protos==1.56.2 65 | gpytorch==1.8.1 66 | greenlet==1.1.3 67 | grpcio==1.43.0 68 | h5py==3.6.0 69 | HeapDict==1.0.1 70 | humanfriendly==10.0 71 | idna==3.3 72 | imagesize==1.4.1 73 | importlib-metadata==4.11.4 74 | importlib-resources==5.9.0 75 | iniconfig==1.1.1 76 | ipykernel==6.15.1 77 | ipython==8.4.0 78 | ipython-genutils==0.2.0 79 | ipywidgets==8.0.1 80 | jedi==0.18.1 81 | Jinja2==3.1.2 82 | jmespath==1.0.1 83 | joblib==1.1.0 84 | jsonschema==4.13.0 85 | jupyter-client==7.3.4 86 | jupyter_core==4.11.1 87 | jupyterlab-pygments==0.2.2 88 | jupyterlab-widgets==3.0.2 89 | keras==2.9.0 90 | Keras-Preprocessing==1.1.2 91 | kiwisolver==1.3.2 92 | latexcodec==2.0.1 93 | lcdb==0.1.0 94 | liac-arff==2.5.0 95 | libclang==13.0.0 96 | locket==1.0.0 97 | loguru==0.6.0 98 | lxml==4.9.1 99 | Mako==1.2.2 100 | Markdown==3.3.6 101 | markdown-it-py==2.1.0 102 | MarkupSafe==2.1.1 103 | matplotlib==3.5.1 104 | matplotlib-inline==0.1.6 105 | mccabe==0.7.0 106 | mdit-py-plugins==0.3.3 107 | mdurl==0.1.2 108 | minio==7.1.5 109 | mistune==0.8.4 110 | mkl-fft==1.3.1 111 | mkl-random==1.2.2 112 | mkl-service==2.4.0 113 | mpmath==1.2.1 114 | msgpack==1.0.4 115 | multipledispatch==0.6.0 116 | multiprocess==0.70.13 117 | mypy-extensions==0.4.3 118 | 
myst-parser==0.18.1 119 | nas-bench-201==2.1 120 | nbclient==0.6.6 121 | nbconvert==6.5.3 122 | nbformat==5.4.0 123 | nest-asyncio==1.5.5 124 | notebook==6.4.12 125 | numpy==1.21.5 126 | oauthlib==3.2.0 127 | olefile==0.46 128 | onnxruntime==1.12.1 129 | openml==0.12.2 130 | opt-einsum==3.3.0 131 | optuna==3.0.2 132 | packaging==21.3 133 | pandas==1.4.1 134 | pandocfilters==1.5.0 135 | parso==0.8.3 136 | partd==1.3.0 137 | pathos==0.2.9 138 | pathspec==0.10.0 139 | patsy==0.5.2 140 | pbr==5.10.0 141 | pickleshare==0.7.5 142 | Pillow==8.4.0 143 | pip==22.2.2 144 | pkgutil_resolve_name==1.3.10 145 | platformdirs==2.5.2 146 | pluggy==1.0.0 147 | pox==0.3.1 148 | ppft==1.7.6.5 149 | prettytable==3.4.1 150 | prometheus-client==0.14.1 151 | promise==2.3 152 | prompt-toolkit==3.0.30 153 | protobuf==3.19.4 154 | protobuf3-to-dict==0.1.5 155 | psutil==5.9.1 156 | pure-eval==0.2.2 157 | py==1.11.0 158 | pyaml==21.10.1 159 | pyarrow==7.0.0 160 | pyasn1==0.4.8 161 | pyasn1-modules==0.2.8 162 | pybtex==0.24.0 163 | pybtex-docutils==1.0.2 164 | pycodestyle==2.9.1 165 | pycparser==2.21 166 | pyflakes==2.5.0 167 | Pygments==2.13.0 168 | pyparsing==3.0.9 169 | pyperclip==1.8.2 170 | pyreadline3==3.4.1 171 | pyro-api==0.1.2 172 | pyro-ppl==1.8.0 173 | pyrsistent==0.18.1 174 | pytest==7.1.2 175 | pytest-timeout==2.1.0 176 | python-dateutil==2.8.2 177 | pytz==2021.3 178 | pywin32==304 179 | pywinpty==2.0.7 180 | PyYAML==6.0 181 | pyzmq==23.2.1 182 | ray==2.0.0 183 | requests==2.27.1 184 | requests-oauthlib==1.3.1 185 | rsa==4.8 186 | s3fs==0.4.2 187 | s3transfer==0.6.0 188 | sagemaker==2.125.0 189 | schema==0.7.5 190 | scikit-learn==1.0.2 191 | scikit-optimize==0.9.0 192 | scipy==1.8.0 193 | seaborn==0.11.2 194 | Send2Trash==1.8.0 195 | setuptools==58.0.4 196 | six==1.16.0 197 | sklearn==0.0 198 | smdebug-rulesconfig==1.0.1 199 | snowballstemmer==2.2.0 200 | sortedcontainers==2.4.0 201 | soupsieve==2.3.2.post1 202 | Sphinx==5.3.0 203 | sphinx_autodoc_typehints==1.19.5 204 | 
class TaskSet(BaseBenchmark):
    """Benchmark over pre-computed TaskSet loss curves stored in one JSON file per dataset.

    The file ``<path_to_json_file>/<dataset_name>.json`` holds a list of
    entries with the keys ``'hp'``, ``'train'``, ``'valid'`` and ``'test'``;
    the last three each carry a ``'loss'`` curve. Only validation curves that
    pass ``filter_curves`` are kept, together with their configurations.
    """

    nr_hyperparameters = 1000
    max_budget = 51

    hp_names = [
        'learning_rate',
        'beta1',
        'beta2',
        'epsilon',
        'l1',
        'l2',
        'linear_decay',
        'exponential_decay',
    ]

    log_indicator = [True, False, False, True, True, True, True, True]

    def __init__(self, path_to_json_files: str, dataset_name: str):
        """
        Args:
            path_to_json_files: str
                Directory with the per-dataset JSON files; forwarded to the
                base class.
            dataset_name: str
                Name of the dataset file to load, without the ``.json``
                suffix.
        """
        super().__init__(path_to_json_files)
        self.dataset_name = dataset_name
        self.hp_candidates = []
        self.training_curves = []
        self.validation_curves = []
        self.test_curves = []

        self._load_benchmark()

        filtered_indices = self.filter_curves()
        self.validation_curves = np.array(self.validation_curves)
        self.validation_curves = self.validation_curves[filtered_indices]
        self.hp_candidates = np.array(self.hp_candidates)
        self.hp_candidates = self.hp_candidates[filtered_indices]

        # One flag per hyperparameter; none of them is categorical. The count
        # comes from hp_names, which fixes the columns of every candidate row.
        self.categorical_indicator = [False] * len(self.hp_names)
        # NOTE(review): min_value holds the worst (largest) loss and max_value
        # the best (smallest) one, while log_transform_labels later stores the
        # numeric min/max under the same names -- confirm which is intended.
        self.min_value = self.get_worst_performance()
        self.max_value = self.get_best_performance()

    def get_worst_performance(self):
        """Return the largest validation loss over all kept curves.

        The search starts from 0, so 0 is returned when there are no curves or
        no loss exceeds it.
        """
        curves = np.asarray(self.validation_curves)
        if curves.size == 0:
            return 0

        return max(0, curves.max())

    def get_best_performance(self):
        """Return the smallest loss on the incumbent curve."""
        return min(self.get_incumbent_curve())

    def _load_benchmark(self):
        """Read the dataset file and fill the candidate and curve lists."""
        dataset_file = os.path.join(self.path_to_json_file, f'{self.dataset_name}.json')

        with open(dataset_file, 'r') as fp:
            dataset_info = json.load(fp)

        for optimization_iteration in dataset_info:
            hp_configuration = optimization_iteration['hp']

            # keep a fixed order for the hps and their values,
            # just in case
            self.hp_candidates.append(
                [hp_configuration[hp_name] for hp_name in self.hp_names]
            )
            self.training_curves.append(optimization_iteration['train']['loss'])
            self.validation_curves.append(optimization_iteration['valid']['loss'])
            self.test_curves.append(optimization_iteration['test']['loss'])

    def load_dataset_names(self) -> List[str]:
        """Return the names of all datasets found in the data directory.

        A dataset is a regular file with a ``.json`` suffix; the suffix is
        stripped from the returned names. Other files are skipped, since
        ``_load_benchmark`` can only open ``<name>.json``.
        """
        dataset_file_names = []
        for file_name in os.listdir(self.path_to_json_file):
            stem, extension = os.path.splitext(file_name)
            is_file = os.path.isfile(os.path.join(self.path_to_json_file, file_name))
            if is_file and extension == '.json':
                dataset_file_names.append(stem)

        return dataset_file_names

    def get_hyperparameter_candidates(self) -> np.ndarray:
        """Return the kept configurations, one row per configuration."""
        return np.array(self.hp_candidates)

    def get_performance(self, hp_index: int, budget: int) -> float:
        """Return the validation loss of configuration ``hp_index`` at ``budget``."""
        val_curve = self.validation_curves[hp_index]

        budget = int(budget)

        return val_curve[budget - 1]

    def get_curve(self, hp_index: int, budget: int) -> List[float]:
        """Return the first ``budget`` validation losses of configuration ``hp_index``."""
        val_curve = self.validation_curves[hp_index]

        budget = int(budget)

        return val_curve[0:budget].tolist()

    def get_incumbent_curve(self):
        """Return the validation curve of the incumbent configuration."""
        return self.validation_curves[self.get_incumbent_config_index()]

    def get_gap_performance(self):
        """Return the difference between the worst and the best validation loss."""
        best_value = min(self.get_incumbent_curve())
        worst_value = self.get_worst_performance()

        return worst_value - best_value

    def get_incumbent_config_index(self):
        """Return the index of the curve that reaches the lowest validation loss.

        Ties keep the first curve; ``-1`` is returned when there are no
        curves.
        """
        curves = np.asarray(self.validation_curves)
        if curves.shape[0] == 0:
            return -1

        return int(np.argmin(curves.min(axis=1)))

    def log_transform_labels(self):
        """Return the validation curves log-scaled into [0, 1].

        Also overwrites ``self.max_value`` and ``self.min_value`` with the
        largest and smallest validation loss.
        """
        validation_curves = np.array(self.validation_curves).flatten()
        max_value = np.amax(validation_curves)
        min_value = np.amin(validation_curves)
        self.max_value = max_value
        self.min_value = min_value

        log_min = np.log(min_value)
        log_transformed_values = (
            (np.log(self.validation_curves) - log_min) / (np.log(max_value) - log_min)
        )

        return log_transformed_values.tolist()

    def filter_curves(self):
        """Return a boolean mask of the curves that are complete and not diverging.

        A curve is kept when it has no missing value and every value is below
        the smallest per-step 95% quantile.
        """
        validation_curves = np.array(self.validation_curves)
        validation_curves = pd.DataFrame(validation_curves)
        # TODO do a query for both values instead of going through the df twice
        non_nan_idx = validation_curves.notnull().all(axis=1)
        divergence_threshold = validation_curves.quantile(0.95).min()
        non_diverging_idx = (validation_curves < divergence_threshold).all(axis=1)

        idx = non_nan_idx & non_diverging_idx

        return idx
hyperparameters of a Scikit-learn's Random Forest on a classification dataset](examples/01_Optimizing_RandomForest_using_DEHB.ipynb) 14 | * [02 - Optimizing Scikit-learn's Random Forest without using ConfigSpace to represent the hyperparameter space](examples/02_using%20DEHB_without_ConfigSpace.ipynb) 15 | * [03 - Hyperparameter Optimization for MNIST in PyTorch](examples/03_pytorch_mnist_hpo.py) 16 | 17 | To run the PyTorch example: (*note additional requirements*) 18 | ```bash 19 | PYTHONPATH=$PWD python examples/03_pytorch_mnist_hpo.py \ 20 | --min_budget 1 --max_budget 3 --verbose --runtime 60 21 | ``` 22 | 23 | ### Running DEHB in a parallel setting 24 | 25 | DEHB has been designed to interface with a [Dask client](https://distributed.dask.org/en/latest/api.html#distributed.Client). 26 | DEHB can either create a Dask client during instantiation and close/kill the client during garbage collection, 27 | or a client can be passed as an argument during instantiation. 28 | 29 | * Setting `n_workers` during instantiation \ 30 | If set to `1` (default) then the entire process is a sequential run without invoking Dask. \ 31 | If set to `>1` then a Dask Client is initialized with as many workers as `n_workers`. \ 32 | This parameter is ignored if `client` is not None. 33 | * Setting `client` during instantiation \ 34 | When `None` (default), a Dask client is created using the specified `n_workers`. \ 35 | Else, any custom configured Dask Client can be created and passed as the `client` argument to DEHB. 36 | 37 | #### Using GPUs in a parallel run 38 | 39 | Certain target function evaluations (especially for Deep Learning) require computations to be 40 | carried out on GPUs. The GPU devices are often ordered by device ID and if not configured, all 41 | spawned worker processes access these devices in the same order and can either run out of memory, or 42 | not exhibit parallelism. 
43 | 44 | For `n_workers>1` and when running on a single node (or local), the `single_node_with_gpus` can be 45 | passed to the `run()` call to DEHB. Setting it to `False` (default) has no effect on the default setup 46 | of the machine. Setting it to `True` will reorder the GPU device IDs dynamically by setting the environment 47 | variable `CUDA_VISIBLE_DEVICES` for each worker process executing a target function evaluation. The re-ordering 48 | is done in a manner that the first priority device is the one with the least number of active jobs assigned 49 | to it by that DEHB run. 50 | 51 | To run the PyTorch MNIST example on a single node using 2 workers: 52 | ```bash 53 | PYTHONPATH=$PWD python examples/03_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \ 54 | --verbose --runtime 60 --n_workers 2 --single_node_with_gpus 55 | ``` 56 | 57 | #### Multi-node runs 58 | 59 | Multi-node parallelism is often contingent on the cluster setup to be deployed on. Dask provides useful 60 | frameworks to interface various cluster designs. As long as the `client` passed to DEHB during 61 | instantiation is of type `dask.distributed.Client`, DEHB can interact with this client and 62 | distribute its optimisation process in a parallel manner. 63 | 64 | For instance, `Dask-CLI` can be used to create a `dask-scheduler` which can dump its connection 65 | details to a file on a cluster node accessible to all processes. Multiple `dask-worker` can then be 66 | created to interface the `dask-scheduler` by connecting to the details read from the file dumped. Each 67 | dask-worker can be triggered on any remote machine. Each worker can be configured as required, 68 | including mapping to specific GPU devices. 69 | 70 | Some helper scripts can be found [here](utils/), that can be used as reference to run DEHB in a multi-node 71 | manner on clusters managed by SLURM. 
(*not expected to work off-the-shelf*) 72 | 73 | To run the PyTorch MNIST example on a multi-node setup using 4 workers: 74 | ```bash 75 | bash utils/run_dask_setup.sh -f dask_dump/scheduler.json -e env_name -n 4 76 | sleep 5 77 | PYTHONPATH=$PWD python examples/03_pytorch_mnist_hpo.py --min_budget 1 --max_budget 3 \ 78 | --verbose --runtime 60 --scheduler_file dask_dump/scheduler.json 79 | ``` 80 | 81 | 82 | 83 | ### DEHB Hyperparameters 84 | 85 | *We recommend the default settings*. 86 | The default settings were chosen based on ablation studies over a collection of diverse problems 87 | and were found to be *generally* useful across all cases tested. 88 | However, the parameters are still available for tuning to a specific problem. 89 | 90 | The Hyperband components: 91 | * *min\_budget*: Needs to be specified for every DEHB instantiation and is used in determining 92 | the budget spacing for the problem at hand. 93 | * *max\_budget*: Needs to be specified for every DEHB instantiation. Represents the full-budget 94 | evaluation or the actual black-box setting. 95 | * *eta*: (default=3) Sets the aggressiveness of Hyperband's aggressive early stopping by retaining 96 | 1/eta configurations every round 97 | 98 | The DE components: 99 | * *strategy*: (default=`rand1_bin`) Chooses the mutation and crossover strategies for DE. `rand1` 100 | represents the *mutation* strategy while `bin` represents the *binomial crossover* strategy. \ 101 | Other mutation strategies include: {`rand2`, `rand2dir`, `best`, `best2`, `currenttobest1`, `randtobest1`}\ 102 | Other crossover strategies include: {`exp`}\ 103 | Mutation and crossover strategies can be combined with a `_` separator, for e.g.: `rand2dir_exp`. 
class SHBracketManager(object):
    """ Synchronous Successive Halving utilities

    Keeps the bookkeeping of one Successive Halving bracket. For every budget
    (rung) two counters are maintained:

    * ``sh_bracket[budget]``: jobs still to be scheduled; starts at the number
      of configs of the rung and is decremented by ``register_job``.
    * ``_sh_bracket[budget]``: jobs whose results were retrieved; starts at 0
      and is incremented by ``complete_job``.

    A rung is *pending* while it has jobs left to schedule and *waiting* while
    scheduled jobs have not reported back, i.e. while both counters together
    are smaller than the number of configs of the rung.
    """
    def __init__(self, n_configs, budgets, bracket_id=None):
        """
        Args:
            n_configs: sequence with the number of configs to run per rung.
            budgets: sequence with the budget of each rung, same length as
                ``n_configs``. Lists and numpy arrays are both accepted.
            bracket_id: optional identifier stored on the instance.
        """
        assert len(n_configs) == len(budgets)
        self.n_configs = n_configs
        self.budgets = budgets
        self.bracket_id = bracket_id
        self.sh_bracket = {}
        self._sh_bracket = {}
        self._config_map = {}
        for i, budget in enumerate(budgets):
            # (sh_bracket[b] + _sh_bracket[b]) == n_configs[i] holds when no job
            # of the rung is in flight; a smaller sum means a job was scheduled
            # and the bracket has to pause until its result is retrieved
            self.sh_bracket[budget] = n_configs[i]  # each scheduled job does -= 1
            self._sh_bracket[budget] = 0  # each retrieved job does += 1
        self.n_rungs = len(budgets)
        self.current_rung = 0

    def _rung_index(self, budget):
        """ Returns the position of budget in self.budgets (list or array)
        """
        return list(self.budgets).index(budget)

    def get_budget(self, rung=None):
        """ Returns the exact budget that rung is pointing to.

        Returns current rung's budget if no rung is passed.
        """
        if rung is not None:
            return self.budgets[rung]
        return self.budgets[self.current_rung]

    def get_lower_budget_promotions(self, budget):
        """ Returns the immediate lower budget and the number of configs to be promoted from there

        For the lowest budget the budget itself is returned as the lower one.
        """
        assert budget in self.budgets
        # positional lookup instead of np.where(budget == self.budgets), which
        # only works when budgets is a numpy array
        rung = self._rung_index(budget)
        prev_rung = max(rung - 1, 0)
        lower_budget = self.budgets[prev_rung]
        num_promote_configs = self.n_configs[rung]
        return lower_budget, num_promote_configs

    def get_next_job_budget(self):
        """ Returns the budget of the next job to schedule, None if nothing is left

        The current rung's budget is returned while it has unallocated jobs,
        otherwise the budget of the following rung (wrapping around) if that
        one has unallocated jobs.
        """
        if self.sh_bracket[self.get_budget()] > 0:
            # the current rung still has unallocated jobs (>0)
            return self.get_budget()
        # the current rung has no more jobs to allocate, look at the next one
        rung = (self.current_rung + 1) % self.n_rungs
        if self.sh_bracket[self.get_budget(rung)] > 0:
            # the incremented rung has unallocated jobs (>0)
            return self.get_budget(rung)
        # all jobs for this bracket have been allocated, no budget to evaluate
        return None

    def register_job(self, budget):
        """ Registers the allocation of a configuration for the budget and updates current rung

        This function must be called when scheduling a job in order to allow the bracket manager
        to continue job and budget allocation without waiting for jobs to finish and return
        results necessarily. This feature can be leveraged to run brackets asynchronously.
        """
        assert budget in self.budgets
        assert self.sh_bracket[budget] > 0
        self.sh_bracket[budget] -= 1
        if not self._is_rung_pending(self.current_rung):
            # increment current rung if no jobs left in the rung
            self.current_rung = (self.current_rung + 1) % self.n_rungs

    def complete_job(self, budget):
        """ Notifies the bracket that a job for a budget has been completed

        This function must be called when a config for a budget has finished evaluation to inform
        the Bracket Manager that no job needs to be waited for and the next rung can begin for the
        synchronous Successive Halving case.
        """
        assert budget in self.budgets
        _max_configs = self.n_configs[self._rung_index(budget)]
        assert self._sh_bracket[budget] < _max_configs
        self._sh_bracket[budget] += 1

    def _is_rung_waiting(self, rung):
        """ Returns True if at least one job is still pending/running and waits for results
        """
        budget = self.budgets[rung]
        job_count = self._sh_bracket[budget] + self.sh_bracket[budget]
        return bool(job_count < self.n_configs[rung])

    def _is_rung_pending(self, rung):
        """ Returns True if at least one job is pending to be allocated in the rung
        """
        return bool(self.sh_bracket[self.budgets[rung]] > 0)

    def previous_rung_waits(self):
        """ Returns True if a fully allocated rung < current rung still waits for results
        """
        for rung in range(self.current_rung):
            if self._is_rung_waiting(rung) and not self._is_rung_pending(rung):
                return True
        return False

    def is_bracket_done(self):
        """ Returns True if all configs in all rungs have been allocated and completed

        The result is a numpy bool so that `~` keeps meaning logical negation;
        applying `~` to a plain Python bool would yield a truthy integer.
        """
        return np.logical_not(self.is_pending() or self.is_waiting())

    def is_pending(self):
        """ Returns True if any of the rungs/budgets have still a configuration to submit
        """
        return np.any([self._is_rung_pending(i) for i in range(len(self.budgets))])

    def is_waiting(self):
        """ Returns True if any of the rungs/budgets have a configuration pending/running
        """
        return np.any([self._is_rung_waiting(i) for i in range(len(self.budgets))])

    def __repr__(self):
        cell_width = 9
        cell = "{{:^{}}}".format(cell_width)
        budget_cell = "{{:^{}.2f}}".format(cell_width)
        header = "|{}|{}|{}|{}|".format(
            cell.format("budget"),
            cell.format("pending"),
            cell.format("waiting"),
            cell.format("done")
        )
        _hline = "-" * len(header)
        table = [header, _hline]
        for i, budget in enumerate(self.budgets):
            pending = self.sh_bracket[budget]
            done = self._sh_bracket[budget]
            waiting = abs(self.n_configs[i] - pending - done)
            entry = "|{}|{}|{}|{}|".format(
                budget_cell.format(budget),
                cell.format(pending),
                cell.format(waiting),
                cell.format(done)
            )
            table.append(entry)
        table.append(_hline)
        return "\n".join(table)
def fidel_cost_function(self, fidelity):
    """Return the additional cost of evaluating at the given fidelity.

    Blocks until ``target_function`` has published the index of the
    configuration under evaluation in ``self.fidelity_index``, then charges
    only the steps that were not already paid for that configuration
    according to ``self.fidelity_hp_curves``.

    Args:
    -----
    fidelity: list
        A list whose first element is the fidelity (budget) value.

    Returns:
    ________
    fidelity_opt_cost:
        The full fidelity for a first evaluation, the difference to the
        highest fidelity charged so far when resuming, and 0 when the
        configuration was already charged for an equal or higher fidelity.
    """
    import time

    fidelity_value = fidelity[0]
    while True:
        # read the shared field once so the check and the use see the same value
        config_index = self.fidelity_index
        if config_index is not None:
            break
        # yield the interpreter instead of spinning at full speed
        time.sleep(0.001)

    budget_evaluated = self.fidelity_hp_curves.get(config_index)
    if budget_evaluated is None:
        # first evaluation
        fidelity_opt_cost = fidelity_value
        self.fidelity_hp_curves[config_index] = fidelity_value
    elif budget_evaluated >= fidelity_value:
        # there was a curve which was evaluated for longer
        fidelity_opt_cost = 0
    else:
        # will only resume training for the extra query
        fidelity_opt_cost = fidelity_value - budget_evaluated
        self.fidelity_hp_curves[config_index] = fidelity_value

    self.fidelity_index = None

    return fidelity_opt_cost


def target_function(
    self,
    budget: List[int],
    config: List[np.ndarray],
) -> float:
    """
    Function to evaluate for a given configuration.

    When the configuration already has a stored curve covering the budget,
    the score is read from that curve. Otherwise the configuration index is
    published in ``self.next_conf`` and the call blocks until ``observe``
    fills ``self.conf_info``.

    Args:
    -----
    budget: list
        The budget for which the configuration will be run.
    config: list
        Configuration suggested by DragonFly.

    Returns:
    ________
    score: float
        A score which indicates the validation performance
        of the configuration.
    """
    import time

    # the budget is a list initially with only one value
    budget = budget[0]
    if budget is not None:
        budget = int(budget)

    # initially the config is a list consisting of a single np.ndarray
    config = list(config[0])

    config_index = self.map_configuration_to_index(config)

    # remember which configurations were requested at this budget
    self.evaluated_configurations.setdefault(budget, set()).add(config_index)

    self.conf_budget = budget

    # a stored curve that is at least `budget` long already holds the answer
    config_curve = self.evaluated_hp_curves.get(config_index)
    need_to_query_framework = config_curve is None or len(config_curve) < budget

    # Save the config index in fidelity index, since a configuration that was
    # evaluated before is not passed to the framework, but it is still
    # needed for the cost estimation.
    self.fidelity_index = config_index

    if need_to_query_framework:
        # update the field so the framework can take the index and reply
        self.next_conf = config_index
        while True:
            conf_info = self.conf_info
            if conf_info is not None:
                break
            # the framework has not yet responded with a value, keep
            # checking without monopolising the interpreter
            time.sleep(0.001)
        score = conf_info['score']
        # save the curve for the evaluated hyperparameter configuration
        self.evaluated_hp_curves[config_index] = conf_info['val_curve']
    else:
        score = config_curve[budget - 1]

    # need to make the previous response None since DragonFly
    # continues running in the background
    self.conf_info = None

    return score


def suggest(self) -> Tuple[int, int]:
    """
    Get information about the next configuration.

    Blocks until ``target_function`` has published a configuration.

    Returns:
    ________
    next_conf, conf_budget: tuple
        A tuple that contains information about the next
        configuration (index in the hyperparameter_candidates it was
        given) and the budget for the hyperparameter to be evaluated
        on.
    """
    import time

    while self.next_conf is None:
        # DragonFly has not generated the config yet
        time.sleep(0.001)
    self.conf_info = None

    return self.next_conf, self.conf_budget


def observe(
    self,
    hp_index: int,
    budget: int,
    learning_curve: List[float],
):
    """
    Respond regarding the performance of a
    hyperparameter configuration. suggest should
    be called first to retrieve the configuration.

    Args:
    -----
    hp_index: int
        The index of the evaluated hyperparameter configuration.
    budget: int
        The budget for which the hyperparameter configuration was evaluated.
    learning_curve: np.ndarray, list
        validation accuracy curve. The last value is the same as the score.
    """
    assert self.next_conf is not None, 'Call suggest first.'
    self.next_conf = None

    self.conf_info = {
        'score': learning_curve[-1],
        'val_curve': learning_curve,
    }


def create_configuration_to_indices(
    self,
) -> Dict[Tuple, int]:
    """
    Maps every configuration to its index as specified
    in hyperparameter_candidates.

    Returns:
    ________
    hyperparameter_mapping: dict
        A dictionary where the keys are tuples representing
        hyperparameter configurations and the values are indices
        representing their placement in hyperparameter_candidates.
        When a configuration occurs more than once, the last index wins.
    """
    return {
        tuple(candidate): index
        for index, candidate in enumerate(self.hyperparameter_candidates)
    }
class Framework:
    """Runs one HPO experiment: a surrogate proposes, a benchmark answers.

    The benchmark serves learning curves, the surrogate (optimizer) suggests
    which configuration to evaluate for how long, and every consumed epoch is
    logged to a JSON result file.
    """

    def __init__(
        self,
        args: argparse.Namespace,
        seed: int,
    ):
        """
        Args:
            args: Namespace
                Includes all the arguments given as variables to the main_experiment
                script.
            seed: int
                The seed for the experiment.

        Raises:
            ValueError: if ``args.benchmark_name`` is not a supported benchmark.
        """

        if args.benchmark_name == 'lcbench':
            benchmark_extension = os.path.join(
                'lc_bench',
                'results',
                'data_2k.json',
            )
        elif args.benchmark_name == 'taskset':
            benchmark_extension = os.path.join(
                'data',
                'taskset',
            )
        elif args.benchmark_name == 'pd1':
            benchmark_extension = 'pd1'
        else:
            raise ValueError(f'Benchmark {args.benchmark_name} not supported')

        benchmark_data_path = os.path.join(
            args.project_dir,
            benchmark_extension,
        )

        benchmark_types = {
            'lcbench': LCBench,
            'taskset': TaskSet,
            'pd1': PD1,
        }

        surrogate_types = {
            'power_law': PowerLawSurrogate,
            'asha': AHBOptimizer,
            'dehb': DEHBOptimizer,
            'dragonfly': DragonFlyOptimizer,
            'random': RandomOptimizer,
        }

        # surrogates listed here receive the raw benchmark candidates
        disable_preprocessing = {
            'dehb',
        }

        self.benchmark = benchmark_types[args.benchmark_name](benchmark_data_path, args.dataset_name)
        self.dataset_name = args.dataset_name
        self.seed = seed
        self.max_value = self.benchmark.max_value
        self.min_value = self.benchmark.min_value
        self.total_budget = args.budget_limit
        self.fantasize_step = args.fantasize_step

        self.categorical_indicator = self.benchmark.categorical_indicator
        self.log_indicator = self.benchmark.log_indicator
        self.hp_names = self.benchmark.hp_names
        self.minimization_metric = self.benchmark.minimization_metric
        # accumulates everything that log_info writes to the result file
        self.info_dict = dict()
        self.result_dir = os.path.join(
            args.output_dir,
            args.benchmark_name,
            args.surrogate_name,
        )
        os.makedirs(self.result_dir, exist_ok=True)

        self.result_file = os.path.join(
            self.result_dir,
            f'{self.dataset_name}_{self.seed}.json',
        )

        if args.surrogate_name not in disable_preprocessing:
            self.hp_candidates = self.preprocess(self.benchmark.get_hyperparameter_candidates())
        else:
            self.hp_candidates = self.benchmark.get_hyperparameter_candidates()

        if args.surrogate_name == 'power_law':
            self.surrogate = surrogate_types[args.surrogate_name](
                self.hp_candidates,
                seed=seed,
                max_benchmark_epochs=self.benchmark.max_budget,
                ensemble_size=args.ensemble_size,
                nr_epochs=args.nr_epochs,
                fantasize_step=self.fantasize_step,
                minimization=self.minimization_metric,
                total_budget=args.budget_limit,
                device='cpu',
                dataset_name=args.dataset_name,
                output_path=self.result_dir,
                max_value=self.max_value,
                min_value=self.min_value,
            )
        else:
            self.surrogate = surrogate_types[args.surrogate_name](
                hyperparameter_candidates=self.hp_candidates,
                param_space=self.benchmark.param_space,
                min_budget=self.benchmark.min_budget,
                max_budget=self.benchmark.max_budget,
                eta=3,
                seed=seed,
                max_nr_trials=args.budget_limit,
                maximization=not self.benchmark.minimization_metric,
            )

    def run(self):
        """Run the HPO loop until the epoch budget is spent, then exit.

        Each iteration asks the surrogate for a configuration and a budget,
        fetches the curve from the benchmark and reports it back. Only epochs
        beyond the budget last recorded for that configuration are counted
        and logged. The process is terminated with exit status 0 once
        ``self.total_budget`` epochs were consumed.

        Raises:
            SystemExit: always, with code 0, when the budget is exhausted.
        """
        evaluated_configs = dict()
        surrogate_budget = 0
        minimization = self.benchmark.minimization_metric

        # start from the worst possible incumbent in either direction, so a
        # maximised metric is not clamped at an arbitrary 0
        best_value = np.inf if minimization else -np.inf

        while surrogate_budget < self.total_budget:

            start_time = time.time()
            hp_index, budget = self.surrogate.suggest()
            hp_curve = self.benchmark.get_curve(hp_index, budget)

            self.surrogate.observe(hp_index, budget, hp_curve)
            time_duration = time.time() - start_time

            previous_budget = evaluated_configs.get(hp_index, 0)

            budget_cost = budget - previous_budget
            evaluated_configs[hp_index] = budget

            if budget_cost <= 0:
                # no new epochs were evaluated, so nothing is consumed or
                # logged; this also avoids dividing by zero below
                continue

            step_time_duration = time_duration / budget_cost

            for epoch in range(previous_budget + 1, budget + 1):
                epoch_performance = float(hp_curve[epoch - 1])
                if minimization:
                    if best_value > epoch_performance:
                        best_value = epoch_performance
                else:
                    if best_value < epoch_performance:
                        best_value = epoch_performance

                surrogate_budget += 1

                if surrogate_budget > self.total_budget:
                    raise SystemExit(0)

                self.log_info(
                    int(hp_index),
                    epoch_performance,
                    epoch,
                    best_value,
                    step_time_duration,
                )

        # equivalent to exit(0) without depending on the site-provided builtin
        raise SystemExit(0)

    def preprocess(self, hp_candidates: np.ndarray) -> np.ndarray:
        """Preprocess the hyperparameter candidates.

        Performs min-max standardization for the numerical attributes and
        additionally one-hot encoding for the categorical attributes.

        Args:
            hp_candidates: np.ndarray
                The hyperparameter candidates in their raw form as taken
                from the benchmark.

        Returns:
            preprocessed_candidates: np.ndarray
                The transformed hyperparameter candidates after being
                preprocessed.
        """
        column_transformers = []
        numerical_columns = [
            col_index for col_index, category_indicator in enumerate(self.categorical_indicator)
            if not category_indicator
        ]
        categorical_columns = [
            col_index for col_index, category_indicator in enumerate(self.categorical_indicator)
            if category_indicator
        ]

        general_transformers = []

        if len(numerical_columns) > 0:

            if self.log_indicator is not None and any(self.log_indicator):
                log_columns = [col_index for col_index, log_indicator in enumerate(self.log_indicator) if log_indicator]
                log_transformer = FunctionTransformer(np.log)
                column_transformers.append(
                    (
                        'log_pre',
                        ColumnTransformer(
                            [('log', log_transformer, log_columns)],
                            remainder='passthrough'
                        )
                    )
                )

            general_transformers.append(('num', MinMaxScaler(), numerical_columns))

        if len(categorical_columns) > 0:
            # NOTE(review): the hyperparameter names are passed as the category
            # values of every column, which looks unintended -- confirm.
            categories = [self.hp_names] * hp_candidates.shape[1]
            try:
                # newer scikit-learn releases name the dense-output switch
                # `sparse_output`
                one_hot_encoder = OneHotEncoder(
                    categories=categories,
                    sparse_output=False,
                )
            except TypeError:
                # older releases only know the original `sparse` keyword
                one_hot_encoder = OneHotEncoder(
                    categories=categories,
                    sparse=False,
                )
            general_transformers.append(('cat', one_hot_encoder, categorical_columns))

        column_transformers.append(('feature_types_pre', ColumnTransformer(general_transformers)))

        preprocessor = Pipeline(
            column_transformers
        )
        # TODO log preprocessing will push numerical columns to the right
        #  so a mapping has to happen for the feature_types_pre
        preprocessed_candidates = preprocessor.fit_transform(hp_candidates)

        return preprocessed_candidates

    def log_info(
        self,
        hp_index: int,
        performance: float,
        budget: int,
        best_value_observed: float,
        time_duration: float,
    ):
        """Log information after every HPO iteration.

        The values are appended to the lists kept in ``self.info_dict`` and
        the whole dictionary is rewritten to ``self.result_file`` as JSON.

        Args:
            hp_index: int
                The index of the suggested hyperparameter candidate.
            performance: float
                The performance of the hyperparameter candidate.
            budget: int
                The budget at which the hyperpararameter candidate has been evaluated so far.
            best_value_observed: float
                The incumbent value observed so far during the optimization.
            time_duration: float
                The time taken for the HPO iteration.
        """
        entries = (
            ('hp', hp_index),
            ('scores', performance),
            ('curve', best_value_observed),
            ('epochs', budget),
            ('overhead', time_duration),
        )
        for key, value in entries:
            self.info_dict.setdefault(key, []).append(value)

        with open(self.result_file, 'w') as fp:
            json.dump(self.info_dict, fp)
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /plots/debugging.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import List 4 | 5 | import matplotlib 6 | import matplotlib.pyplot as plt 7 | matplotlib.use('agg') # no need for tk 8 | import numpy as np 9 | import seaborn as sns 10 | import scipy 11 | from scipy import stats 12 | 13 | sns.set( 14 | rc={ 15 | 'figure.figsize': (11.7, 8.27), 16 | 'font.size': 45, 17 | 'axes.titlesize': 45, 18 | 'axes.labelsize': 45, 19 | 'xtick.labelsize': 45, 20 | 'ytick.labelsize': 45, 21 | 'legend.fontsize': 39, 22 | }, 23 | style="white" 24 | ) 25 | 26 | 27 | def gradients_and_parameters( 28 | parameters: List[List], 29 | parameter_gradients: List[List], 30 | parameter_names: List, 31 | final_predicted_curve: List, 32 | final_true_curve: List, 33 | loss_curve, 34 | hp_index: int, 35 | max_budget: int = 1000, 36 | curve_length: int = 25, 37 | ): 38 | # Create four subplots and unpack the output array immediately 39 | f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) 40 | 41 | 
def gradients_and_parameters(
    parameters: List[List],
    parameter_gradients: List[List],
    parameter_names: List,
    final_predicted_curve: List,
    final_true_curve: List,
    loss_curve,
    hp_index: int,
    max_budget: int = 1000,
    curve_length: int = 25,
):
    """Plot a 2x2 training summary and save it as a PDF.

    The panels show the parameter values, the parameter gradients, the
    true vs. predicted curve and the loss. The figure is written to
    ``training_info_{hp_index}.pdf`` in the working directory.

    Args:
        parameters: One series of values per parameter; each series is
            plotted against ``1..max_budget``.
        parameter_gradients: One series of gradients per parameter, plotted
            against ``1..max_budget``.
        parameter_names: Names used for the legend entries, in the same
            order as ``parameters`` and ``parameter_gradients``.
        final_predicted_curve: Predicted curve, plotted against
            ``1..curve_length``.
        final_true_curve: True curve, plotted against ``1..curve_length``.
        loss_curve: Loss values, plotted against ``1..max_budget``.
        hp_index: Identifier used in the output file name.
        max_budget: Length of the x-axis of the parameter, gradient and
            loss panels.
        curve_length: Length of the x-axis of the curve panel.
    """
    # Create four subplots and unpack the output array immediately
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
    training_steps = np.arange(1, max_budget + 1)
    curve_steps = np.arange(1, curve_length + 1)

    for parameter_name, parameter_values in zip(parameter_names, parameters):
        ax1.plot(training_steps, parameter_values, label=f'{parameter_name} value')
    ax1.legend()

    for parameter_name, gradient_values in zip(parameter_names, parameter_gradients):
        ax2.plot(training_steps, gradient_values, label=f'{parameter_name} gradients')
    ax2.set_ylim(-0.5, 0.5)
    ax2.legend()

    ax3.plot(curve_steps, final_true_curve, label='True validation curve')
    ax3.plot(curve_steps, final_predicted_curve, label='Predicted validation curve')
    ax3.legend()

    # Label the loss panel explicitly instead of reusing the loop variable
    # left over from the gradient panel.
    ax4.plot(training_steps, loss_curve, label='Loss')
    ax4.legend()

    f.tight_layout()
    f.savefig(f'training_info_{hp_index}.pdf')
    # Release the figure; this function may be called once per configuration.
    plt.close(f)


def plot_grad_flow(named_parameters, i):
    """Plot the mean absolute gradient of every trainable parameter.

    The curve is drawn on the current matplotlib figure, which is then
    saved to ``gradients_epoch{i}.pdf``.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs, e.g. the
            result of ``module.named_parameters()``.
        i: Identifier used in the output file name.
    """
    ave_grads = []
    layers = []
    for name, parameter in named_parameters:
        # Parameters that took no part in the backward pass have no gradient.
        if parameter.requires_grad and parameter.grad is not None:
            layers.append(name)
            ave_grads.append(parameter.grad.abs().mean().cpu())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("Average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'gradients_epoch{i}.pdf')


# Datasets whose result files are read by the surrogate plots below. Kept in
# one place so both plots always cover the same set.
_DATASET_NAMES = (
    'APSFailure', 'Amazon_employee_access', 'Australian', 'Fashion-MNIST', 'KDDCup09_appetency',
    'MiniBooNE', 'adult', 'airlines', 'albert', 'bank-marketing', 'blood-transfusion-service-center',
    'car', 'christine', 'cnae-9', 'connect-4', 'covertype', 'credit-g', 'dionis', 'fabert', 'helena',
    'higgs', 'jannis', 'jasmine', 'jungle_chess_2pcs_raw_endgame_complete', 'kc1', 'kr-vs-kp',
    'mfeat-factors', 'nomao', 'numerai28.6', 'phoneme', 'segment', 'shuttle', 'sylvine', 'vehicle',
    'volkert',
)


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as fp:
        return json.load(fp)


def plot_conditioned_surrogates(result_dir: str):
    """Plot the mean prediction correlation against the curve fraction.

    One line is drawn per surrogate on the current figure, which is saved
    to ``conditioned_model_correlations.pdf``. Conditioned surrogates store
    a precomputed ``'correlation'`` per dataset. For the unconditioned ones
    the Pearson correlation is computed here from the per-configuration
    predicted and true performances.

    Args:
        result_dir: Root directory of the result files, laid out as
            ``<model>/<seed>/<fraction>[/<config>]/<dataset>.json``.
    """
    conditioned_models = [
        'conditioned_power_law',
        'conditioned_nn',
    ]
    unconditioned_models = ['power_law', 'nn', 'gp']

    method_names_to_pretty = {
        'conditioned_power_law': 'DPL',
        'conditioned_nn': 'Cond NN',
        'power_law': 'PL',
        'nn': 'NN',
        'gp': 'GP',
    }
    seed = 11
    val_fractions = [0.1, 0.2, 0.3, 0.4, 0.5]

    for model in conditioned_models:
        model_correlation_means = []

        for val_fraction in val_fractions:
            run_information_folder = os.path.join(
                result_dir,
                f'{model}',
                f'{seed}',
                f'{val_fraction}',
            )

            dataset_correlations = []
            for dataset_name in _DATASET_NAMES:
                information = _load_json(
                    os.path.join(run_information_folder, f'{dataset_name}.json')
                )
                dataset_correlation = information['correlation']
                # NaN correlations are left out of the mean.
                if not np.isnan(dataset_correlation):
                    dataset_correlations.append(dataset_correlation)

            if model == 'conditioned_power_law':
                print(f'Validation fraction: {val_fraction}')
                print(f'Dataset correlations: {dataset_correlations}')

            model_correlation_means.append(np.mean(dataset_correlations))

        plt.plot(
            val_fractions,
            model_correlation_means,
            label=method_names_to_pretty[model],
            marker='o',
            linestyle='--',
            linewidth=7,
        )

    for unconditioned_model_name in unconditioned_models:
        model_correlation_means = []

        for val_fraction in val_fractions:
            run_information_folder = os.path.join(
                result_dir,
                f'{unconditioned_model_name}',
                f'{seed}',
                f'{val_fraction}',
                'config_6' if unconditioned_model_name == 'nn' else 'config_1',
            )

            dataset_correlations = []
            for dataset_name in _DATASET_NAMES:
                information = _load_json(
                    os.path.join(run_information_folder, f'{dataset_name}.json')
                )
                hp_predicted_performances = [
                    hp_information['hp_predicted_performance']
                    for hp_information in information
                ]
                hp_true_performances = [
                    hp_information['hp_true_performance']
                    for hp_information in information
                ]

                dataset_correlation, _ = stats.pearsonr(
                    hp_predicted_performances, hp_true_performances
                )
                if not np.isnan(dataset_correlation):
                    dataset_correlations.append(dataset_correlation)

            model_correlation_means.append(np.mean(dataset_correlations))

        plt.plot(
            val_fractions,
            model_correlation_means,
            label=method_names_to_pretty[unconditioned_model_name],
            marker='o',
            linestyle='--',
            linewidth=7,
        )

    plt.xlabel('LC Length Fraction')
    plt.xticks(val_fractions, [f'{val_fraction}' for val_fraction in val_fractions])
    plt.ylabel('Correlation: Est. vs. True')
    plt.legend(bbox_to_anchor=(0.5, -0.42), loc='lower center', ncol=5)
    plt.savefig('conditioned_model_correlations.pdf', bbox_inches="tight")


def plot_uncertainty_estimation(mean_values, std_values, evaluated_configs, point_to_be_evaluated, hp_indices, counter):
    """Plot surrogate mean and uncertainty over the candidates.

    Every 20th candidate (up to position 2000), all evaluated candidates
    and the chosen candidate are drawn. The figure is saved to
    ``surrogate_uncertainty_{counter}.pdf``.

    Args:
        mean_values: Surrogate mean prediction per candidate position.
        std_values: Surrogate standard deviation per candidate position.
        evaluated_configs: Container of already evaluated candidates with a
            ``keys()`` method. The keys are used both as positions into
            the arrays and for membership tests against ``hp_indices``.
        point_to_be_evaluated: Position of the candidate chosen next.
        hp_indices: Candidate index per position.
        counter: Iteration counter used in the output file name.
    """
    figure = plt.figure()

    hp_indices = np.array(hp_indices)
    mean_values = np.asarray(mean_values)
    std_values = np.asarray(std_values)

    # Never sample positions beyond the available candidates.
    sampled_positions = np.arange(0, min(2000, len(hp_indices)), 20)
    # The explicit integer dtype keeps the positions usable as an index even
    # when nothing has been evaluated yet (an empty list defaults to float).
    evaluated_positions = np.array(list(evaluated_configs.keys()), dtype=int)
    point_indices = np.sort(
        np.concatenate(
            (
                sampled_positions,
                evaluated_positions,
                np.array([point_to_be_evaluated], dtype=int),
            )
        )
    )

    mean_point_x = hp_indices[point_indices]
    mean_point_y = mean_values[point_indices]
    mean_point_std = std_values[point_indices]
    plt.plot(mean_point_x, mean_point_y, color='red', label='Mean Surrogate Predictions')
    plt.fill_between(
        mean_point_x,
        np.add(mean_point_y, mean_point_std),
        np.subtract(mean_point_y, mean_point_std),
        color='red',
        alpha=0.2,
    )
    plt.plot(
        [hp_indices[point_to_be_evaluated]],
        [mean_values[point_to_be_evaluated]],
        marker="o",
        markersize=20,
        markeredgecolor="red",
        markerfacecolor="green",
        label='Chosen Point',
    )

    evaluated_points = [
        (hp_index, mean_performance)
        for hp_index, mean_performance in zip(hp_indices, mean_values)
        if hp_index in evaluated_configs
    ]
    already_evaluated_x = [hp_index for hp_index, _ in evaluated_points]
    already_evaluated_y = [mean_performance for _, mean_performance in evaluated_points]

    plt.scatter(already_evaluated_x, already_evaluated_y, color='black', label='Evaluated Points')
    plt.xlabel('Hyperparameter indices')
    plt.ylabel('Surrogate Prediction')
    plt.legend(loc='lower center', ncol=5)
    counter = int(counter)
    plt.savefig(f'surrogate_uncertainty_{counter}.pdf', bbox_inches="tight")
    # A new figure is opened on every call, so close it once it is written.
    plt.close(figure)


def plot_top_conditioned_surrogates(result_dir: str):
    """Box-plot the absolute relative error of the conditioned power law.

    For every curve fraction, one value per dataset enters the box: the
    mean of ``|real - predicted| / real`` over all configurations. The plot
    is drawn on the current figure and saved to ``pl_mae_distribution.pdf``.

    Args:
        result_dir: Root directory of the result files, laid out as
            ``<model>/<seed>/<fraction>/<dataset>.json``. Each file provides
            ``'real_labels'`` and ``'predicted_labels'``.
    """
    models = [
        'conditioned_power_law',
    ]

    seed = 11
    val_fractions = [0.1, 0.2, 0.3, 0.4, 0.5]

    model_mae = []
    for model in models:
        model_mae = []

        for val_fraction in val_fractions:
            run_information_folder = os.path.join(
                result_dir,
                f'{model}',
                f'{seed}',
                f'{val_fraction}',
            )

            dataset_errors = []
            for dataset_name in _DATASET_NAMES:
                information = _load_json(
                    os.path.join(run_information_folder, f'{dataset_name}.json')
                )
                # Pair each label with its own prediction before ranking;
                # looking predictions up by label value would hand every
                # configuration with a duplicated label the same prediction.
                ranked_pairs = sorted(
                    zip(information['real_labels'], information['predicted_labels']),
                    key=lambda pair: pair[0],
                    reverse=True,
                )
                # The relative error is undefined for a zero label, so such
                # configurations are left out instead of aborting the plot.
                config_errors = [
                    abs(real_label - predicted_label) / real_label
                    for real_label, predicted_label in ranked_pairs
                    if real_label != 0
                ]
                dataset_errors.append(np.mean(config_errors))

            model_mae.append(dataset_errors)

    meanlineprops = dict(linewidth=4)
    whiskersprops = dict(linewidth=3)
    plt.boxplot(
        model_mae,
        positions=val_fractions,
        widths=0.02,
        showfliers=False,
        whis=0.5,
        medianprops=meanlineprops,
        capprops=whiskersprops,
        boxprops=whiskersprops,
        whiskerprops=whiskersprops,
    )
    plt.xlabel('LC Length Fraction')
    plt.xlim(0, 0.6)
    plt.ylim(0, 0.4)
    plt.xticks(val_fractions, [f'{val_fraction}' for val_fraction in val_fractions])
    plt.ylabel('Absolute Relative Error')
    plt.savefig('pl_mae_distribution.pdf', bbox_inches="tight")
config_errors.append(mae) 269 | 270 | dataset_correlation, _ = stats.pearsonr(real_top_labels, predicted_top_labels) 271 | dataset_errors.append(np.mean(config_errors)) 272 | if not np.isnan(dataset_correlation): 273 | dataset_correlations.append(dataset_correlation) 274 | 275 | model_correlations.append(dataset_correlations) 276 | model_correlation_means.append(np.mean(dataset_correlations)) 277 | model_mae.append(dataset_errors) 278 | model_correlation_stds.append(np.std(dataset_correlations)) 279 | collection_mae.append(mean_absolute_relative_errors) 280 | 281 | meanlineprops = dict(linewidth=4) 282 | whiskersprops = dict(linewidth=3) 283 | plt.boxplot(model_mae, positions=val_fractions, widths=0.02, showfliers=False, whis=0.5, medianprops=meanlineprops, capprops=whiskersprops, boxprops=whiskersprops, whiskerprops=whiskersprops) 284 | plt.xlabel('LC Length Fraction') 285 | plt.xlim(0, 0.6) 286 | plt.ylim(0, 0.4) 287 | plt.xticks(val_fractions, [f'{val_fraction}' for val_fraction in val_fractions]) 288 | plt.ylabel('Absolute Relative Error') 289 | plt.savefig('pl_mae_distribution.pdf', bbox_inches="tight") 290 | -------------------------------------------------------------------------------- /surrogate_models/dehb/examples/03_pytorch_mnist_hpo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script runs a Hyperparameter Optimisation (HPO) using DEHB to tune the architecture and 3 | training hyperparameters for training a neural network on MNIST in PyTorch. 4 | 5 | The parameter space is defined in the get_configspace() function. Any configuration sampled from 6 | this space can be passed to an object of class Model() which can instantiate a CNN architecture 7 | from it. The objective_function() is the target function that DEHB minimizes for this problem. 
This 8 | function instantiates an architecture and an optimizer, as defined by a configuration, and performs the 9 | training and evaluation (on the validation set) as per the budget passed. 10 | The argument `runtime` can be passed to DEHB as a wallclock budget for running the optimisation. 11 | 12 | This tutorial also briefly refers to the different methods of interfacing DEHB with the Dask 13 | parallelism framework. Moreover, it also introduces how GPUs may be managed, which is recommended for 14 | running this example tutorial. 15 | 16 | Additional requirements: 17 | * torch>=1.7.1 18 | * torchvision>=0.8.2 19 | * torchsummary>=1.5.1 20 | 21 | PyTorch code referenced from: https://github.com/pytorch/examples/blob/master/mnist/main.py 22 | """ 23 | 24 | 25 | import os 26 | import time 27 | import pickle 28 | import argparse 29 | import numpy as np 30 | from distributed import Client 31 | 32 | import torch 33 | import torch.nn as nn 34 | import torch.nn.functional as F 35 | import torch.optim as optim 36 | import torchvision 37 | from torchvision import datasets, transforms 38 | from torchsummary import summary 39 | 40 | import ConfigSpace as CS 41 | import ConfigSpace.hyperparameters as CSH 42 | 43 | from dehb import DEHB 44 | 45 | 46 | class Model(nn.Module): 47 | def __init__(self, config, img_dim=28, output_dim=10): 48 | super().__init__() 49 | self.output_dim = output_dim 50 | self.pool_kernel = 2 51 | self.pool_stride = 1 52 | self.maxpool = nn.MaxPool2d(self.pool_kernel, self.pool_stride) 53 | self.conv1 = nn.Conv2d( 54 | in_channels=1, 55 | out_channels=config["channels_1"], 56 | kernel_size=config["kernel_1"], 57 | stride=config["stride_1"], 58 | padding=0, 59 | dilation=1 60 | ) 61 | # updating image size after conv1 62 | img_dim = self._update_size(img_dim, config["kernel_1"], config["stride_1"], 0, 1) 63 | self.conv2 = nn.Conv2d( 64 | in_channels=config["channels_1"], 65 | out_channels=config["channels_2"], 66 | kernel_size=config["kernel_2"], 67 | 
class Model(nn.Module):
    """Small CNN whose architecture is defined by a configuration.

    The network is two convolutions (the second followed by max pooling),
    dropout and two linear layers. ``config`` must provide ``channels_1``,
    ``kernel_1``, ``stride_1``, ``channels_2``, ``kernel_2``, ``stride_2``,
    ``dropout`` and ``hidden``.
    """

    def __init__(self, config, img_dim=28, output_dim=10):
        """Build the layers.

        Args:
            config: Mapping holding the architecture hyperparameters.
            img_dim: Side length of the square, single-channel input image.
            output_dim: Number of output classes.
        """
        super().__init__()
        self.output_dim = output_dim
        self.pool_kernel = 2
        self.pool_stride = 1
        self.maxpool = nn.MaxPool2d(self.pool_kernel, self.pool_stride)
        self.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=config["channels_1"],
            kernel_size=config["kernel_1"],
            stride=config["stride_1"],
            padding=0,
            dilation=1
        )
        # updating image size after conv1
        img_dim = self._update_size(img_dim, config["kernel_1"], config["stride_1"], 0, 1)
        self.conv2 = nn.Conv2d(
            in_channels=config["channels_1"],
            out_channels=config["channels_2"],
            kernel_size=config["kernel_2"],
            stride=config["stride_2"],
            padding=0,
            dilation=1
        )
        # updating image size after conv2
        img_dim = self._update_size(img_dim, config["kernel_2"], config["stride_2"], 0, 1)
        # updating image size after maxpool
        img_dim = self._update_size(img_dim, self.pool_kernel, self.pool_stride, 0, 1)
        self.dropout = nn.Dropout(config["dropout"])
        hidden_dim = config["hidden"]
        # The flattened feature map feeds the first fully connected layer.
        self.fc1 = nn.Linear(img_dim * img_dim * config["channels_2"], hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, self.output_dim)

    def forward(self, x):
        """Return the per-class log-probabilities for a batch of images."""
        # Layer 1
        x = self.conv1(x)
        x = F.relu(x)
        x = self.dropout(x)
        # Layer 2
        x = self.conv2(x)
        x = F.relu(x)
        x = self.maxpool(x)
        x = self.dropout(x)
        # FC Layer 1
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        # Output layer
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

    def _update_size(self, dim, kernel_size, stride, padding, dilation):
        """Return the output side length of a convolution or pooling layer."""
        return int(np.floor((dim + 2 * padding - (dilation * (kernel_size - 1) + 1)) / stride + 1))


def get_configspace(seed=None):
    """Return the ConfigSpace search space of the example.

    Args:
        seed: Seed passed to the ``ConfigurationSpace`` constructor.
    """
    cs = CS.ConfigurationSpace(seed)

    # Hyperparameter defining first Conv layer
    kernel1 = CSH.OrdinalHyperparameter("kernel_1", sequence=[3, 5, 7], default_value=5)
    channels1 = CSH.UniformIntegerHyperparameter("channels_1", lower=3, upper=64,
                                                 default_value=32)
    stride1 = CSH.UniformIntegerHyperparameter("stride_1", lower=1, upper=2, default_value=1)
    cs.add_hyperparameters([kernel1, channels1, stride1])

    # Hyperparameter defining second Conv layer
    kernel2 = CSH.OrdinalHyperparameter("kernel_2", sequence=[3, 5, 7], default_value=5)
    channels2 = CSH.UniformIntegerHyperparameter("channels_2", lower=3, upper=64,
                                                 default_value=32)
    stride2 = CSH.UniformIntegerHyperparameter("stride_2", lower=1, upper=2, default_value=1)
    cs.add_hyperparameters([kernel2, channels2, stride2])

    # Hyperparameter for FC layer
    hidden = CSH.UniformIntegerHyperparameter(
        "hidden", lower=32, upper=256, log=True, default_value=128
    )
    cs.add_hyperparameter(hidden)

    # Regularization Hyperparameter
    dropout = CSH.UniformFloatHyperparameter("dropout", lower=0, upper=0.5, default_value=0.1)
    cs.add_hyperparameter(dropout)

    # Training Hyperparameters
    batch_size = CSH.OrdinalHyperparameter(
        "batch_size", sequence=[2, 4, 8, 16, 32, 64], default_value=4
    )
    lr = CSH.UniformFloatHyperparameter("lr", lower=1e-6, upper=0.1, log=True,
                                        default_value=1e-3)
    cs.add_hyperparameters([batch_size, lr])
    return cs


def train(model, device, train_loader, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network returning log-probabilities.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches.
        optimizer: Optimizer updating the parameters of ``model``.
    """
    model.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()


def evaluate(model, device, data_loader, acc=False):
    """Evaluate ``model`` on ``data_loader``.

    Args:
        model: Network returning log-probabilities.
        device: Device the batches are moved to.
        data_loader: Loader of ``(data, target)`` batches; its ``dataset``
            length is used for averaging.
        acc: If True return the accuracy, otherwise the mean NLL loss.

    Returns:
        The accuracy if ``acc`` is True, else the mean negative
        log-likelihood per sample.
    """
    model.eval()
    loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    n_samples = len(data_loader.dataset)
    loss /= n_samples
    correct /= n_samples

    return correct if acc else loss


def train_and_evaluate(config, max_budget, verbose=False, **kwargs):
    """Train a model for ``max_budget`` epochs and return its test accuracy.

    Args:
        config: Mapping with the architecture hyperparameters plus
            ``batch_size`` and ``lr``.
        max_budget: Number of training epochs (truncated to an int).
        verbose: If True print a ``torchsummary`` summary of the model.
        **kwargs: Must provide ``device``, ``train_set`` and ``test_set``.

    Returns:
        Accuracy on ``test_set`` after the last epoch. If ``max_budget`` is
        below one, the accuracy of the untrained model is returned.
    """
    device = kwargs["device"]
    batch_size = config["batch_size"]
    train_set = kwargs["train_set"]
    test_set = kwargs["test_set"]
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)
    model = Model(config).to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])
    accuracy = None
    for epoch in range(1, int(max_budget) + 1):
        train(model, device, train_loader, optimizer)
        accuracy = evaluate(model, device, test_loader, acc=True)
    if accuracy is None:
        # No epoch was run: report the untrained model instead of failing on
        # an unbound variable.
        accuracy = evaluate(model, device, test_loader, acc=True)
    if verbose:
        summary(model, (1, 28, 28))  # image dimensions for MNIST
    return accuracy


def objective_function(config, budget, **kwargs):
    """The target function to minimize for HPO.

    Args:
        config: Mapping with the architecture hyperparameters plus
            ``batch_size`` and ``lr``.
        budget: Number of training epochs (truncated to an int).
        **kwargs: Must provide ``device``, ``train_set``, ``valid_set`` and
            ``test_set``.

    Returns:
        A dict with three keys. ``fitness`` is the validation loss after
        the last epoch. ``cost`` is the wallclock seconds spent on training
        and validation. ``info`` holds the test loss and the budget.
    """
    device = kwargs["device"]

    # Data Loaders
    batch_size = config["batch_size"]
    train_set = kwargs["train_set"]
    valid_set = kwargs["valid_set"]
    test_set = kwargs["test_set"]
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

    # Build model
    model = Model(config).to(device)

    # Optimizer
    optimizer = optim.Adadelta(model.parameters(), lr=config["lr"])

    start = time.time()  # measuring wallclock time
    loss = None
    for epoch in range(1, int(budget) + 1):
        train(model, device, train_loader, optimizer)
        loss = evaluate(model, device, valid_loader)
    if loss is None:
        # A budget below one epoch trains nothing; still report the
        # validation loss of the untrained model.
        loss = evaluate(model, device, valid_loader)
    cost = time.time() - start

    # not including test score computation in the `cost`
    test_loss = evaluate(model, device, test_loader)

    # dict representation that DEHB requires
    res = {
        "fitness": loss,
        "cost": cost,
        "info": {"test_loss": test_loss, "budget": budget}
    }
    return res


def input_arguments():
    """Parse and return the command line arguments of the example."""
    parser = argparse.ArgumentParser(description='Optimizing MNIST in PyTorch using DEHB.')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=123, metavar='S',
                        help='random seed (default: 123)')
    parser.add_argument('--refit_training', action='store_true', default=False,
                        help='Refit with incumbent configuration on full training data and budget')
    parser.add_argument('--min_budget', type=float, default=None,
                        help='Minimum budget (epoch length)')
    parser.add_argument('--max_budget', type=float, default=None,
                        help='Maximum budget (epoch length)')
    parser.add_argument('--eta', type=int, default=3,
                        help='Parameter for Hyperband controlling early stopping aggressiveness')
    parser.add_argument('--output_path', type=str, default="./pytorch_mnist_dehb",
                        help='Directory for DEHB to write logs and outputs')
    parser.add_argument('--scheduler_file', type=str, default=None,
                        help='The file to connect a Dask client with a Dask scheduler')
    parser.add_argument('--n_workers', type=int, default=1,
                        help='Number of CPU workers for DEHB to distribute function evaluations to')
    # Each fragment ends with a space so the concatenated help text keeps
    # its word boundaries.
    parser.add_argument('--single_node_with_gpus', default=False, action="store_true",
                        help='If True, signals the DEHB run to assume all required GPUs are on '
                             'the same node/machine. To be specified as True if no client is '
                             'passed and n_workers > 1. Should be set to False if a client is '
                             'specified as a scheduler-file created. The onus of GPU usage is then '
                             'on the Dask workers created and mapped to the scheduler-file.')
    parser.add_argument('--verbose', action="store_true", default=False,
                        help='Decides verbosity of DEHB optimization')
    parser.add_argument('--runtime', type=float, default=300,
                        help='Total time in seconds as budget to run DEHB')
    args = parser.parse_args()
    return args
The onus of GPU usage is then' 249 | 'on the Dask workers created and mapped to the scheduler-file.') 250 | parser.add_argument('--verbose', action="store_true", default=False, 251 | help='Decides verbosity of DEHB optimization') 252 | parser.add_argument('--runtime', type=float, default=300, 253 | help='Total time in seconds as budget to run DEHB') 254 | args = parser.parse_args() 255 | return args 256 | 257 | 258 | def main(): 259 | args = input_arguments() 260 | 261 | use_cuda = not args.no_cuda and torch.cuda.is_available() 262 | device = torch.device("cuda" if use_cuda else "cpu") 263 | 264 | torch.manual_seed(args.seed) 265 | 266 | # Data Preparation 267 | transform = transforms.Compose([ 268 | transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) 269 | ]) 270 | train_set = torchvision.datasets.MNIST( 271 | root='./data', train=True, download=True, transform=transform 272 | ) 273 | train_set, valid_set = torch.utils.data.random_split(train_set, [50000, 10000]) 274 | test_set = torchvision.datasets.MNIST( 275 | root='./data', train=False, download=True, transform=transform 276 | ) 277 | 278 | # Get configuration space 279 | cs = get_configspace(args.seed) 280 | dimensions = len(cs.get_hyperparameters()) 281 | 282 | # Some insights into Dask interfaces to DEHB and handling GPU devices for parallelism: 283 | # * if args.scheduler_file is specified, args.n_workers need not be specifed --- since 284 | # args.scheduler_file indicates a Dask client/server is active 285 | # * if args.scheduler_file is not specified and args.n_workers > 1 --- the DEHB object 286 | # creates a Dask client as at instantiation and dies with the associated DEHB object 287 | # * if args.single_node_with_gpus is True --- assumes that all GPU devices indicated 288 | # through the environment variable "CUDA_VISIBLE_DEVICES" resides on the same machine 289 | 290 | # Dask checks and setups 291 | single_node_with_gpus = args.single_node_with_gpus 292 | if args.scheduler_file is not 
None and os.path.isfile(args.scheduler_file): 293 | client = Client(scheduler_file=args.scheduler_file) 294 | # explicitly delegating GPU handling to Dask workers defined 295 | single_node_with_gpus = False 296 | else: 297 | client = None 298 | 299 | ########################### 300 | # DEHB optimisation block # 301 | ########################### 302 | np.random.seed(args.seed) 303 | dehb = DEHB(f=objective_function, cs=cs, dimensions=dimensions, min_budget=args.min_budget, 304 | max_budget=args.max_budget, eta=args.eta, output_path=args.output_path, 305 | # if client is not None and of type Client, n_workers is ignored 306 | # if client is None, a Dask client with n_workers is set up 307 | client=client, n_workers=args.n_workers) 308 | traj, runtime, history = dehb.run(total_cost=args.runtime, verbose=args.verbose, 309 | # arguments below are part of **kwargs shared across workers 310 | train_set=train_set, valid_set=valid_set, test_set=test_set, 311 | single_node_with_gpus=single_node_with_gpus, device=device) 312 | # end of DEHB optimisation 313 | 314 | # Saving optimisation trace history 315 | name = time.strftime("%x %X %Z", time.localtime(dehb.start)) 316 | name = name.replace("/", '-').replace(":", '-').replace(" ", '_') 317 | dehb.logger.info("Saving optimisation trace history...") 318 | with open(os.path.join(args.output_path, "history_{}.pkl".format(name)), "wb") as f: 319 | pickle.dump(history, f) 320 | 321 | # Retrain and evaluate best found configuration 322 | if args.refit_training: 323 | dehb.logger.info("Retraining on complete training data to compute test metrics...") 324 | train_set = torchvision.datasets.MNIST( 325 | root='./data', train=True, download=True, transform=transform 326 | ) 327 | incumbent = dehb.vector_to_configspace(dehb.inc_config) 328 | acc = train_and_evaluate(incumbent, args.max_budget, verbose=True, 329 | train_set=train_set, test_set=test_set, device=device) 330 | dehb.logger.info("Test accuracy of {:.3f} for the best found 
class AHBOptimizer:
    """
    Asynchronous Hyperband baseline restricted to a fixed pool of
    candidate configurations.

    Optuna samples points in the unit cube; every sampled point is snapped
    to the closest entry of ``hyperparameter_candidates`` and handed to the
    calling framework one budget step at a time through ``suggest`` and
    ``observe``. Pruning decisions are taken by Optuna's Hyperband pruner.
    """

    def __init__(
        self,
        hyperparameter_candidates: np.ndarray,
        param_space: OrderedDict,
        min_budget: int,
        max_budget: int,
        eta: int,
        seed: int = 11,
        max_nr_trials: int = 1000,
        maximization: bool = True,
        **kwargs,
    ):
        """
        Wrapper for the Async Hyperband algorithm.

        Args:
        -----
        hyperparameter_candidates: np.ndarray
            2d array which contains all possible configurations which can be queried.
        param_space: OrderedDict
            The hyperparameter search-space, indicating the type and range of every
            hyperparameter. Every value is indexed as
            (lower bound or list of categories, upper bound, type, is_log).
        min_budget: int
            Minimum number of epochs available.
        max_budget: int
            Maximum number of epochs available.
        eta: int
            Halving factor
        seed: int
            Seed used to reproduce the experiments.
        max_nr_trials: int
            Maximum number of HPO trials.
        maximization: bool
            If the inner objective is to maximize or minimize.
        """
        self.maximization = maximization
        self.min_budget = min_budget
        self.max_budget = max_budget
        self.eta = eta
        self.max_nr_trials = max_nr_trials
        self.extra_arguments = kwargs

        # param_space has to be set before the two transformations below,
        # both of them read it.
        self.param_space = param_space
        self.hyperparameter_candidates = hyperparameter_candidates
        self.hyperparameter_mapping = self.create_configuration_to_indices(
            hyperparameter_candidates,
        )
        self.transformed_hp_candidates = self.from_hp_value_to_unit_cube_values(
            hyperparameter_candidates,
        )

        self.distribution = self.get_optuna_search_space()

        # empty configuration, empty budget, empty information for config
        self.next_conf = None
        self.trial = None
        self.conf_budget = None
        self.conf_info = None
        self.fidelity_index = None
        self.rng = np.random.RandomState(seed)
        # NOTE: this also seeds numpy's global random state.
        np.random.seed(seed)

        # budget -> set of candidate indices that finished at that budget
        self.evaluated_configurations = dict()
        # candidate index -> last learning curve observed for it
        self.evaluated_hp_curves = dict()

        # define study with hyperband pruner.
        sampler = optuna.samplers.RandomSampler(seed=seed)
        self.study = optuna.create_study(
            sampler=sampler,
            direction='maximize' if self.maximization else 'minimize',
            pruner=optuna.pruners.HyperbandPruner(
                min_resource=self.min_budget,
                max_resource=self.max_budget,
                reduction_factor=self.eta,
            ),
        )

    def _mark_evaluated(self, conf_index, budget) -> None:
        """Record that ``conf_index`` finished (pruned or completed) at ``budget``."""
        self.evaluated_configurations.setdefault(budget, set()).add(conf_index)

    def _start_new_trial(self) -> None:
        """Ask optuna for a fresh trial and map it to a candidate at budget 1."""
        self.trial = self.study.ask(self.distribution)
        # The budget has to be reset *before* the mapping, otherwise the
        # candidate filter would use the budget of the previous configuration
        # (or None on the very first call).
        self.conf_budget = 1
        self.next_conf = self.get_hp_config_from_trial(self.conf_budget)

    def suggest(self) -> Tuple[int, int]:
        """
        Get information about the next configuration.

        Returns:
        ________
        next_conf, conf_budget: tuple
            A tuple that contains information about the next
            configuration (index in the hyperparameter_candidates it was
            given) and the budget for the hyperparameter to be evaluated
            on.
        """
        if self.next_conf is None:
            self._start_new_trial()

        # If the configuration has been evaluated before, replay its stored
        # curve to optuna instead of asking the framework again.
        while self.next_conf in self.evaluated_hp_curves:
            val_curve = self.evaluated_hp_curves[self.next_conf]
            # it was not evaluated as far as now, it can go to the framework
            if self.conf_budget > len(val_curve):
                break

            score = val_curve[self.conf_budget - 1]
            self.trial.report(score, self.conf_budget)

            if self.trial.should_prune():
                # hyperparameter config was pruned, sample another one
                self.study.tell(self.trial, state=optuna.trial.TrialState.PRUNED)
                self._mark_evaluated(self.next_conf, self.conf_budget)
                self._start_new_trial()
            elif self.conf_budget == self.max_budget:
                # the stored curve already covers the full budget
                self.study.tell(self.trial, val_curve[-1])
                self._mark_evaluated(self.next_conf, self.conf_budget)
                self._start_new_trial()
            else:
                # Increase the budget
                self.conf_budget += 1

        return self.next_conf, self.conf_budget

    def observe(
        self,
        hp_index: int,
        budget: int,
        learning_curve: List[float],
    ):
        """
        Respond regarding the performance of a
        hyperparameter configuration. suggest should
        be called first to retrieve the configuration.

        The result is attributed to the configuration and budget that
        ``suggest`` returned last; ``hp_index`` and ``budget`` are accepted
        for interface compatibility but are not read.

        Args:
        -----
        hp_index: int
            The index of the evaluated hyperparameter configuration.
        budget: int
            The budget for which the hyperparameter configuration was evaluated.
        learning curve: np.ndarray, list
            validation accuracy curve. The last value is the same as the score.
        """
        assert self.next_conf is not None, 'Call suggest first.'

        score = learning_curve[-1]
        self.trial.report(score, self.conf_budget)

        # A trial is finished exactly once: either pruned or completed.
        # (Checking the two cases independently would tell the same trial to
        # the study twice when it is pruned at the maximal budget.)
        if self.trial.should_prune():
            self.study.tell(self.trial, state=optuna.trial.TrialState.PRUNED)
            finished = True
        elif self.conf_budget == self.max_budget:
            self.study.tell(self.trial, score, state=optuna.trial.TrialState.COMPLETE)
            finished = True
        else:
            finished = False

        if finished:
            self.evaluated_hp_curves[self.next_conf] = learning_curve
            self._mark_evaluated(self.next_conf, self.conf_budget)
            # forces suggest() to sample a new trial
            self.next_conf = None
        else:
            self.conf_budget += 1

    def create_configuration_to_indices(
        self,
        hyperparameter_candidates: np.ndarray,
    ) -> Dict[tuple, int]:
        """
        Maps every configuration to its index as specified
        in hyperparameter_candidates.

        Args:
        -----
        hyperparameter_candidates: np.ndarray
            All the possible hyperparameter candidates given
            by the calling framework.

        Returns:
        ________
        hyperparameter_mapping: dict
            A dictionary where the keys are tuples representing
            hyperparameter configurations and the values are indices
            representing their placement in hyperparameter_candidates.
            For duplicated configurations the last index wins.
        """
        return {
            tuple(candidate): index
            for index, candidate in enumerate(hyperparameter_candidates)
        }

    def map_configuration_to_index(
        self,
        hyperparameter_candidate: np.ndarray,
    ) -> int:
        """
        Return the index of the hyperparameter_candidate from
        the given initial array of possible hyperparameters.

        Args:
        -----
        hyperparameter_candidate: np.ndarray
            Hyperparameter configuration.

        Returns:
        ________
        index of the hyperparameter_candidate.
        """
        return self.hyperparameter_mapping[tuple(hyperparameter_candidate)]

    def get_optuna_search_space(self):
        """
        Get the optuna hyperparameter search space distribution.

        Returns:
        --------
        distribution: dict
            The hyperparameter search space distribution for optuna.
        """
        # NOTE(review): UniformDistribution / LogUniformDistribution are
        # reported as deprecated in Optuna 3.x in favour of FloatDistribution;
        # migrate once the pinned optuna version allows it.
        distribution = {}
        for hp_name, hp_spec in self.param_space.items():
            hp_type, is_log = hp_spec[2], hp_spec[3]
            if hp_type != str and is_log:
                distribution[hp_name] = optuna.distributions.LogUniformDistribution(0.00001, 1)
            else:
                # categorical and linear hyperparameters share the unit interval
                distribution[hp_name] = optuna.distributions.UniformDistribution(0, 1)

        return distribution

    def get_hp_config_from_trial(self, budget: int):
        """
        Get the hyperparameter config index from the
        optuna trial.

        Args:
        -----
        budget: int
            The budget to run the hyperparameter configuration for.

        Returns:
        --------
        conf_index: int
            The hyperparameter config index.
        """
        # keep the order of param_space, it is the column order of the candidates
        hp_config = [self.trial.params[hp_name] for hp_name in self.param_space.keys()]

        return self.map_closest_evaluated(hp_config, budget)

    def map_closest_evaluated(
        self,
        config: List,
        budget: int,
    ) -> int:
        """
        Maps the hyperparameter configuration to the closest
        available hyperparameter configuration.

        The distance is the L1 distance in unit cube space. Candidates that
        already finished at ``budget`` are not considered.

        Args:
        -----
        config: List
            The hyperparameter configuration suggested by the baseline.
        budget: int
            The budget of the hyperparameter configuration.

        Returns:
        --------
        closest_configuration_index: int
            An index of the closest matching configuration, the lowest index
            on ties, or None if no candidate is available.
        """
        candidates = self.transformed_hp_candidates
        if candidates.shape[0] == 0:
            return None

        target = np.asarray(config, dtype=float)
        distances = np.abs(candidates[:, :target.size] - target).sum(axis=1).astype(float)
        # an undefined distance can never be the closest one
        distances[np.isnan(distances)] = np.inf

        # if a hyperparameter has already been evaluated for a certain
        # budget, we do not consider it anymore.
        excluded = [
            index
            for index in self.evaluated_configurations.get(budget, ())
            if index is not None
        ]
        if excluded:
            distances[excluded] = np.inf

        closest_configuration_index = int(np.argmin(distances))
        if not np.isfinite(distances[closest_configuration_index]):
            return None

        return closest_configuration_index

    def from_hp_value_to_unit_cube_values(
        self,
        hp_candidates: np.ndarray,
    ) -> np.ndarray:
        """
        Maps the hyperparameter configurations from the original
        space to the unit cube space.

        Categorical (str) hyperparameters are mapped to the middle of the
        equally sized bin of their category, log hyperparameters are scaled
        linearly in log space and the remaining ones linearly.

        Args:
        -----
        hp_candidates: np.ndarray
            The hyperparameter configuration suggested by the baseline.

        Returns:
        --------
        new_configs: np.ndarray
            An array representing the hyperparameter configurations
            in unit cube space.
        """
        assert len(hp_candidates[0]) == len(self.param_space)

        new_configs = []

        for configuration in hp_candidates:
            new_config = []
            for hp_index, hp_spec in enumerate(self.param_space.values()):
                lower, upper = hp_spec[0], hp_spec[1]
                hp_type, is_log = hp_spec[2], hp_spec[3]
                value = configuration[hp_index]
                if hp_type == str:
                    # for categorical hyperparameters the first entry holds
                    # the list of categories
                    unique_values = lower
                    step_size = 1 / len(unique_values)
                    for range_index, unique_value in enumerate(unique_values):
                        if unique_value == value:
                            # set the value at the middle of the hyperparameter
                            # allocated range
                            value = (range_index + 0.5) * step_size
                            break
                elif is_log:
                    log_range = np.log(upper) - np.log(lower)
                    value = (np.log(value) - np.log(lower)) / log_range
                else:
                    value = (value - lower) / (upper - lower)
                new_config.append(value)
            new_configs.append(new_config)

        return np.array(new_configs)
The examples here are mere placeholders and *only* offer an interface to run DEHB on toy or actual problems.\n", 10 | "***" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import time\n", 20 | "import warnings\n", 21 | "import numpy as np\n", 22 | "import ConfigSpace\n", 23 | "from typing import Dict, Union, List\n", 24 | "\n", 25 | "warnings.filterwarnings('ignore')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Getting started with DEHB" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "DEHB was designed to be an algorithm for Hyper Parameter Optimization (HPO). DEHB uses Differential Evolution (DE) under-the-hood as an Evolutionary Algorithm to power the black-box optimization that HPO problems pose. DE is a black-box optimization algorithm that generates candidate configurations $x$, to the black-box function $f(x)$, that is being optimized. The $x$ is evaluated by the black-box and the corresponding response $y$ is made available to the DE algorithm, which can then use this observation ($x$, $y$), along with previous such observations, to suggest a new candidate $x$ for the next evaluation. \n", 40 | "\n", 41 | "DEHB also uses Hyperband along with DE, to allow for cheaper approximations of the actual evaluations of $x$. Let $f(x)$ be the validation error of training a multilayer perceptron (MLP) on the complete training set. Multi-fidelity algorithms such as Hyperband, allow for cheaper approximations along a possible *fidelity*. For the MLP, a subset of the dataset may be a cheaper approximation to the full data set evaluation. The fidelity can then be quantified as the fraction of the dataset used to evaluate the configuration $x$, instead of the full dataset. 
Such approximations can allow a sneak peek into the black-box, potentially revealing certain landscape features of *f(x)*, thus rendering it a *gray*-box and not completely opaque and black! \n", 42 | "\n", 43 | "The $z$ parameter is the fidelity parameter to the black-box function. If $z \\in [budget_{min}, budget_{max}]$, then $f(x, budget_{max})$ would be equivalent to the black-box case of $f(x)$.\n", 44 | "\n", 45 | "![boxes](imgs/black-gray-box.png)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "HPO algorithms optimize such black/gray box by wrapping around this *target* function an interface, by which the algorithms can suggest new $x$ and also consume the result of the corresponding evaluation to store a collection of such ($x$, $y$) pairs. Therefore, to run DEHB, the most essential component required as input is the target function to optimize. Since DEHB can leverage Hyperband, the target function interface should account for possible input of fidelity too. 
\n", 53 | "\n", 54 | "### Sample interface for target function that DEHB optimizes" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "def target_function(\n", 64 | " x: Union[ConfigSpace.Configuration, List, np.array], \n", 65 | " budget: Union[int, float] = None,\n", 66 | " **kwargs\n", 67 | ") -> Dict:\n", 68 | " \"\"\" Target/objective function to optimize\n", 69 | " \n", 70 | " Parameters\n", 71 | " ----------\n", 72 | " x : configuration that DEHB wants to evaluate\n", 73 | " budget : parameter determining cheaper evaluations\n", 74 | " \n", 75 | " Returns\n", 76 | " -------\n", 77 | " dict\n", 78 | " \"\"\"\n", 79 | " # ...\n", 80 | " # write your code here\n", 81 | " # ...\n", 82 | " \n", 83 | " # remove the code snippet below\n", 84 | " start = time.time()\n", 85 | " y = np.random.uniform() # placeholder response of evaluation\n", 86 | " time.sleep(budget) # simulates runtime (mostly proportional to fidelity)\n", 87 | " cost = time.time() - start\n", 88 | " \n", 89 | " # result dict passed to DE/DEHB as function evaluation output\n", 90 | " result = {\n", 91 | " \"fitness\": y, # must-have key that DE/DEHB minimizes\n", 92 | " \"cost\": cost, # must-have key that associates cost/runtime \n", 93 | " \"info\": dict() # optional key containing a dictionary of additional info\n", 94 | " }\n", 95 | " return result " 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "This `target_function` is the problem that needs to be solved, or the function to be optimized. The other prerequisite for this function is therefore the domain for its input $x$. In other words, the definition and constraints of the *search space* for DEHB. \n", 103 | "\n", 104 | "The DE component inside DEHB, **assumes that the input domain is scaled to a unit hypercube**. This is essential for effective search. 
If the [ConfigSpace](https://pypi.org/project/ConfigSpace/) library is used to define the domain of $x$, or the parameters of the search space, DEHB can internally handle the scaling to and from the unit hypercube required for search. If ConfigSpace is not used, one needs to additionally handle the scaling of the parameters as an extra interface between DEHB and the target function (or encode it within the target function). \n", 105 | "\n", 106 | "For this template notebook, we will illustrate how a ConfigSpace parameter space can be created." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Defining a sample search space" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Configuration space object:\n", 126 | " Hyperparameters:\n", 127 | " x0, Type: UniformFloat, Range: [3.0, 10.0], Default: 6.5\n", 128 | "\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "import ConfigSpace\n", 134 | "\n", 135 | "\n", 136 | "def create_search_space():\n", 137 | " # Creating a one-dimensional search space of real numbers in [3, 10]\n", 138 | " cs = ConfigSpace.ConfigurationSpace()\n", 139 | " cs.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter(\"x0\", lower=3, upper=10, log=False))\n", 140 | " return cs\n", 141 | "\n", 142 | "\n", 143 | "cs = create_search_space()\n", 144 | "print(cs)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "1\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "# Finding dimensionality of search space\n", 162 | "dimensions = len(cs.get_hyperparameters())\n", 163 | "print(dimensions)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 5, 169 | 
"metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "Configuration:\n", 175 | " x0, Value: 6.3793522646424785" 176 | ] 177 | }, 178 | "execution_count": 5, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "# Sampling a random configuration\n", 185 | "cs.sample_configuration()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "The [ConfigSpace documentation](https://automl.github.io/ConfigSpace/master/index.html) can be referred to for more complicated search space creation.\n", 193 | "\n", 194 | "In a similar vein, for a complete gray-box definition, the fidelity domain needs to be defined too. For the earlier example of dataset fractions, the fidelity upper limit cannot clearly exceed 1, and therefore $[0.3, 1]$ is a legitimate definition for such a fidelity. In this template example, we shall simply define the lower and upper range of the fidelity as two parameters that can be input to DEHB. Given that fidelity is being used to simulate cost of runtime in our sample `target_function`, we shall use a reasonable time range as a placeholder for the fidelity in this case." 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "### Defining fidelity/budget range for the target function" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "min_budget, max_budget = (0.1, 3) " 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "***" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "The above definitions are all the information that DEHB needs about a problem. We are now in a position to call upon DEHB and start running it, to tune $x$." 
225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "### Instantiating and running DEHB" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 7, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "import sys\n", 241 | "sys.path.append(\"../\")\n", 242 | "from dehb import DEHB\n", 243 | "\n", 244 | "\n", 245 | "dehb = DEHB(\n", 246 | " f=target_function,\n", 247 | " dimensions=dimensions,\n", 248 | " cs=cs,\n", 249 | " min_budget=min_budget,\n", 250 | " max_budget=max_budget,\n", 251 | " output_path=\"./temp\",\n", 252 | " n_workers=1 # set to >1 to utilize parallel workers\n", 253 | ")\n", 254 | "\n", 255 | "# NOTE: the other hyperparameters to DEHB have been set to certain defaults that were \n", 256 | "# empirically determined through related literature, ablation analysis and other experiments,\n", 257 | "# but can be tuned as desired" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "DEHB allows the option of 3 different resources for its runtime budget:\n", 265 | "#### 1) Running DEHB for a certain number of (successive halving) *brackets*" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "Configuration:\n", 278 | " x0, Value: 4.647994905980703\n", 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "_, _, _ = dehb.run(brackets=1, verbose=False, save_intermediate=True)\n", 285 | "print(dehb.vector_to_configspace(dehb.inc_config))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "#### 2) Running DEHB for total number of *function evaluations*" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 9, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | 
"output_type": "stream", 303 | "text": [ 304 | "2021-03-22 17:53:27.292 | INFO | dehb.optimizers.dehb:reset:102 - \n", 305 | "\n", 306 | "RESET at 03/22/21 17:53:27 CET\n", 307 | "\n", 308 | "\n", 309 | "(Configuration:\n", 310 | " x0, Value: 3.5500062657482494\n", 311 | ", 0.04514887709783266)\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "# allows optimization to restart from the beginning by forgetting al observations\n", 317 | "dehb.reset() \n", 318 | "\n", 319 | "_, _, _ = dehb.run(fevals=20, verbose=False, save_intermediate=True)\n", 320 | "print(dehb.get_incumbents())" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "#### 3) Running DEHB for total amount of *wallclock time*" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 10, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "name": "stdout", 337 | "output_type": "stream", 338 | "text": [ 339 | "2021-03-22 17:53:29.658 | INFO | dehb.optimizers.dehb:reset:102 - \n", 340 | "\n", 341 | "RESET at 03/22/21 17:53:29 CET\n", 342 | "\n", 343 | "\n", 344 | "(Configuration:\n", 345 | " x0, Value: 6.898789003516494\n", 346 | ", 0.0068207743261361475)\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "# allows optimization to restart from the beginning by forgetting all observations\n", 352 | "dehb.reset() \n", 353 | "\n", 354 | "_, _, _ = dehb.run(total_cost=10, verbose=False, save_intermediate=True) # run for 10s\n", 355 | "print(dehb.get_incumbents())" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "Each `dehb` object initialized maintains a `log` file in the `output_path` specified, where the progress and other debugging information is updated. While every alternative DEHB evaluation (and after full optimization), an `incumbent.json` file is written to disk `output_path`, with the incumbent (best seen so far) configuration and its corresponding score. 
\n", 363 | "\n", 364 | "\n", 365 | "We now rerun DEHB in parallel with 2 workers, and show that the incumbents can be retrieved in any of the following manner:" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 11, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "(Configuration:\n", 378 | " x0, Value: 6.695906943430258\n", 379 | ", 0.015266398519239388)\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "dehb = DEHB(\n", 385 | " f=target_function,\n", 386 | " dimensions=dimensions,\n", 387 | " cs=cs,\n", 388 | " min_budget=min_budget,\n", 389 | " max_budget=max_budget,\n", 390 | " output_path=\"./temp\",\n", 391 | " n_workers=2\n", 392 | ")\n", 393 | "trajectory, runtime, history = dehb.run(\n", 394 | " total_cost=20, verbose=False,\n", 395 | ")\n", 396 | "\n", 397 | "print(dehb.get_incumbents())" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 12, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "Configuration:\n", 410 | " x0, Value: 6.695906943430258\n", 411 | "\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "print(dehb.vector_to_configspace(dehb.inc_config)) # config as ConfigSpace configuration" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 13, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "0.015266398519239388 0.015266398519239388\n", 429 | "Configuration:\n", 430 | " x0, Value: 6.695906943430258\n", 431 | "\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "print(trajectory[-1], dehb.inc_score)\n", 437 | "print(dehb.vector_to_configspace(dehb.inc_config))" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "***\n", 445 | "\n", 446 | "### Conclusion\n", 447 | "\n", 448 | "As detailed 
above, the problem definition needs to be input to DEHB as the following information:\n", 449 | "* the *target_function* (`f`) that is the primary black-box function to optimize\n", 450 | "* the fidelity range of `min_budget` and `max_budget` that allows the cheaper, faster gray-box optimization of `f`\n", 451 | "* the search space or the input domain of the function `f`, that can be represented as a `ConfigSpace` object and passed to DEHB at initialization\n", 452 | "\n", 453 | "\n", 454 | "Following which, DEHB can be run for any amount of practical real-world budget. It can be run for either:\n", 455 | "* a total amount of actual wallclock time, example one day (~86400 seconds), or\n", 456 | "* a total number of function evaluations, or the number of times we want the black-box to be accessed for evaluation, across all fidelities\n", 457 | "* the total number of *brackets* we want to run the DEHB algorithm for\n", 458 | "\n", 459 | "DEHB will terminate once its chosen runtime budget is exhausted, and report the incumbent found. DEHB, as an *anytime* algorithm, constantly writes to disk a lightweight `json` file with the best found configuration and its score seen till that point." 460 | ] 461 | } 462 | ], 463 | "metadata": { 464 | "kernelspec": { 465 | "display_name": "dask", 466 | "language": "python", 467 | "name": "dask" 468 | }, 469 | "language_info": { 470 | "codemirror_mode": { 471 | "name": "ipython", 472 | "version": 3 473 | }, 474 | "file_extension": ".py", 475 | "mimetype": "text/x-python", 476 | "name": "python", 477 | "nbconvert_exporter": "python", 478 | "pygments_lexer": "ipython3", 479 | "version": "3.6.9" 480 | } 481 | }, 482 | "nbformat": 4, 483 | "nbformat_minor": 4 484 | } 485 | --------------------------------------------------------------------------------