├── .github └── workflows │ ├── mkdocs.yml │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── CONTRIBUTING.md ├── README.md ├── ablation │ └── intro.md ├── assets │ ├── css │ │ ├── custom.css │ │ └── version-select.css │ ├── images │ │ ├── databricks_installation.png │ │ ├── firstgraph.png │ │ ├── hopsworks_installation.png │ │ ├── maggy.png │ │ ├── maggy_dt_video.png │ │ ├── maggy_hpo_video.png │ │ ├── maggyfav.png │ │ ├── scdgraph.png │ │ └── whitemaggy-eye.svg │ └── javascript │ │ └── version-select.js ├── blogs.md ├── dist_training │ ├── intro.md │ ├── tensorflow.md │ └── torch.md ├── hpo │ ├── intro.md │ └── strategies.md ├── publications.md ├── releases.md └── start │ ├── install.md │ └── quickstart.md ├── examples ├── Databricks │ ├── maggy-databricks-iris.ipynb │ └── maggy-databricks-mnist-example.ipynb └── README.md ├── maggy ├── __init__.py ├── ablation │ ├── __init__.py │ ├── ablationstudy.py │ └── ablator │ │ ├── __init__.py │ │ ├── abstractablator.py │ │ └── loco.py ├── callbacks.py ├── config │ ├── __init__.py │ ├── ablation.py │ ├── base_config.py │ ├── hyperparameter_optimization.py │ ├── lagom.py │ ├── tf_distributed.py │ └── torch_distributed.py ├── constants.py ├── core │ ├── __init__.py │ ├── config.py │ ├── environment │ │ ├── __init__.py │ │ ├── base.py │ │ ├── databricks.py │ │ ├── hopsworks.py │ │ └── singleton.py │ ├── exceptions.py │ ├── executors │ │ ├── __init__.py │ │ ├── base_executor.py │ │ ├── tf_dist_executor.py │ │ ├── torch_dist_executor.py │ │ └── trial_executor.py │ ├── experiment_driver │ │ ├── __init__.py │ │ ├── ablation_driver.py │ │ ├── base_driver.py │ │ ├── optimization_driver.py │ │ ├── python_driver.py │ │ ├── spark_driver.py │ │ ├── tf_distributed_training_driver.py │ │ └── torch_distributed_training_driver.py │ ├── patching │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── modules.py │ │ └── optim.py │ ├── reporter.py │ ├── rpc.py │ └── tf_patching │ │ ├── __init__.py │ │ └── tf_modules.py ├── earlystop │ ├── __init__.py │ ├── abstractearlystop.py │ ├── medianrule.py │ └── nostop.py ├── experiment │ ├── __init__.py │ ├── experiment.py │ ├── experiment_pyspark.py │ └── experiment_python.py ├── optimizer │ ├── __init__.py │ ├── abstractoptimizer.py │ ├── asha.py │ ├── bayes │ │ ├── __init__.py │ │ ├── acquisitions.py │ │ ├── base.py │ │ ├── gp.py │ │ └── tpe.py │ ├── gridsearch.py │ ├── randomsearch.py │ └── singlerun.py ├── pruner │ ├── __init__.py │ ├── abstractpruner.py │ └── hyperband.py ├── searchspace.py ├── tensorboard.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_maggy.py │ ├── test_randomsearch.py │ ├── test_searchspace.py │ ├── test_trial.py │ └── test_wordcount.py ├── trial.py ├── util.py └── version.py ├── mkdocs.yml ├── setup.cfg └── setup.py /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: mkdocs 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | publish-master: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.8' 18 | - name: install deps 19 | run: pip install .[dev,docs] 20 | 21 | - name: copy files 22 | run: | 23 | rm docs/CONTRIBUTING.md docs/README.md 24 | cp -f CONTRIBUTING.md docs/ 25 | cp -f README.md docs/ 26 | 27 | - name: setup git 28 | run: | 29 | git config --global user.name Mike 30 | git config --global user.email 
mike@maggy.ai 31 | 32 | - name: mike deploy master 33 | run: mike deploy --push --update-aliases master dev 34 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | stylecheck: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions/setup-python@v2 12 | with: 13 | python-version: '3.8' 14 | - name: install deps 15 | run: pip install flake8==3.9.0 black==22.3.0 pre-commit-hooks==2.4.0 16 | 17 | - name: black 18 | run: black --check maggy 19 | 20 | - name: flake8 21 | run: flake8 maggy 22 | 23 | - name: trailing-whitespace-fixer 24 | run: trailing-whitespace-fixer $(find maggy -type f) || exit 1 25 | 26 | - name: end-of-file-fixer 27 | run: end-of-file-fixer $(find maggy -type f) || exit 1 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .vscode 3 | .idea 4 | scripts/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: (^setup.py|^maggy/tests/|^docs/) 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 22.3.0 5 | hooks: 6 | - id: black 7 | language_version: python3 8 | - repo: https://gitlab.com/pycqa/flake8 9 | rev: 3.9.0 10 | hooks: 11 | - id: flake8 12 | language_version: python3 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v2.4.0 15 | hooks: 16 | - id: trailing-whitespace 17 | - id: end-of-file-fixer 18 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 | 
3 | Contributions are welcome! Not familiar with the codebase yet? No problem!
4 | There are many ways to contribute to open source projects: reporting bugs,
5 | helping with the documentation, spreading the word and, of course, adding
6 | new features and patches.
7 | 
8 | ## Reporting issues
9 | 
10 | - Describe what you expected to happen.
11 | - If possible, include a [minimal, complete, and verifiable example](https://stackoverflow.com/help/mcve) to help
12 | us identify the issue. This also helps to check that the issue is not with
13 | your own code.
14 | - Describe what actually happened. Include the full traceback if there was an
15 | exception.
16 | - List your Python, Hopsworks and Maggy versions. If possible, check if this
17 | issue is already fixed in the repository.
18 | 
19 | ## Contributing Code
20 | 
21 | Code contributions, in the form of patches or features, are welcome. In order to
22 | start developing, please follow the instructions below to enable [pre-commit](https://pre-commit.com/) and
23 | ensure style and code checks.
24 | 
25 | ### Python Setup
26 | 
27 | - Fork Maggy to your GitHub account by clicking the `Fork` button.
28 | 
29 | - Clone your fork locally:
30 | 
31 | ```bash
32 | git clone https://github.com/[username]/maggy.git
33 | cd maggy
34 | ```
35 | 
36 | - Add the upstream repository as a remote to update later:
37 | 
38 | ```bash
39 | git remote add upstream https://github.com/logicalclocks/maggy.git
40 | git fetch upstream
41 | ```
42 | 
43 | - Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda:
44 | 
45 | ```bash
46 | python3 -m venv env
47 | . env/bin/activate
48 | # or "env\Scripts\activate" on Windows
49 | ```
50 | 
51 | or with conda:
52 | 
53 | ```bash
54 | conda create --name maggy python=3.8
55 | conda activate maggy
56 | ```
57 | 
58 | Verify your Python version - we are using Python 3.8:
59 | 
60 | ```bash
61 | python --version
62 | ```
63 | 
64 | - Install Maggy in editable mode with development dependencies:
65 | 
66 | ```bash
67 | pip install -e ".[dev]"
68 | ```
69 | 
70 | - Install [pre-commit](https://pre-commit.com/) and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. Maggy uses pre-commit to ensure code style and code formatting through [black](https://github.com/psf/black) and [flake8](https://gitlab.com/pycqa/flake8):
71 | 
72 | ```bash
73 | pip install --user pre-commit
74 | pre-commit install
75 | ```
76 | 
77 | Afterwards, pre-commit will run whenever you commit.
78 | 
79 | - To run formatting and code-style checks separately, you can configure your IDE, such as VSCode, to use black and flake8, or run them via the command line:
80 | 
81 | ```bash
82 | flake8 maggy
83 | black maggy
84 | ```
85 | 
86 | ### Start coding
87 | 
88 | - Create a branch to identify the issue or feature you would like to work on.
89 | - Using your favorite editor, make your changes, committing as you go.
90 | - Follow [PEP8](https://pep8.org/).
91 | - Push your commits to GitHub and [create a pull request](https://help.github.com/articles/creating-a-pull-request/).
92 | - Celebrate 🎉
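93 | 
94 | ### Running the tests
95 | 
96 | To run the unit tests in `maggy/tests` locally, use pytest. This is a minimal sketch - pytest itself is an assumption here (the repository ships a `conftest.py`, but whether the `dev` extras pull pytest in is not stated), so install it explicitly if needed:
97 | 
98 | ```bash
99 | pip install pytest
100 | pytest maggy/tests
101 | ```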
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
In this folder you will find example notebooks for Maggy on Databricks environments.

If you are interested in using Maggy on Hopsworks or on local environments, check the example notebooks at the following link:

Maggy Examples
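For orientation before opening the notebooks, a minimal Maggy hyperparameter-optimization launch looks roughly like the sketch below. This is illustrative only: the `experiment.lagom` entry point, the `"randomsearch"` optimizer name, and the training-function signature are assumptions inferred from the package layout in this repository, and the hyperparameter names and ranges are made up.

```python
from maggy import Searchspace, experiment
from maggy.config import HyperparameterOptConfig

# Hyperparameters to search over; names, types and ranges are illustrative.
sp = Searchspace(kernel=("INTEGER", [2, 8]), dropout=("DOUBLE", [0.01, 0.99]))


def training_function(kernel, dropout, reporter):
    # Build and train a model with the sampled hyperparameters here,
    # then return the metric the optimizer should maximize.
    accuracy = 0.0  # placeholder for a real evaluation result
    reporter.broadcast(accuracy)  # stream the metric back to the driver
    return accuracy


config = HyperparameterOptConfig(
    num_trials=4,
    optimizer="randomsearch",  # assumed string name for the random search optimizer
    searchspace=sp,
    direction="max",
    name="hp_demo",
)

result = experiment.lagom(train_fn=training_function, config=config)
```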
19 | 20 | -------------------------------------------------------------------------------- /maggy/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy import searchspace 18 | 19 | Searchspace = searchspace.Searchspace 20 | 21 | __all__ = ["Searchspace"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation import ablationstudy 18 | 19 | AblationStudy = ablationstudy.AblationStudy 20 | 21 | __all__ = ["AblationStudy"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation.ablator import abstractablator 18 | 19 | AbstractAblator = abstractablator.AbstractAblator 20 | 21 | __all__ = ["AbstractAblator"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/abstractablator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from abc import ABC, abstractmethod
18 | 
19 | 
20 | class AbstractAblator(ABC):
21 |     def __init__(self, ablation_study, final_store):
22 |         self.ablation_study = ablation_study
23 |         self.final_store = final_store
24 |         self.trial_buffer = []
25 | 
26 |     @abstractmethod
27 |     def get_number_of_trials(self):
28 |         """
29 |         If applicable, calculate and return the total number of trials of the ablation experiment.
30 |         Make sure to also include the base (reference) trial in the count.
31 | 
32 |         :return: total number of trials of the ablation study experiment
33 |         :rtype: int
34 |         """
35 |         pass
36 | 
37 |     @abstractmethod
38 |     def get_dataset_generator(self, ablated_feature, dataset_type="tfrecord"):
39 |         """
40 |         Create and return a dataset generator function based on the ablation policy to be used in a trial.
41 |         The returned function will be executed on the executor for each trial.
42 | 
43 |         :param ablated_feature: the name of the feature to be excluded from the training dataset.
44 |             Must match a feature name in the corresponding feature group in the feature store.
45 |         :type ablated_feature: str
46 |         :param dataset_type: type of the dataset. For now, we only support 'tfrecord'.
47 |         :return: A function that generates a TFRecordDataset
48 |         :rtype: function
49 |         """
50 |         pass
51 | 
52 |     @abstractmethod
53 |     def get_model_generator(self, ablated_layer):
54 |         pass
55 | 
56 |     @abstractmethod
57 |     def initialize(self):
58 |         """
59 |         Initialize the ablation study experiment by generating a number of trials. Depending on the ablation policy,
60 |         this method might generate all the trials (e.g. as in LOCO), or generate a number of trials to warm-start the
61 |         experiment. The trials should be added to `trial_buffer` in the form of `Trial` objects.
62 |         """
63 |         pass
64 | 
65 |     @abstractmethod
66 |     def get_trial(self, ablation_trial=None):
67 |         """
68 |         Return a `Trial` to be assigned to an executor, or `None` if there are no trials remaining in the experiment.
69 |         The trial should contain a dataset generator and a model generator.
70 |         Depending on the ablator policy, the trials could come from a list (buffer) of pre-made trials,
71 |         or be generated on the fly.
72 | 
73 |         :rtype: `Trial` or `None`
74 |         """
75 |         pass
76 | 
77 |     @abstractmethod
78 |     def finalize_experiment(self, trials):
79 |         """
80 |         This method will be called before finishing the experiment. Developers can implement this method
81 |         e.g. for cleanup or extra logging.
82 |         """
83 |         pass
84 | 
85 |     def name(self):
86 |         return str(self.__class__.__name__)
87 | 
--------------------------------------------------------------------------------
/maggy/callbacks.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import tensorflow as tf
18 | 
19 | 
20 | class KerasBatchEnd(tf.keras.callbacks.Callback):
21 |     """A Keras callback reporting a specified `metric` at the end of the batch
22 |     to the maggy experiment driver.
23 | 
24 |     `loss` is always available as a metric, and optionally `acc` (if accuracy
25 |     monitoring is enabled, that is, accuracy is added to keras model metrics).
26 |     Validation metrics are not available for the BatchEnd callback. Validation
27 |     after every batch would be too expensive.
28 |     Default is training loss (`loss`).
29 | 
30 |     Example usage:
31 | 
32 |     >>> from maggy.callbacks import KerasBatchEnd
33 |     >>> callbacks = [KerasBatchEnd(reporter, metric='acc')]
34 |     """
35 | 
36 |     def __init__(self, reporter, metric="loss"):
37 |         super().__init__()
38 |         self.metric_name = metric
39 |         self.reporter = reporter
40 | 
41 |     def on_batch_end(self, batch, logs={}):
42 |         self.reporter.broadcast(logs.get(self.metric_name, 0))
43 | 
44 | 
45 | class KerasEpochEnd(tf.keras.callbacks.Callback):
46 |     """A Keras callback reporting a specified `metric` at the end of an epoch
47 |     to the maggy experiment driver.
48 | 
49 |     `val_loss` is always available as a metric, and optionally `val_acc` (if
50 |     accuracy monitoring is enabled, that is, accuracy is added to keras model
51 |     metrics). Training metrics are available under the names `loss` and `acc`.
52 |     Default is validation loss (`val_loss`).
53 | 
54 |     Example usage:
55 | 
56 |     >>> from maggy.callbacks import KerasEpochEnd
57 |     >>> callbacks = [KerasEpochEnd(reporter, metric='val_acc')]
58 |     """
59 | 
60 |     def __init__(self, reporter, metric="val_loss"):
61 |         super().__init__()
62 |         self.metric_name = metric
63 |         self.reporter = reporter
64 | 
65 |     def on_epoch_end(self, epoch, logs={}):
66 |         self.reporter.broadcast(logs.get(self.metric_name, 0), epoch)
67 | 
--------------------------------------------------------------------------------
/maggy/config/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # 16 | 17 | from maggy.config.lagom import LagomConfig 18 | from maggy.config.base_config import BaseConfig 19 | from maggy.config.ablation import AblationConfig 20 | from maggy.config.hyperparameter_optimization import HyperparameterOptConfig 21 | from maggy.config.torch_distributed import TorchDistributedConfig 22 | from maggy.config.tf_distributed import TfDistributedConfig 23 | 24 | __all__ = [ 25 | "LagomConfig", 26 | "BaseConfig", 27 | "AblationConfig", 28 | "HyperparameterOptConfig", 29 | "TfDistributedConfig", 30 | "TorchDistributedConfig", 31 | ] 32 | -------------------------------------------------------------------------------- /maggy/config/ablation.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from typing import Union, List 20 | 21 | from maggy.ablation.ablationstudy import AblationStudy 22 | from maggy.ablation.ablator import AbstractAblator 23 | from maggy.config import LagomConfig 24 | import tensorflow as tf 25 | from maggy.core import config as mc 26 | 27 | 28 | class AblationConfig(LagomConfig): 29 | """Config class for ablation study experiments.""" 30 | 31 | def __init__( 32 | self, 33 | ablation_study: AblationStudy, 34 | ablator: Union[str, AbstractAblator] = "loco", 35 | direction: str = "max", 36 | name: str = "ablationStudy", 37 | description: str = "", 38 | hb_interval: int = 1, 39 | model: tf.keras.Model = None, 40 | dataset: List[Union[str, tf.data.Dataset]] = None, 41 | ): 42 | """Initializes ablation study experiment parameters. 43 | 44 | :param ablation_study: Ablation study object that defines the entry point into the 45 | experiment. 46 | :param ablator: An instance of `AbstractAblator` or a supported ablator name that controls 47 | the manner in which parts of the model are ablated. 48 | :param direction: Optimization direction to evaluate the experiments. 49 | :param name: Experiment name. 50 | :param description: A description of the experiment. 51 | :param hb_interval: Heartbeat interval with which the server is polling. 52 | :param model: The class of the model to be used in the training function. 53 | :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset. 54 | These datasets represent the ones you are going to use in the training function. 55 | For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and 56 | extract them in the training function. If you want to load the set inside the training function, this can be 57 | disregarded. 
58 | """ 59 | super().__init__(name, description, hb_interval) 60 | mc.initialize() 61 | if not mc.is_spark_available(): 62 | raise NotImplementedError("Ablation Study can run only on a Spark kernel.") 63 | self.ablator = ablator 64 | self.ablation_study = ablation_study 65 | self.direction = direction 66 | self.model = model 67 | self.dataset = dataset 68 | -------------------------------------------------------------------------------- /maggy/config/base_config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from maggy.config import LagomConfig 20 | from maggy.core import config as mc 21 | 22 | 23 | class BaseConfig(LagomConfig): 24 | def __init__( 25 | self, 26 | name: str = "base", 27 | hb_interval: int = 1, 28 | description: str = "", 29 | ): 30 | 31 | """Initializes Base configuration. 32 | 33 | :param name: Experiment name. 34 | :param hb_interval: Heartbeat interval with which the server is polling. 35 | :param description: A description of the experiment. 36 | """ 37 | super().__init__(name, description, hb_interval) 38 | mc.initialize() 39 | -------------------------------------------------------------------------------- /maggy/config/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 | 
17 | from __future__ import annotations
18 | 
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | import tensorflow as tf
22 | 
23 | if typing.TYPE_CHECKING:
24 |     import torch
25 | 
26 | from maggy import Searchspace
27 | from maggy.earlystop import AbstractEarlyStop
28 | from maggy.optimizer import AbstractOptimizer
29 | from maggy.config import LagomConfig
30 | from maggy.core import config as mc
31 | 
32 | 
33 | class HyperparameterOptConfig(LagomConfig):
34 |     """Config class for hyperparameter optimization experiments."""
35 | 
36 |     def __init__(
37 |         self,
38 |         num_trials: int,
39 |         optimizer: Union[str, AbstractOptimizer],
40 |         searchspace: Searchspace,
41 |         optimization_key: str = "Metric",
42 |         direction: str = "max",
43 |         es_interval: int = 1,
44 |         es_min: int = 10,
45 |         es_policy: Union[str, AbstractEarlyStop] = "median",
46 |         name: str = "HPOptimization",
47 |         description: str = "",
48 |         hb_interval: int = 1,
49 |         model: Union[
50 |             tf.keras.Model, Type[torch.nn.Module], List[Type[torch.nn.Module]]
51 |         ] = None,
52 |         dataset: List[
53 |             Optional[Union[str, tf.data.Dataset, torch.utils.data.Dataset]]
54 |         ] = None,
55 |     ):
56 |         """Initializes HP optimization experiment parameters.
57 | 
58 |         :param num_trials: Controls how many separate runs are conducted during the hp search.
59 |         :param optimizer: Optimizer type for searching the hp searchspace.
60 |         :param searchspace: A Searchspace object configuring the names, types and ranges of hps.
61 |         :param optimization_key: Name of the metric to use for hp search evaluation.
62 |         :param direction: Direction of optimization.
63 |         :param es_interval: Early stopping polling frequency during an experiment run.
64 |         :param es_min: Minimum number of experiments to conduct before starting the early stopping
65 |             mechanism. Useful to establish a baseline for performance estimates.
66 |         :param es_policy: Early stopping policy which formulates a rule for triggering aborts.
67 |         :param name: Experiment name.
68 |         :param description: A description of the experiment.
69 |         :param hb_interval: Heartbeat interval with which the server is polling.
70 |         :param model: The class of the model to be used in the training function.
71 |         :param dataset: A list of strings containing the dataset path, or a list of tf.data.Dataset or
72 |             torch.utils.data.Dataset. These datasets represent the ones you are going to use in the training function.
73 |             For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and
74 |             extract them in the training function. If you want to load the set inside the training function, this can be
75 |             disregarded.
76 |         """
77 |         super().__init__(name, description, hb_interval)
78 |         if not mc.is_spark_available():
79 |             raise NotImplementedError(
80 |                 "Hyperparameter Optimization can run only on a Spark kernel."
81 |             )
82 |         if not num_trials > 0:
83 |             raise ValueError("Number of trials should be greater than zero!")
84 |         self.num_trials = num_trials
85 |         self.optimizer = optimizer
86 |         self.optimization_key = optimization_key
87 |         self.searchspace = searchspace
88 |         self.direction = direction
89 |         self.es_policy = es_policy
90 |         self.es_interval = es_interval
91 |         self.es_min = es_min
92 |         self.model = model
93 |         self.dataset = dataset
94 | 
--------------------------------------------------------------------------------
/maggy/config/lagom.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from __future__ import annotations
18 | 
19 | from abc import ABC
20 | 
21 | 
22 | class LagomConfig(ABC):
23 |     """Base class for lagom configuration classes."""
24 | 
25 |     def __init__(self, name: str, description: str, hb_interval: int):
26 |         """Initializes basic experiment info.
27 | 
28 |         :param name: Experiment name.
29 |         :param description: A description of the experiment.
30 |         :param hb_interval: Heartbeat interval with which the server is polling.
31 |         """
32 |         self.name = name
33 |         self.description = description
34 |         self.hb_interval = hb_interval
35 | 
--------------------------------------------------------------------------------
/maggy/config/tf_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from __future__ import annotations
18 | 
19 | from typing import Union, Callable, List, Optional
20 | 
21 | from maggy.config import LagomConfig
22 | 
23 | import tensorflow as tf
24 | 
25 | 
26 | class TfDistributedConfig(LagomConfig):
27 |     def __init__(
28 |         self,
29 |         model: tf.keras.Model = None,
30 |         dataset: List[Optional[Union[str, tf.data.Dataset]]] = None,
31 |         process_data: Callable = None,
32 |         mixed_precision: bool = False,
33 |         name: str = "tfDist",
34 |         hb_interval: int = 1,
35 |         description: str = "",
36 |         hparams: dict = None,
37 |     ):
38 | 
39 |         """Initializes TensorFlow distributed training parameters.
40 | 
41 |         :param model: A tf.keras.Model subclass or list of them.
42 |             Note that this has to be the class itself, not an instance.
43 |         :param dataset: A list of strings containing the dataset path or a list of tf.data.Dataset.
44 |             These datasets represent the ones you are going to use in the training function. For example,
45 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
46 |             the training function. If you want to load the set inside the training function, this can be disregarded.
47 |         :param process_data: The function for processing the data.
48 |         :param mixed_precision: Whether to use mixed precision during training.
49 |         :param hparams: Model parameters that should be used during model initialization. Primarily
50 |             used to give an interface for hp optimization.
51 |         :param name: Experiment name.
52 |         :param hb_interval: Heartbeat interval with which the server is polling.
53 |         :param description: A description of the experiment.
54 |         """
55 |         super().__init__(name, description, hb_interval)
56 |         self.model = model
57 |         self.dataset = dataset
58 |         self.process_data = process_data
59 |         self.mixed_precision = mixed_precision
60 |         self.hparams = hparams if hparams else {}
61 | 
--------------------------------------------------------------------------------
/maggy/config/torch_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from __future__ import annotations
18 | 
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | from maggy.config import LagomConfig
22 | from maggy.core import config as mc
23 | 
24 | if typing.TYPE_CHECKING:
25 |     import torch
26 | 
27 | 
28 | class TorchDistributedConfig(LagomConfig):
29 |     """LagomConfig class for running distributed PyTorch training."""
30 | 
31 |     BACKENDS = ["torch", "deepspeed"]
32 | 
33 |     def __init__(
34 |         self,
35 |         module: Union[Type[torch.nn.Module], List[Type[torch.nn.Module]]],
36 |         dataset: List[Optional[Union[str, torch.utils.data.Dataset]]] = None,
37 |         hparams: dict = None,
38 |         backend: str = "torch",
39 |         mixed_precision: bool = False,
40 |         zero_lvl: int = 0,
41 |         deepspeed_config: dict = None,
42 |         name: str = "torchDist",
43 |         hb_interval: int = 1,
44 |         description: str = "",
45 |     ):
46 |         """Initializes PyTorch distributed training parameters.
47 | 
48 |         :param module: A PyTorch module class or list of PyTorch module classes.
49 |             Note that this has to be the class itself, not an instance.
50 |         :param dataset: A list of strings containing the dataset path or a list of torch.utils.data.Dataset.
51 |             These datasets represent the ones you are going to use in the training function. For example,
52 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
53 |             the training function. If you want to load the set inside the training function, this can be disregarded.
54 |         :param hparams: Hyperparameters that should be used during model initialization. Primarily
55 |             used to give an interface for hp optimization.
56 |         :param backend: The backend framework used for training.
Note that `deepspeed` needs syntax 57 | changes to a normal PyTorch script! 58 | :param mixed_precision: Used to control the use of mixed precision training in `torch` 59 | backend mode with model sharding (`zero_lvl` 3). 60 | :param zero_lvl: Sets the ZeRO optimization stages for `torch`. Note: When using `deepspeed` 61 | backend, overwrites `deepspeed_config` zero level! 62 | :param deepspeed_config: A dictionary that represents a valid deepspeed ZeRO optimizer 63 | config. For information on the config, see https://www.deepspeed.ai/docs/config-json/. 64 | :param name: Experiment name. 65 | :param hb_interval: Heartbeat interval with which the server is polling. 66 | :param description: A description of the experiment. 67 | """ 68 | super().__init__(name, description, hb_interval) 69 | mc.initialize() 70 | if not mc.is_spark_available(): 71 | raise NotImplementedError( 72 | "Torch Distributed Training can run only on a Spark kernel." 73 | ) 74 | self.module = module 75 | self.dataset = dataset 76 | if backend not in self.BACKENDS: 77 | raise ValueError( 78 | """Backend {} not supported by Maggy. 79 | Supported types are: {}""".format( 80 | backend, self.BACKENDS 81 | ) 82 | ) 83 | self.backend = backend 84 | self.mixed_precision = mixed_precision 85 | self.hparams = hparams if hparams else {} 86 | self.zero_lvl = zero_lvl 87 | self.ds_config = deepspeed_config 88 | -------------------------------------------------------------------------------- /maggy/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Constants used in Maggy: Allowed datatypes etc. 19 | """ 20 | import numpy as np 21 | 22 | 23 | class USER_FCT: 24 | """User training function specifics.""" 25 | 26 | RETURN_TYPES = (float, int, np.number, dict) 27 | NUMERIC_TYPES = (float, int, np.number) 28 | -------------------------------------------------------------------------------- /maggy/core/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import tensorflow as tf 18 | 19 | SPARK_AVAILABLE = None 20 | try: 21 | from pyspark.sql import SparkSession # noqa: F401 22 | 23 | SPARK_AVAILABLE = True 24 | except ModuleNotFoundError: 25 | SPARK_AVAILABLE = False 26 | 27 | MODE = None 28 | TF_VERSION = None 29 | 30 | 31 | def initialize(): 32 | tf_full = tf.__version__.split(".")[0] 33 | # for building the docs since mock object doesn't mock int() 34 | global TF_VERSION 35 | global MODE 36 | if not isinstance(tf_full, str): 37 | TF_VERSION = 2 38 | else: 39 | TF_VERSION = int(tf_full) 40 | 41 | print("Detected Kernel: Python.") if not SPARK_AVAILABLE else print( 42 | "Detected Kernel: Spark." 43 | ) 44 | 45 | 46 | def is_spark_available(): 47 | return SPARK_AVAILABLE 48 | -------------------------------------------------------------------------------- /maggy/core/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/environment/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | from maggy import util 22 | from maggy.core.rpc import Client 23 | 24 | 25 | class BaseEnv: 26 | """ 27 | Support maggy on a local pyspark installation. 
28 |     """
29 | 
30 |     def __init__(self):
31 |         self.log_dir = os.path.join(os.getcwd(), "experiment_log")
32 |         if not os.path.exists(self.log_dir):
33 |             os.mkdir(self.log_dir)
34 | 
35 |     def set_ml_id(self, app_id=0, run_id=0):
36 |         os.environ["ML_ID"] = str(app_id) + "_" + str(run_id)
37 | 
38 |     def create_experiment_dir(self, app_id, run_id):
39 |         if not os.path.exists(os.path.join(self.log_dir, app_id)):
40 |             os.mkdir(os.path.join(self.log_dir, app_id))
41 | 
42 |         experiment_path = self.get_logdir(app_id, run_id)
43 |         if os.path.exists(experiment_path):
44 |             shutil.rmtree(experiment_path)
45 | 
46 |         os.mkdir(experiment_path)
47 | 
48 |     def get_logdir(self, app_id, run_id):
49 |         return os.path.join(self.log_dir, str(app_id), str(run_id))
50 | 
51 |     def populate_experiment(
52 |         self,
53 |         model_name,
54 |         function,
55 |         type,
56 |         hp,
57 |         description,
58 |         app_id,
59 |         direction,
60 |         optimization_key,
61 |     ):
62 |         pass
63 | 
64 |     def attach_experiment_xattr(self, exp_ml_id, experiment_json, command):
65 |         pass
66 | 
67 |     def exists(self, hdfs_path):
68 |         return os.path.exists(hdfs_path)
69 | 
70 |     def mkdir(self, hdfs_path):
71 |         return os.mkdir(hdfs_path)
72 | 
73 |     def isdir(self, dir_path, project=None):
74 |         return os.path.isdir(dir_path)
75 | 
76 |     def ls(self, dir_path):
77 |         return os.listdir(dir_path)
78 | 
79 |     def delete(self, path, recursive=False):
80 | 
81 |         if self.exists(path):
82 |             if os.path.isdir(path):
83 |                 if recursive:
84 |                     # remove the directory recursively
85 |                     shutil.rmtree(path)
86 |                 elif not os.listdir(path):
87 |                     os.rmdir(path)
88 |                 else:
89 |                     warnings.warn(
90 |                         "Could not delete the dir {}, not empty.\n"
91 |                         "Use recursive=True when calling this function".format(path)
92 |                     )
93 |             elif os.path.isfile(path):
94 |                 os.remove(path)
95 |             else:
96 |                 warnings.warn(
97 |                     "Could not delete the file in {}.\n"
98 |                     "File does not exist.".format(path)
99 |                 )
100 | 
101 |     def dump(self, data, hdfs_path):
102 |         head_tail = os.path.split(hdfs_path)
103 |         if not os.path.exists(head_tail[0]):
104 |             os.makedirs(head_tail[0])
105 |         with self.open_file(hdfs_path, flags="w") as file:
106 |             file.write(data)
107 | 
108 |     def get_ip_address(self):
109 |         sc = util.find_spark().sparkContext
110 |         return sc._conf.get("spark.driver.host")
111 | 
112 |     def get_constants(self):
113 |         pass
114 | 
115 |     def open_file(self, hdfs_path, flags="r", buff_size=-1):
116 |         return open(hdfs_path, mode=flags, buffering=buff_size)
117 | 
118 |     def get_training_dataset_path(
119 |         self, training_dataset, featurestore=None, training_dataset_version=1
120 |     ):
121 |         pass
122 | 
123 |     def get_training_dataset_tf_record_schema(
124 |         self, training_dataset, training_dataset_version=1, featurestore=None
125 |     ):
126 |         pass
127 | 
128 |     def get_featurestore_metadata(self, featurestore=None, update_cache=False):
129 |         pass
130 | 
131 |     def init_ml_tracking(self, app_id, run_id):
132 |         pass
133 | 
134 |     def log_searchspace(self, app_id, run_id, searchspace):
135 |         pass
136 | 
137 |     def connect_host(self, server_sock, server_host_port, exp_driver):
138 |         if not server_host_port:
139 |             server_sock.bind(("", 0))
140 |             # hostname may not be resolvable but IP address probably will be
141 |             host = self.get_ip_address()
142 |             port = server_sock.getsockname()[1]
143 |             server_host_port = (host, port)
144 | 
145 |         else:
146 |             server_sock.bind(server_host_port)
147 | 
148 |         server_sock.listen(10)
149 | 
150 |         return server_sock, server_host_port
151 | 
152 |     def _upload_file_output(self, retval, hdfs_exec_logdir):
153 |         pass
154 | 
155 |     def
project_path(self): 156 | return os.getcwd() 157 | 158 | def get_user(self): 159 | return "" 160 | 161 | def project_name(self): 162 | return "" 163 | 164 | def finalize_experiment( 165 | self, 166 | experiment_json, 167 | metric, 168 | app_id, 169 | run_id, 170 | state, 171 | duration, 172 | logdir, 173 | best_logdir, 174 | optimization_key, 175 | ): 176 | pass 177 | 178 | def str_or_byte(self, str): 179 | return str 180 | 181 | def get_executors(self, sc): 182 | 183 | if sc._conf.get("spark.dynamicAllocation.enabled") == "true": 184 | maxExecutors = int( 185 | sc._conf.get("spark.dynamicAllocation.maxExecutors", defaultValue="-1") 186 | ) 187 | if maxExecutors == -1: 188 | raise KeyError( 189 | 'Failed to find "spark.dynamicAllocation.maxExecutors" property, ' 190 | "but dynamicAllocation is enabled. " 191 | "Define the number of min and max executors when building the spark session." 192 | ) 193 | else: 194 | maxExecutors = int( 195 | sc._conf.get("spark.executor.instances", defaultValue="-1") 196 | ) 197 | if maxExecutors == -1: 198 | raise KeyError( 199 | 'Failed to find "spark.executor.instances" property, ' 200 | 'Define the number of executors using "spark.executor.instances" ' 201 | "when building the spark session." 202 | ) 203 | return maxExecutors 204 | 205 | def build_summary_json(self, logdir): 206 | pass 207 | 208 | def connect_hsfs(self): 209 | pass 210 | 211 | def convert_return_file_to_arr(self, return_file): 212 | pass 213 | 214 | def upload_file_output(self, retval, hdfs_exec_logdir): 215 | pass 216 | 217 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 218 | client_addr = ( 219 | self.get_ip_address(), 220 | sock.getsockname()[1], 221 | ) 222 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 223 | -------------------------------------------------------------------------------- /maggy/core/environment/databricks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | from maggy.core.environment.base import BaseEnv 20 | from maggy.core.rpc import Client 21 | 22 | 23 | class DatabricksEnv(BaseEnv): 24 | """ 25 | This class extends BaseEnv. 26 | Environment implemented for maggy usage on Databricks. 
27 | """ 28 | 29 | def __init__(self): 30 | self.log_dir = "/dbfs/maggy_log/" 31 | if not os.path.exists(self.log_dir): 32 | os.mkdir(self.log_dir) 33 | 34 | def mkdir(self, hdfs_path): 35 | return os.mkdir(hdfs_path) 36 | 37 | def project_path(self, project=None, exclude_nn_addr=False): 38 | return "/dbfs/" 39 | 40 | def get_executors(self, sc): 41 | if ( 42 | sc._conf.get("spark.databricks.clusterUsageTags.clusterScalingType") 43 | == "autoscaling" 44 | ): 45 | maxExecutors = int( 46 | sc._conf.get( 47 | "spark.databricks.clusterUsageTags.clusterMaxWorkers", 48 | defaultValue="-1", 49 | ) 50 | ) 51 | if maxExecutors == -1: 52 | raise KeyError( 53 | 'Failed to find "spark.databricks.clusterUsageTags.clusterMaxWorkers" property, ' 54 | "but clusterScalingType is set to autoscaling." 55 | ) 56 | else: 57 | maxExecutors = int( 58 | sc._conf.get( 59 | "spark.databricks.clusterUsageTags.clusterWorkers", 60 | defaultValue="-1", 61 | ) 62 | ) 63 | if maxExecutors == -1: 64 | raise KeyError( 65 | 'Failed to find "spark.databricks.clusterUsageTags.clusterWorkers" property.' 66 | ) 67 | return maxExecutors 68 | 69 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 70 | server_addr = (server_addr[0], server_addr[1]) 71 | client_addr = ( 72 | server_addr[0], 73 | sock.getsockname()[1], 74 | ) 75 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 76 | 77 | def get_logdir(self, app_id, run_id): 78 | return self.log_dir 79 | -------------------------------------------------------------------------------- /maggy/core/environment/singleton.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | 20 | class EnvSing(object): 21 | 22 | obj = None 23 | 24 | def __new__(cls, *args, **kwargs): 25 | if EnvSing.obj is not None: 26 | raise Exception("A Test Singleton instance already exists") 27 | 28 | # check hopsworks availability 29 | if "REST_ENDPOINT" in os.environ: 30 | print("Detected Environment: Hopsworks.") 31 | 32 | from maggy.core.environment import hopsworks 33 | 34 | EnvSing.obj = hopsworks.HopsworksEnv() 35 | 36 | elif os.environ.get("DATABRICKS_ROOT_CONDA_ENV") == "databricks-ml": 37 | print("Detected Environment: Databricks.") 38 | 39 | from maggy.core.environment import databricks 40 | 41 | EnvSing.obj = databricks.DatabricksEnv() 42 | 43 | else: 44 | print("Detected Environment: base.") 45 | 46 | from maggy.core.environment import base 47 | 48 | EnvSing.obj = base.BaseEnv() 49 | 50 | if EnvSing.obj is None: 51 | raise NotImplementedError( 52 | "environment_instance is None, environment not initialised." 53 | ) 54 | 55 | @staticmethod 56 | def get_instance(): 57 | """ 58 | return an instance of the environment to be used by maggy within a session. 
59 |         """
60 |         if EnvSing.obj is None:
61 |             EnvSing()
62 |         return EnvSing.obj
63 | 
--------------------------------------------------------------------------------
/maggy/core/exceptions.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | """
18 | Maggy specific exceptions.
19 | """
20 | 
21 | 
22 | class EarlyStopException(Exception):
23 |     """Raised by the reporter when an early stop signal is received."""
24 | 
25 |     def __init__(self, metric):
26 |         super().__init__()
27 |         self.metric = metric
28 | 
29 | 
30 | class NotSupportedError(Exception):
31 |     """Raised when we are dealing with a situation that we do not (yet)
32 |     support, e.g., a specific dataset type.
33 |     """
34 | 
35 |     def __init__(self, category, value, suggestion=""):
36 |         self.message = "({0}: {1}) is not supported by Maggy.{2}".format(
37 |             category, value, suggestion
38 |         )
39 |         super().__init__(self.message)
40 | 
41 | 
42 | class ReturnTypeError(TypeError):
43 |     """User defined training function returns value of wrong type."""
44 | 
45 |     def __init__(self, optimization_key, return_type):
46 |         self.message = (
47 |             "Training function cannot return value of type: {}. "
48 |             "Return single numeric value or 'dict' containing optimization key"
49 |             " `{}` with numeric value".format(
50 |                 type(return_type).__name__, optimization_key
51 |             )
52 |         )
53 |         super().__init__(self.message)
54 | 
55 | 
56 | class MetricTypeError(TypeError):
57 |     """User defined training function returns metric of wrong type."""
58 | 
59 |     def __init__(self, optimization_key, return_type):
60 |         self.message = (
61 |             "The optimization metric `{}` returned by the training function is"
62 |             " of type: {}. The optimization metric can only be numeric".format(
63 |                 optimization_key, type(return_type).__name__
64 |             )
65 |         )
66 |         super().__init__(self.message)
67 | 
68 | 
69 | class BroadcastMetricTypeError(TypeError):
70 |     """User defined training function broadcasts metric of wrong type."""
71 | 
72 |     def __init__(self, return_type):
73 |         self.message = (
74 |             "The optimization metric broadcast by the training function with "
75 |             "the reporter is of type: {}. The optimization metric can only "
76 |             "be numeric".format(type(return_type).__name__)
77 |         )
78 |         super().__init__(self.message)
79 | 
80 | 
81 | class BroadcastStepTypeError(TypeError):
82 |     """User defined training function broadcasts metric with a non-numeric step
83 |     type.
84 |     """
85 | 
86 |     def __init__(self, value, step):
87 |         self.message = (
88 |             "The optimization metric `{}` was broadcast by the training "
89 |             "function in step {}, which is of type {}. The step value can "
90 |             "only be numeric.".format(value, step, type(step).__name__)
91 |         )
92 |         super().__init__(self.message)
93 | 
94 | 
95 | class BroadcastStepValueError(ValueError):
96 |     """User defined training function broadcasts metric with a
97 |     non-monotonically increasing step attribute.
98 |     """
99 | 
100 |     def __init__(self, value, step, prev_step):
101 |         self.message = (
102 |             "The optimization metric `{}` was broadcast by the training "
103 |             "function in step {}, while the previous step was {}. The steps "
104 |             "should be a monotonically increasing attribute.".format(
105 |                 value, step, prev_step
106 |             )
107 |         )
108 |         super().__init__(self.message)
109 | 
110 | 
111 | class BadArgumentsError(Exception):
112 |     """Raised when a function or method has been called with incompatible arguments.
113 |     This can be used by developers to prevent bad usage of their functions
114 |     or classes by other developers.
115 |     """
116 | 
117 |     def __init__(self, callable, suggestion=""):
118 |         self.message = "{0} was called using incompatible arguments. {1}".format(
119 |             callable, suggestion
120 |         )
121 |         super().__init__(self.message)
122 | 
--------------------------------------------------------------------------------
/maggy/core/executors/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
--------------------------------------------------------------------------------
/maggy/core/executors/base_executor.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from typing import Callable, Any
18 | 
19 | 
20 | def base_executor_fn(
21 |     train_fn: Callable,
22 | ) -> Callable:
23 |     """Wraps the user supplied training function in order to be passed to the Spark Executors.
24 | 
25 |     :param train_fn: Original training function.
26 | 
27 |     :returns: Patched function to execute on the Spark executors.
28 | 
29 |     """
30 | 
31 |     def wrapper_function(_: Any) -> Any:
32 |         """Patched function from the base_executor_fn factory.
33 | 
34 |         :param _: Necessary catch for the iterator given by Spark to the
35 |             function upon foreach calls. Can safely be disregarded.
36 | """ 37 | 38 | retval = train_fn() 39 | return retval 40 | 41 | return wrapper_function 42 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .optimization_driver import HyperparameterOptDriver 18 | from .ablation_driver import AblationDriver 19 | from .base_driver import BaseDriver 20 | 21 | 22 | __all__ = ["HyperparameterOptDriver", "AblationDriver", "BaseDriver"] 23 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/torch_distributed_training_driver.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from pickle import PicklingError 18 | from typing import Callable, Type, Any 19 | 20 | from maggy import util 21 | from maggy.core.environment.singleton import EnvSing 22 | from maggy.config import TorchDistributedConfig 23 | from maggy.core.rpc import DistributedTrainingServer 24 | from maggy.core.experiment_driver.spark_driver import Driver 25 | from maggy.core.executors.torch_dist_executor import torch_dist_executor_fn 26 | 27 | 28 | class TorchDistributedTrainingDriver(Driver): 29 | """Driver for distributed learning on a Spark cluster. 30 | 31 | Registers the workers on an RPC server, ensures proper configuration and 32 | logging, and accumulates final results. 33 | """ 34 | 35 | def __init__(self, config: TorchDistributedConfig, app_id: int, run_id: int): 36 | """Initializes the server for initial training setup communication and log collection. 37 | 38 | :param config: Experiment config. 39 | :param app_id: Maggy application ID. 40 | :param run_id: Maggy run ID. 41 | """ 42 | super().__init__(config, app_id, run_id) 43 | self.server = DistributedTrainingServer(self.num_executors, config.__class__) 44 | self.results = [] 45 | 46 | def _exp_startup_callback(self) -> None: 47 | """No special startup actions required.""" 48 | 49 | def _exp_final_callback(self, job_end: float, _: Any) -> dict: 50 | """Calculates the average test error from all partitions. 51 | 52 | :param job_end: Time of the job end. 
53 | :param _: Catches additional callback arguments. 54 | 55 | :returns: The result in a dictionary. 56 | """ 57 | result = {"test result": self.average_metric()} 58 | exp_ml_id = str(self.app_id) + "_" + str(self.run_id) 59 | EnvSing.get_instance().attach_experiment_xattr( 60 | exp_ml_id, 61 | {"state": "FINISHED", "duration": int(job_end - self.job_start) * 1000}, 62 | "FULL_UPDATE", 63 | ) 64 | print("Final average test loss: {:.3f}".format(self.average_metric())) 65 | print( 66 | "Finished experiment. Total run time: " 67 | + util.time_diff(self.job_start, job_end) 68 | ) 69 | return result 70 | 71 | def _exp_exception_callback(self, exc: Type[Exception]) -> None: 72 | """Catches pickling errors in case the input arguments (most likely 73 | the dataset) are too large to be pickled, or not compatible. 74 | 75 | :param exc: The exception to handle. 76 | 77 | :raises RuntimeError: Provides the user with additional information 78 | about avoiding pickle problems and includes the pickle error. 79 | """ 80 | if isinstance(exc, PicklingError): 81 | raise RuntimeError( 82 | """Pickling has failed. This is most likely caused by one of 83 | the following reasons: Your module class can't be pickled, or your 84 | dataset is too large. 85 | Consider passing a custom dataloader that reads from files in 86 | case of large datasets, and verify that your module is 87 | pickleable!""" 88 | ) 89 | raise exc 90 | 91 | def _patching_fn( 92 | self, train_fn: Callable, config: TorchDistributedConfig 93 | ) -> Callable: 94 | """Monkey patches the user training function with the distributed 95 | executor modifications for distributed training. 96 | 97 | :param train_fn: User provided training function. 98 | 99 | :returns: The monkey patched training function. 100 | """ 101 | return torch_dist_executor_fn( 102 | train_fn, 103 | config, 104 | self.app_id, 105 | self.run_id, 106 | self.server_addr, 107 | self.hb_interval, 108 | self._secret, 109 | self.log_dir, 110 | ) 111 | 112 | def _register_msg_callbacks(self) -> None: 113 | """Registers a metric message callback for heartbeat responses to spark 114 | magic and a final callback to process experiment results. 115 | """ 116 | self.message_callbacks["METRIC"] = self._log_msg_callback 117 | self.message_callbacks["FINAL"] = self._final_msg_callback 118 | 119 | def _log_msg_callback(self, msg: dict) -> None: 120 | """Callback for heartbeat messages with logs from the executors. 121 | 122 | :param msg: Message from the executors. Contains logs to be written to 123 | jupyter and the DFS. 124 | """ 125 | logs = msg.get("logs", None) 126 | if logs is not None: 127 | with self.log_lock: 128 | self.executor_logs = self.executor_logs + logs 129 | 130 | def _final_msg_callback(self, msg: dict) -> None: 131 | """Appends the test result from the workers to the result list. 132 | 133 | :param msg: Final message from the executors. 134 | """ 135 | self.results.append(msg.get("data", None)) 136 | 137 | def average_metric(self) -> float: 138 | """Calculates the current average over the valid results. 139 | 140 | :returns: The average result value. 
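
The averaging in `average_metric` (shown further below) reduces one test metric per executor to a single number and skips workers that sent nothing; the behaviour in isolation, with made-up values:

results = [0.31, None, 0.27, 0.35]  # one entry per executor; None = no result sent
valid_results = [x for x in results if x is not None]
average = sum(valid_results) / len(valid_results) if valid_results else 0
print(average)  # -> 0.31 (up to float rounding)
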
141 | """ 142 | valid_results = [x for x in self.results if x is not None] 143 | if len(valid_results) > 0: 144 | return sum(valid_results) / len(valid_results) 145 | else: 146 | return 0 147 | -------------------------------------------------------------------------------- /maggy/core/patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import torch 18 | 19 | from .dataloader import MaggyDataLoader, MaggyPetastormDataLoader 20 | from .modules import ( 21 | get_maggy_ddp_wrapper, 22 | get_maggy_fairscale_wrapper, 23 | get_maggy_deepspeed_wrapper, 24 | ) 25 | 26 | __all__ = [ 27 | "get_maggy_ddp_wrapper", 28 | "get_maggy_fairscale_wrapper", 29 | "get_maggy_deepspeed_wrapper", 30 | "MaggyDataLoader", 31 | "MaggyPetastormDataLoader", 32 | ] 33 | 34 | # Check torch version, only import ZeroRedundancyOptimizer if >= 1.8 35 | _torch_version = torch.__version__.split(".") 36 | if int(_torch_version[0]) > 1 or int(_torch_version[1]) >= 8: 37 | from .optim import ( 38 | MaggyZeroAdadelta, 39 | MaggyZeroAdagrad, 40 | MaggyZeroAdam, 41 | MaggyZeroAdamW, 42 | MaggyZeroSparseAdam, 43 | MaggyZeroAdamax, 44 | MaggyZeroASGD, 45 | MaggyZeroLBFGS, 46 | MaggyZeroRMSprop, 47 | MaggyZeroRprop, 48 | MaggyZeroSGD, 49 | ) 50 | 51 | __all__ += [ 52 | "MaggyZeroAdadelta", 53 | "MaggyZeroAdagrad", 54 | "MaggyZeroAdam", 55 | "MaggyZeroAdamW", 56 | "MaggyZeroSparseAdam", 57 | "MaggyZeroAdamax", 58 | "MaggyZeroASGD", 59 | "MaggyZeroLBFGS", 60 | "MaggyZeroRMSprop", 61 | "MaggyZeroRprop", 62 | "MaggyZeroSGD", 63 | ] 64 | -------------------------------------------------------------------------------- /maggy/core/patching/dataloader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from __future__ import annotations 18 | 19 | import os 20 | from typing import Type, Union, Optional, Any, Callable 21 | import collections 22 | 23 | import torch 24 | from torch.utils.data import Dataset, Sampler 25 | from torch.utils.data import DataLoader as TorchDataLoader 26 | from petastorm.reader import make_reader, make_batch_reader 27 | from petastorm.pytorch import DataLoader as PetastormDataLoader 28 | from petastorm.transform import TransformSpec 29 | 30 | from maggy.core.environment.singleton import EnvSing 31 | 32 | 33 | class MaggyDataLoader(TorchDataLoader): 34 | """Monkey patching class for PyTorch's DataLoader. 35 | 36 | Patches the DataLoader to include a distributed sampler. Uses environment 37 | variables for info such as the world size for the DataLoader. These can 38 | be assumed to be present since Maggy's distributed experiment sets them prior 39 | to running the training. 40 | Automatically moves training data to the GPU since distributed training 41 | requires execution on GPUs. 42 | """ 43 | 44 | def __init__( 45 | self, 46 | dataset: Union[Type[Dataset], str], 47 | batch_size: int = 1, 48 | shuffle: Any = False, 49 | sampler: Optional[Sampler[int]] = None, 50 | batch_sampler: Optional[Any] = None, 51 | num_workers: int = 0, 52 | collate_fn: Optional[Callable] = None, 53 | pin_memory: bool = False, 54 | drop_last: bool = False, 55 | timeout: float = 0, 56 | worker_init_fn: Optional[Callable] = None, 57 | **_: Any, 58 | ): 59 | """Initializes a torch DataLoader. 60 | 61 | :param dataset: A PyTorch Dataset. 62 | :param batch_size: How many samples per batch to load (default: ``1``). 63 | :param shuffle: Discarded, not compatible with Maggy. 64 | :param sampler: Discarded, gets replaced by a DistributedSampler. 65 | :param batch_sampler: Discarded, not compatible with Maggy. 66 | :param num_workers: Discarded, currently crashes Spark if set >0. 67 | :param collate_fn: Merges a list of samples to a minibatch of tensors. 68 | :param pin_memory: Automatically transfer tensors to GPU. 69 | :param drop_last: Drop last incomplete batch. 70 | :param timeout: Timeout for collecting a batch. 71 | :param worker_init_fn: Executed on each worker with worker ID. 72 | :param _: Argument catch to stay compatible with PyTorch. 73 | """ 74 | sampler = torch.utils.data.distributed.DistributedSampler(dataset=dataset) 75 | super().__init__( 76 | dataset, 77 | batch_size, 78 | shuffle=False, 79 | sampler=sampler, 80 | batch_sampler=None, 81 | num_workers=0, # Multiprocessing workers do not work at the moment. 82 | collate_fn=collate_fn, 83 | pin_memory=pin_memory, 84 | drop_last=drop_last, 85 | timeout=timeout, 86 | worker_init_fn=worker_init_fn, 87 | ) 88 | self.iterator = None 89 | 90 | def __iter__(self) -> MaggyDataLoader: 91 | # Reload the dataset when a new iterator is requested. 92 | self.iterator = TorchDataLoader.__iter__(self) 93 | return self 94 | 95 | def __next__(self) -> Union[torch.Tensor, list, dict]: 96 | data = self.iterator.__next__() 97 | return _to_cuda(data) 98 | 99 | 100 | class MaggyPetastormDataLoader(PetastormDataLoader): 101 | """Maggy implementation of a Petastorm parquet DataLoader. 102 | 103 | Arguments such as world size, reader and rank are automated to make 104 | PetastormDataLoader as similar to PyTorch's DataLoader as possible. 105 | """ 106 | 107 | def __init__( 108 | self, dataset: str, batch_size: int = 1, transform_spec: TransformSpec = None 109 | ): 110 | """Initializes a reader depending on the dataset (Petastorm/Parquet).
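
Because the patched loader keeps the signature of PyTorch's DataLoader, existing training code works unchanged; a minimal usage sketch (the dataset name and batch size are illustrative, and the distributed process group is assumed to be initialized already, which Maggy's executor setup handles):

from maggy.core.patching import MaggyDataLoader

# Inside a Maggy-distributed training function:
loader = MaggyDataLoader(train_dataset, batch_size=64)  # train_dataset: a map-style Dataset
for batch in loader:
    ...  # batches arrive on the GPU already, moved by _to_cuda()
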
111 | 112 | :param dataset: Path to the dataset. 113 | :param batch_size: How many samples per batch to load (default: ``1``). 114 | :param transform_spec: Petastorm transform spec for data augmentation. 115 | """ 116 | num_workers = int(os.environ["WORLD_SIZE"]) # Is set at lagom startup. 117 | rank = int(os.environ["RANK"]) 118 | is_peta_ds = EnvSing.get_instance().exists( 119 | dataset.rstrip("/") + "/_common_metadata" 120 | ) 121 | # Make reader only compatible with petastorm dataset. 122 | ds_type = "Petastorm" if is_peta_ds else "Parquet" 123 | print(f"{ds_type} dataset detected in folder {dataset}") 124 | reader_factory = make_reader if is_peta_ds else make_batch_reader 125 | reader = reader_factory( 126 | dataset, 127 | cur_shard=rank, 128 | shard_count=num_workers, 129 | transform_spec=TransformSpec(transform_spec), 130 | ) 131 | super().__init__(reader, batch_size=batch_size) 132 | self.iterator = None 133 | 134 | def __iter__(self) -> MaggyPetastormDataLoader: 135 | # Reload the dataset when new iterator requested. 136 | self.iterator = PetastormDataLoader.__iter__(self) 137 | return self 138 | 139 | def __next__(self) -> Union[torch.Tensor, list, dict]: 140 | data = self.iterator.__next__() 141 | return _to_cuda(data) 142 | 143 | def __len__(self): 144 | raise NotImplementedError("Petastorm dataloader does not support __len__.") 145 | 146 | 147 | def _to_cuda(data: Union[torch.Tensor, list, dict]) -> Union[torch.Tensor, list, dict]: 148 | """Recurses into data, transfers tensors to GPU. 149 | 150 | :param data: The data structure to be transferred. 151 | 152 | :raises TypeError: In case of unsupported data structures. 153 | 154 | :returns: The transfered data structure. 155 | """ 156 | if isinstance(data, collections.abc.Mapping): 157 | return {key: _to_cuda(val) for key, val in data.items()} 158 | if isinstance(data, (list, tuple)): 159 | data_list = [_to_cuda(el) for el in data] 160 | return data_list if isinstance(data, list) else tuple(data_list) 161 | if isinstance(data, torch.Tensor): 162 | return data.cuda() 163 | raise TypeError(f"Type {type(data)} currently not supported!") 164 | -------------------------------------------------------------------------------- /maggy/core/patching/modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from types import SimpleNamespace 20 | from typing import Type, Any 21 | 22 | from torch.nn import Module as TorchModule 23 | from torch.nn.parallel import DistributedDataParallel as TorchDistributedDataParallel 24 | 25 | try: 26 | from deepspeed.pipe import PipelineModule 27 | from deepspeed.runtime.engine import DeepSpeedEngine 28 | from fairscale.nn import ( 29 | FullyShardedDataParallel as FairscaleFullyShardedDataParallel, 30 | ) 31 | except ImportError: 32 | print( 33 | """Warning: deepspeed and/or fairscale import failed. DeepSpeed backend and zero_lvl 3 34 | won't be available""" 35 | ) 36 | 37 | 38 | def get_maggy_ddp_wrapper(module: Type[TorchModule]): 39 | """Factory function for MaggyDDPModuleWrapper. 40 | 41 | :param module: PyTorch module passed by the user. 42 | """ 43 | 44 | class MaggyDDPModuleWrapper(TorchDistributedDataParallel): 45 | """Wrapper around PyTorch's DDP Module. 46 | 47 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 48 | we cannot add the module as an additional parameter during initialization. Instead, it is 49 | configured by its factory function. 50 | """ 51 | 52 | __module = module # Avoid overwriting torch module 53 | 54 | def __init__(self, *args: Any, **kwargs: Any): 55 | """Initializes the previously set module, moves it to the GPU and initializes a DDP 56 | module with it. 57 | 58 | :param args: Arguments passed by the user for module initialization. 59 | :param kwargs: Keyword arguments passed by the user for module initialization. 60 | """ 61 | # Avoid self because bound method adds to args which makes the function call fail 62 | model = MaggyDDPModuleWrapper.__module(*args, **kwargs).cuda() 63 | super().__init__(model) 64 | 65 | return MaggyDDPModuleWrapper 66 | 67 | 68 | def get_maggy_fairscale_wrapper(module: TorchModule, mixed_precision: bool): 69 | """Factory function for MaggyFairScaleModuleWrapper. 70 | 71 | :param module: PyTorch module passed by the user. 72 | :param mixed_precision: Switches on mixed precision for the FairscaleModule. 73 | """ 74 | 75 | class MaggyFairScaleModuleWrapper(FairscaleFullyShardedDataParallel): 76 | """Wrapper around Fairscale's FullyShardedDataParallel Module. 77 | 78 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 79 | we cannot add the module as an additional parameter during initialization. Instead, it is 80 | configured by its factory function. 81 | """ 82 | 83 | __module = module 84 | __mixed_precision = mixed_precision 85 | 86 | def __init__(self, *args: Any, **kwargs: Any): 87 | """Initializes the previously set module, moves it to the GPU and initializes a 88 | Fairscale FullyShardedDataParallel module with it. 89 | 90 | :param args: Arguments passed by the user for module initialization. 91 | :param kwargs: Keyword arguments passed by the user for module initialization. 92 | """ 93 | # Avoid self because bound method adds to args which makes the function call fail 94 | model = MaggyFairScaleModuleWrapper.__module(*args, **kwargs).cuda() 95 | super().__init__(model, mixed_precision=self.__mixed_precision) 96 | 97 | return MaggyFairScaleModuleWrapper 98 | 99 | 100 | def get_maggy_deepspeed_wrapper(module: TorchModule, config_params: dict): 101 | """Factory function for MaggyDeepSpeedModuleWrapper. 102 | 103 | :param module: PyTorch module passed by the user. 104 | :param mixed_precision: DeepSpeed config dict passed by the user. 
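
All three factories share one pattern: stash the user's class on a class attribute so the wrapper can still be constructed with the user's original signature. A toy sketch of that pattern in isolation (nn.Linear stands in for a user module; this is not Maggy code):

import torch.nn as nn


def get_wrapped(module_cls):
    class Wrapper(nn.Module):
        __module = module_cls  # stored on the class so __init__ keeps the user's signature

        def __init__(self, *args, **kwargs):
            super().__init__()
            # Access via the class, mirroring how Maggy's wrappers do it.
            self.inner = Wrapper.__module(*args, **kwargs)

        def forward(self, x):
            return self.inner(x)

    return Wrapper


WrappedLinear = get_wrapped(nn.Linear)
layer = WrappedLinear(10, 2)  # constructed exactly like nn.Linear(10, 2)
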
105 | """ 106 | assert ( 107 | module != PipelineModule 108 | ), """Maggy currently doesn't support pipeline 109 | modules with DeepSpeed ZeRO.""" 110 | 111 | class MaggyDeepSpeedModuleWrapper(DeepSpeedEngine): 112 | """Wrapper around DeepSpeed's DeepSpeedEngine. 113 | 114 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 115 | we cannot add the module as an additional parameter during initialization. Instead, it is 116 | configured by its factory function. 117 | """ 118 | 119 | __module = module 120 | __config_params = config_params 121 | 122 | def __init__(self, *args, **kwargs): 123 | """Initializes the previously set module and initializes a DeepSpeedEngine with it. 124 | 125 | :param args: Arguments passed by the user for module initialization. 126 | :param kwargs: Keyword arguments passed by the user for module initialization. 127 | """ 128 | # Avoid self because bound method adds to args which makes the function call fail. 129 | # No .cuda() calls for DeepSpeed necessary. 130 | model = MaggyDeepSpeedModuleWrapper.__module(*args, **kwargs) 131 | ds_args = SimpleNamespace(local_rank=0) 132 | super().__init__( 133 | ds_args, 134 | model, 135 | model_parameters=model.parameters(), 136 | config_params=self.__config_params, 137 | ) 138 | 139 | return MaggyDeepSpeedModuleWrapper 140 | -------------------------------------------------------------------------------- /maggy/core/patching/optim.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | import inspect 20 | from typing import Any 21 | from abc import ABC, abstractclassmethod 22 | 23 | 24 | import torch.optim as optim 25 | from torch.distributed.optim import ZeroRedundancyOptimizer 26 | 27 | 28 | class MaggyZeroOptimizer(ZeroRedundancyOptimizer, ABC): 29 | """Abstract base class for Maggy's optimizer patching classes.""" 30 | 31 | def __init__(self, *args: Any, **kwargs: Any): 32 | """Initializes a ZeroRedundancyOptimizer with the defined optim_cls as optimizer class. 33 | 34 | Passes any arguments for initialization of the default optimizer to the Zero optimizer. 35 | :param args: Optimizer args. Get reassigned into kwargs. 36 | :param kwargs: Optimizer kwargs. 
37 | """ 38 | # Move args to kwargs to pass args into kwargs only ZeroRedundancyOptimizer 39 | arg_spec = inspect.getfullargspec(self.optim_cls.__init__) 40 | for idx, arg in enumerate(args): 41 | kwargs[arg_spec.args[idx + 1]] = arg # +1 to skip self in arg_spec 42 | params = kwargs.pop("params", None) 43 | super().__init__( 44 | params, self.optim_cls, group=None, bucket_cap_kb=2**24, **kwargs 45 | ) 46 | 47 | @property 48 | @abstractclassmethod 49 | def optim_cls(cls: optim.Optimizer) -> MaggyZeroOptimizer: 50 | """Optimizer class property needs to be defined by each implementation of the base class.""" 51 | raise NotImplementedError 52 | 53 | 54 | class MaggyZeroAdadelta(MaggyZeroOptimizer): 55 | """Maggy's Zero wrapper around torch's Adadelta optimizer.""" 56 | 57 | optim_cls = optim.Adadelta 58 | 59 | 60 | class MaggyZeroAdagrad(MaggyZeroOptimizer): 61 | """Maggy's Zero wrapper around torch's Adagrad optimizer.""" 62 | 63 | optim_cls = optim.Adagrad 64 | 65 | 66 | class MaggyZeroAdam(MaggyZeroOptimizer): 67 | """Maggy's Zero wrapper around torch's Adam optimizer.""" 68 | 69 | optim_cls = optim.Adam 70 | 71 | 72 | class MaggyZeroAdamW(MaggyZeroOptimizer): 73 | """Maggy's Zero wrapper around torch's AdamW optimizer.""" 74 | 75 | optim_cls = optim.AdamW 76 | 77 | 78 | class MaggyZeroSparseAdam(MaggyZeroOptimizer): 79 | """Maggy's Zero wrapper around torch's SparseAdam optimizer.""" 80 | 81 | optim_cls = optim.SparseAdam 82 | 83 | 84 | class MaggyZeroAdamax(MaggyZeroOptimizer): 85 | """Maggy's Zero wrapper around torch's Adamax optimizer.""" 86 | 87 | optim_cls = optim.Adamax 88 | 89 | 90 | class MaggyZeroASGD(MaggyZeroOptimizer): 91 | """Maggy's Zero wrapper around torch's ASGD optimizer.""" 92 | 93 | optim_cls = optim.ASGD 94 | 95 | 96 | class MaggyZeroLBFGS(MaggyZeroOptimizer): 97 | """Maggy's Zero wrapper around torch's LBFGS optimizer.""" 98 | 99 | optim_cls = optim.LBFGS 100 | 101 | 102 | class MaggyZeroRMSprop(MaggyZeroOptimizer): 103 | """Maggy's Zero wrapper around torch's RMSprop optimizer.""" 104 | 105 | optim_cls = optim.RMSprop 106 | 107 | 108 | class MaggyZeroRprop(MaggyZeroOptimizer): 109 | """Maggy's Zero wrapper around torch's Rprop optimizer.""" 110 | 111 | optim_cls = optim.Rprop 112 | 113 | 114 | class MaggyZeroSGD(MaggyZeroOptimizer): 115 | """Maggy's Zero wrapper around torch's SGD optimizer.""" 116 | 117 | optim_cls = optim.SGD 118 | -------------------------------------------------------------------------------- /maggy/core/reporter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | API Module for the user to include in his training code. 
19 | 20 | """ 21 | import threading 22 | from datetime import datetime 23 | 24 | from maggy import constants 25 | from maggy.core import exceptions 26 | 27 | from maggy.core.environment.singleton import EnvSing 28 | 29 | 30 | class Reporter(object): 31 | """ 32 | Thread-safe store for sending a metric and logs from executor to driver 33 | """ 34 | 35 | def __init__(self, log_file, partition_id, task_attempt, print_executor): 36 | self.metric = None 37 | self.step = -1 38 | self.lock = threading.RLock() 39 | self.stop = False 40 | self.trial_id = None 41 | self.trial_log_file = None 42 | self.logs = "" 43 | self.log_file = log_file 44 | self.partition_id = partition_id 45 | self.task_attempt = task_attempt 46 | self.print_executor = print_executor 47 | 48 | # Open executor log file descriptor 49 | # This log is for all maggy system related log messages 50 | env = EnvSing.get_instance() 51 | if not env.exists(log_file): 52 | env.dump("", log_file) 53 | self.fd = env.open_file(log_file, flags="w") 54 | self.trial_fd = None 55 | 56 | def init_logger(self, trial_log_file): 57 | """Initializes the trial log file""" 58 | self.trial_log_file = trial_log_file 59 | env = EnvSing.get_instance() 60 | # Open trial log file descriptor 61 | if not env.exists(self.trial_log_file): 62 | env.dump("", self.trial_log_file) 63 | self.trial_fd = env.open_file(self.trial_log_file, flags="w") 64 | 65 | def close_logger(self): 66 | """Savely closes the file descriptors of the log files. 67 | 68 | close() can be called multiple times and flushes the buffer contents 69 | before closing 70 | """ 71 | with self.lock: 72 | if self.trial_fd: 73 | self.trial_fd.close() 74 | self.fd.close() 75 | 76 | # report 77 | def broadcast(self, metric, step=None): 78 | """Broadcast a metric to the experiment driver with the heartbeat. 79 | 80 | :param metric: Metric to be broadcasted 81 | :type metric: int, float 82 | :param step: The iteration step which produced the metric, e.g. batch or 83 | epoch number, or any other monotonically increasing progress attribute 84 | :type step: int 85 | :raises exception: EarlyStopException if told by the experiment driver 86 | """ 87 | with self.lock: 88 | # if stop == True -> raise exception to break training function 89 | if step is None: 90 | step = self.step + 1 91 | if not isinstance(metric, constants.USER_FCT.NUMERIC_TYPES): 92 | raise exceptions.BroadcastMetricTypeError(metric) 93 | elif not isinstance(step, constants.USER_FCT.NUMERIC_TYPES): 94 | raise exceptions.BroadcastStepTypeError(metric, step) 95 | elif step < self.step: 96 | raise exceptions.BroadcastStepValueError(metric, step, self.step) 97 | else: 98 | self.step = step 99 | self.metric = metric 100 | if self.stop: 101 | raise exceptions.EarlyStopException(metric) 102 | 103 | def log(self, log_msg, jupyter=False): 104 | """Logs a message to the executor logfile and executor stderr and 105 | optionally prints the message in jupyter. 106 | 107 | :param log_msg: Message to log. 
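
Seen from a training function, the reporter is used roughly as follows; a hedged sketch in which the metric computation is a placeholder and the reporter argument is injected by Maggy:

def train_fn(reporter):
    for epoch in range(10):
        val_acc = 0.5 + 0.01 * epoch  # placeholder for a real validation metric
        # Steps must be numeric and must not decrease; broadcast() raises
        # EarlyStopException once the driver flags this trial for stopping.
        reporter.broadcast(val_acc, step=epoch)
        reporter.log("epoch {} done".format(epoch), jupyter=True)
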
108 | :type log_msg: str 109 | :param verbose: Print in Jupyter Notebook, defaults to True 110 | :type verbose: bool, optional 111 | """ 112 | with self.lock: 113 | env = EnvSing.get_instance() 114 | try: 115 | msg = (datetime.now().isoformat() + " ({0}/{1}): {2} \n").format( 116 | self.partition_id, self.task_attempt, log_msg 117 | ) 118 | if jupyter: 119 | jupyter_log = str(self.partition_id) + ": " + log_msg 120 | if self.trial_fd: 121 | self.trial_fd.write(env.str_or_byte(msg)) 122 | self.logs = self.logs + jupyter_log + "\n" 123 | else: 124 | self.fd.write(env.str_or_byte(msg)) 125 | if self.trial_fd: 126 | self.trial_fd.write(env.str_or_byte(msg)) 127 | self.print_executor(msg) 128 | # Throws ValueError when operating on closed HDFS file object 129 | # Throws AttributeError when calling file ops on NoneType object 130 | except (IOError, ValueError, AttributeError) as e: 131 | self.fd.write( 132 | env.str_or_byte( 133 | "An error occurred while writing logs: {}".format(e) 134 | ) 135 | ) 136 | 137 | def get_data(self): 138 | """Returns the metric and logs to be sent to the experiment driver.""" 139 | with self.lock: 140 | log_to_send = self.logs 141 | self.logs = "" 142 | return self.metric, self.step, log_to_send 143 | 144 | def reset(self): 145 | """ 146 | Resets the reporter to the initial state in order to start a new 147 | trial. 148 | """ 149 | with self.lock: 150 | self.metric = None 151 | self.step = -1 152 | self.stop = False 153 | self.trial_id = None 154 | self.fd.flush() 155 | self.trial_fd.close() 156 | self.trial_fd = None 157 | self.trial_log_file = None 158 | 159 | def early_stop(self): 160 | with self.lock: 161 | if self.metric is not None: 162 | self.stop = True 163 | 164 | def get_trial_id(self): 165 | with self.lock: 166 | return self.trial_id 167 | 168 | def set_trial_id(self, trial_id): 169 | with self.lock: 170 | self.trial_id = trial_id 171 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/tf_modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | 18 | def get_wrapped_model(model, strategy, is_chief): 19 | """Build a wrapper class for the user-defined tensorflow model. 20 | 21 | :param model: The class of the user-defined tensorflow model. 22 | :param strategy: A class of the strategy to be used for the training. 23 | 24 | :returns: The TensorflowModelWrapper class. 25 | """ 26 | 27 | class TensorflowModelWrapper(model): 28 | """A wrapper for the tensorflow model; __init__() is overridden so that the model is 29 | built within the distribution strategy's scope for distributed training. 30 | """ 31 | 32 | def __init__(self, *args, **kwargs): 33 | self.__strategy = strategy 34 | self.is_chief = is_chief 35 | with self.__strategy.scope(): 36 | try: 37 | super().__init__(*args, **kwargs) 38 | except TypeError as e: 39 | raise TypeError( 40 | "The parameters passed to TensorflowConfig (model_parameters) " 41 | "do not correspond to the parameters defined in your model " 42 | "constructor." 43 | ) from e 44 | 45 | return TensorflowModelWrapper 46 | -------------------------------------------------------------------------------- /maggy/earlystop/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.earlystop import abstractearlystop, medianrule, nostop 18 | 19 | AbstractEarlyStop = abstractearlystop.AbstractEarlyStop 20 | MedianStoppingRule = medianrule.MedianStoppingRule 21 | NoStoppingRule = nostop.NoStoppingRule 22 | 23 | __all__ = ["AbstractEarlyStop", "MedianStoppingRule", "NoStoppingRule"] 24 | -------------------------------------------------------------------------------- /maggy/earlystop/abstractearlystop.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | 19 | 20 | class AbstractEarlyStop(ABC): 21 | """An abstract class to implement custom early stopping criteria.""" 22 | 23 | @staticmethod 24 | @abstractmethod 25 | def earlystop_check(to_check, finalized_trials, direction): 26 | """An abstract static method that needs to be implemented with a custom 27 | early stopping criterion.
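
As a concrete counterpart to this abstract method, a minimal custom rule might look like the sketch below; the threshold is illustrative, and `to_check` is treated as a single running trial, mirroring how MedianStoppingRule (further below) uses it:

from maggy.earlystop import AbstractEarlyStop


class ThresholdStoppingRule(AbstractEarlyStop):
    """Stops a trial whose latest metric falls below a fixed floor."""

    @staticmethod
    def earlystop_check(to_check, finalized_trials, direction):
        # Illustrative assumption: 0.1 is a sensible floor for the metric.
        if direction == "max" and to_check.metric_history:
            if to_check.metric_history[-1] < 0.1:
                return to_check.trial_id
        return None
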
28 | 29 | The function is called internally in the user specified interval 30 | with three arguments. It is necessary to add these to the function 31 | definition. 32 | 33 | :param to_check: A dictionary of currently running 34 | trials, where the key is the `trial_id` and values are Trial objects. 35 | :type to_check: dictionary 36 | :param finalized_trials: A list of finalized Trial objects. 37 | :type finalized_trials: list 38 | :param direction: A string describing the search objective, i.e. 'min' 39 | or 'max'. 40 | :type direction: str 41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /maggy/earlystop/medianrule.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import statistics 18 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 19 | 20 | 21 | class MedianStoppingRule(AbstractEarlyStop): 22 | """The Median Stopping Rule implements the simple strategy of stopping a 23 | trial if its performance falls below the median of other trials at similar 24 | points in time. 25 | """ 26 | 27 | @staticmethod 28 | def earlystop_check(to_check, finalized_trials, direction): 29 | 30 | results = [] 31 | median = None 32 | 33 | # count step from zero so it can be used as index for array 34 | step = len(to_check.metric_history) 35 | 36 | if step > 0: 37 | 38 | for fin_trial in finalized_trials: 39 | 40 | if len(fin_trial.metric_history) >= step: 41 | avg = sum(fin_trial.metric_history[:step]) / float(step) 42 | results.append(avg) 43 | 44 | try: 45 | median = statistics.median(results) 46 | except statistics.StatisticsError as e: 47 | raise Exception( 48 | "Warning: StatisticsError when calling early stop method\n{}".format( 49 | e 50 | ) 51 | ) 52 | 53 | if median is not None: 54 | if direction == "max": 55 | if max(to_check.metric_history) < median: 56 | return to_check.trial_id 57 | elif direction == "min": 58 | if min(to_check.metric_history) > median: 59 | return to_check.trial_id 60 | return None 61 | -------------------------------------------------------------------------------- /maggy/earlystop/nostop.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 18 | 19 | 20 | class NoStoppingRule(AbstractEarlyStop): 21 | """The no stopping rule never stops any trials early.""" 22 | 23 | @staticmethod 24 | def earlystop_check(to_check, finalized_trials, direction): 25 | return None 26 | -------------------------------------------------------------------------------- /maggy/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/experiment/experiment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from typing import Callable 18 | from maggy.config import LagomConfig, BaseConfig 19 | 20 | 21 | def lagom(train_fn: Callable, config: LagomConfig = None) -> dict: 22 | 23 | """Entry point for Maggy experiment, this function passes the parameters to the lagom function 24 | depending whether the kernel is pyspark or python. 25 | **lagom** is a Swedish word meaning "just the right amount". 26 | 27 | :param train_fn: User defined experiment containing the model training. 28 | :param config: An experiment configuration. For more information, see config. 29 | 30 | :returns: The experiment results as a dict. 31 | """ 32 | from maggy.experiment import experiment_python 33 | from maggy.experiment import experiment_pyspark 34 | from maggy.core import config as maggyconfig 35 | 36 | if config is None: 37 | config = BaseConfig( 38 | name="maggy_experiment", 39 | description="experiment without config object", 40 | hb_interval=1, 41 | ) 42 | if maggyconfig.is_spark_available(): 43 | return experiment_pyspark.lagom(train_fn, config) 44 | else: 45 | return experiment_python.lagom(train_fn, config) 46 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_pyspark.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import time 28 | from functools import singledispatch 29 | from typing import Callable 30 | 31 | from maggy import util 32 | from maggy.core.environment.singleton import EnvSing 33 | from maggy.config import * 34 | from maggy.core.experiment_driver import HyperparameterOptDriver, AblationDriver 35 | 36 | 37 | APP_ID = None 38 | RUNNING = False 39 | RUN_ID = 1 40 | EXPERIMENT_JSON = {} 41 | 42 | 43 | def lagom(train_fn: Callable, config: LagomConfig) -> dict: 44 | """Launches a maggy experiment, which depending on 'config' can either 45 | be a hyperparameter optimization, an ablation study experiment or distributed 46 | training. Given a search space, objective and a model training procedure `train_fn` 47 | (black-box function), an experiment is the whole process of finding the 48 | best hyperparameter combination in the search space, optimizing the 49 | black-box function. Currently maggy supports random search and a median 50 | stopping rule. 51 | **lagom** is a Swedish word meaning "just the right amount". 52 | 53 | :param train_fn: User defined experiment containing the model training. 54 | :param config: An experiment configuration. For more information, see config. 55 | 56 | :returns: The experiment results as a dict. 57 | """ 58 | global APP_ID 59 | global RUNNING 60 | global RUN_ID 61 | job_start = time.time() 62 | try: 63 | if RUNNING: 64 | raise RuntimeError("An experiment is currently running.") 65 | RUNNING = True 66 | spark_context = util.find_spark().sparkContext 67 | APP_ID = str(spark_context.applicationId) 68 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 69 | EnvSing.get_instance().set_app_id(APP_ID) 70 | driver = lagom_driver(config, APP_ID, RUN_ID) 71 | return driver.run_experiment(train_fn, config) 72 | except: # noqa: E722 73 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 74 | raise 75 | finally: 76 | # Clean up spark jobs 77 | RUN_ID += 1 78 | RUNNING = False 79 | util.find_spark().sparkContext.setJobGroup("", "") 80 | 81 | 82 | @singledispatch 83 | def lagom_driver(config, app_id: int, run_id: int) -> None: 84 | """Dispatcher function for the experiment driver. 85 | 86 | Initializes the appropriate driver according to the config. 87 | 88 | :raises TypeError: Only gets called if no fitting config was found and 89 | raises an error. 90 | """ 91 | raise TypeError( 92 | "Invalid config type! 
LagomConfig is expected to be of type {}, {}, {} or {}, \ 93 | but is of type {}".format( 94 | HyperparameterOptConfig, 95 | AblationConfig, 96 | TorchDistributedConfig, 97 | TfDistributedConfig, 98 | type(config), 99 | ) 100 | ) 101 | 102 | 103 | @lagom_driver.register(HyperparameterOptConfig) 104 | def _( 105 | config: HyperparameterOptConfig, app_id: int, run_id: int 106 | ) -> HyperparameterOptDriver: 107 | return HyperparameterOptDriver(config, app_id, run_id) 108 | 109 | 110 | @lagom_driver.register(AblationConfig) 111 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 112 | return AblationDriver(config, app_id, run_id) 113 | 114 | 115 | @lagom_driver.register(TorchDistributedConfig) 116 | # Lazy import of TorchDistributedTrainingDriver to avoid Torch import until necessary 117 | def _( 118 | config: TorchDistributedConfig, app_id: int, run_id: int 119 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 120 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 121 | TorchDistributedTrainingDriver, 122 | ) 123 | 124 | return TorchDistributedTrainingDriver(config, app_id, run_id) 125 | 126 | 127 | @lagom_driver.register(TfDistributedConfig) 128 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 129 | def _( 130 | config: TfDistributedConfig, app_id: int, run_id: int 131 | ) -> "TfDistributedTrainingDriver": # noqa: F821 132 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 133 | TfDistributedTrainingDriver, 134 | ) 135 | 136 | return TfDistributedTrainingDriver(config, app_id, run_id) 137 | 138 | 139 | @lagom_driver.register(LagomConfig) 140 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 141 | def _(config: LagomConfig, app_id: int, run_id: int) -> "BaseDriver": # noqa: F821 142 | from maggy.core.experiment_driver.base_driver import ( 143 | BaseDriver, 144 | ) 145 | 146 | return BaseDriver(config, app_id, run_id) 147 | 148 | 149 | def _exception_handler(duration: int) -> None: 150 | """Handles exceptions during execution of an experiment. 151 | 152 | :param duration: Duration of the experiment until exception in milliseconds 153 | """ 154 | try: 155 | global RUNNING 156 | global EXPERIMENT_JSON 157 | if RUNNING: 158 | EXPERIMENT_JSON["state"] = "FAILED" 159 | EXPERIMENT_JSON["duration"] = duration 160 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 161 | EnvSing.get_instance().attach_experiment_xattr( 162 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 163 | ) 164 | except Exception as err: 165 | util.log(err) 166 | 167 | 168 | def _exit_handler() -> None: 169 | """Handles jobs killed by the user.""" 170 | try: 171 | global RUNNING 172 | global EXPERIMENT_JSON 173 | if RUNNING: 174 | EXPERIMENT_JSON["status"] = "KILLED" 175 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 176 | EnvSing.get_instance().attach_experiment_xattr( 177 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 178 | ) 179 | except Exception as err: 180 | util.log(err) 181 | 182 | 183 | atexit.register(_exit_handler) 184 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_python.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import calendar 28 | import time 29 | from functools import singledispatch 30 | from typing import Callable 31 | 32 | from maggy import util 33 | from maggy.core.environment.singleton import EnvSing 34 | from maggy.config import * 35 | from maggy.core.experiment_driver import ( 36 | HyperparameterOptDriver, 37 | AblationDriver, 38 | BaseDriver, 39 | ) 40 | 41 | 42 | APP_ID = None 43 | RUNNING = False 44 | RUN_ID = 1 45 | EXPERIMENT_JSON = {} 46 | 47 | 48 | def lagom(train_fn: Callable, config) -> dict: 49 | """Launches a maggy experiment, which depending on 'config' can either 50 | be a hyperparameter optimization, an ablation study experiment or distributed 51 | training. Given a search space, objective and a model training procedure `train_fn` 52 | (black-box function), an experiment is the whole process of finding the 53 | best hyperparameter combination in the search space, optimizing the 54 | black-box function. Currently maggy supports random search and a median 55 | stopping rule. 56 | **lagom** is a Swedish word meaning "just the right amount". 57 | 58 | :param train_fn: User defined experiment containing the model training. 59 | :param config: An experiment configuration. For more information, see config. 60 | 61 | :returns: The experiment results as a dict. 62 | """ 63 | global APP_ID 64 | global RUNNING 65 | global RUN_ID 66 | job_start = time.time() 67 | try: 68 | if RUNNING: 69 | raise RuntimeError("An experiment is currently running.") 70 | RUNNING = True 71 | APP_ID = str(calendar.timegm(time.gmtime())) 72 | APP_ID = "application_" + APP_ID + "_0001" 73 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 74 | driver = lagom_driver(config, APP_ID, RUN_ID) 75 | return driver.run_experiment(train_fn, config) 76 | except: # noqa: E722 77 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 78 | raise 79 | finally: 80 | # Clean up spark jobs 81 | RUN_ID += 1 82 | RUNNING = False 83 | 84 | 85 | @singledispatch 86 | def lagom_driver(config, app_id: int, run_id: int) -> None: 87 | """Dispatcher function for the experiment driver. 88 | 89 | Initializes the appropriate driver according to the config. 90 | 91 | :raises TypeError: Only gets called if no fitting config was found and 92 | raises an error. 93 | """ 94 | raise TypeError( 95 | "Invalid config type! 
Config is expected to be of type {}, {}, {}, {} or {}, \ 96 | but is of type {}".format( 97 | HyperparameterOptConfig, 98 | AblationConfig, 99 | TorchDistributedConfig, 100 | TfDistributedConfig, 101 | BaseConfig, 102 | type(config), 103 | ) 104 | ) 105 | 106 | 107 | @lagom_driver.register(HyperparameterOptConfig) 108 | def _( 109 | config: HyperparameterOptConfig, app_id: int, run_id: int 110 | ) -> HyperparameterOptDriver: 111 | return HyperparameterOptDriver(config, app_id, run_id) 112 | 113 | 114 | @lagom_driver.register(AblationConfig) 115 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 116 | return AblationDriver(config, app_id, run_id) 117 | 118 | 119 | @lagom_driver.register(TorchDistributedConfig) 120 | # Lazy import of DistributedDriver to avoid Torch import until necessary 121 | def _( 122 | config: TorchDistributedConfig, app_id: int, run_id: int 123 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 124 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 125 | TorchDistributedTrainingDriver, 126 | ) 127 | 128 | return TorchDistributedTrainingDriver(config, app_id, run_id) 129 | 130 | 131 | @lagom_driver.register(TfDistributedConfig) 132 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 133 | def _( 134 | config: TfDistributedConfig, app_id: int, run_id: int 135 | ) -> "TfDistributedTrainingDriver": # noqa: F821 136 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 137 | TfDistributedTrainingDriver, 138 | ) 139 | 140 | return TfDistributedTrainingDriver(config, app_id, run_id) 141 | 142 | 143 | @lagom_driver.register(BaseConfig) 144 | # Lazy import of BaseConfig 145 | def _(config: BaseConfig, app_id: int, run_id: int) -> BaseDriver: 146 | from maggy.core.experiment_driver.base_driver import ( 147 | BaseDriver, 148 | ) 149 | 150 | return BaseDriver(config, app_id, run_id) 151 | 152 | 153 | @lagom_driver.register(LagomConfig) 154 | # Lazy import of LagomConfig 155 | def _(config: LagomConfig, app_id: int, run_id: int) -> BaseDriver: 156 | from maggy.core.experiment_driver.base_driver import ( 157 | BaseDriver, 158 | ) 159 | 160 | return BaseDriver(config, app_id, run_id) 161 | 162 | 163 | def _exception_handler(duration: int) -> None: 164 | """Handles exceptions during execution of an experiment. 
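
The driver selection above is plain functools.singledispatch on the config's runtime type; the mechanism in isolation, with toy classes rather than Maggy's:

from functools import singledispatch


class ToyBaseConfig:
    pass


class ToyTorchConfig(ToyBaseConfig):
    pass


@singledispatch
def make_driver(config):
    raise TypeError("Invalid config type: {}".format(type(config)))


@make_driver.register(ToyTorchConfig)
def _(config):
    return "torch driver"


print(make_driver(ToyTorchConfig()))  # dispatches on the runtime type
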
165 | 166 | :param duration: Duration of the experiment until exception in milliseconds 167 | """ 168 | try: 169 | global RUNNING 170 | global EXPERIMENT_JSON 171 | if RUNNING: 172 | EXPERIMENT_JSON["state"] = "FAILED" 173 | EXPERIMENT_JSON["duration"] = duration 174 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 175 | EnvSing.get_instance().attach_experiment_xattr( 176 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 177 | ) 178 | except Exception as err: 179 | util.log(err) 180 | 181 | 182 | def _exit_handler() -> None: 183 | """Handles jobs killed by the user.""" 184 | try: 185 | global RUNNING 186 | global EXPERIMENT_JSON 187 | if RUNNING: 188 | EXPERIMENT_JSON["status"] = "KILLED" 189 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 190 | EnvSing.get_instance().attach_experiment_xattr( 191 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 192 | ) 193 | except Exception as err: 194 | util.log(err) 195 | 196 | 197 | atexit.register(_exit_handler) 198 | -------------------------------------------------------------------------------- /maggy/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer import abstractoptimizer, randomsearch, asha, singlerun, gridsearch 18 | 19 | AbstractOptimizer = abstractoptimizer.AbstractOptimizer 20 | RandomSearch = randomsearch.RandomSearch 21 | Asha = asha.Asha 22 | SingleRun = singlerun.SingleRun 23 | GridSearch = gridsearch.GridSearch 24 | 25 | __all__ = ["AbstractOptimizer", "RandomSearch", "Asha", "SingleRun", "GridSearch"] 26 | -------------------------------------------------------------------------------- /maggy/optimizer/asha.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import math 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.trial import Trial 21 | 22 | 23 | class Asha(AbstractOptimizer): 24 | """Implements the Asynchronous Successiv Halving Algorithm - ASHA 25 | (https://arxiv.org/abs/1810.05934). ASHA needs three additional parameters: 26 | 'reduction_factor', 'resource_min' and 'resource_max'. To set custom values 27 | for these, initialize the optimizer first and pass it as an argument to 28 | 'experiment.lagom()'. 
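
To make the resource schedule concrete: with the reduction_factor=3, resource_min=1 and resource_max=9 used in the sample just below, max_rung = floor(log_3(9/1)) = 2, so the rungs run trials with budgets 1, 3 and 9, and roughly one in three finished trials is promoted out of each rung. A quick check of that arithmetic:

import math

reduction_factor, resource_min, resource_max = 3, 1, 9
max_rung = int(math.floor(math.log(resource_max / resource_min, reduction_factor)))
budgets = [resource_min * reduction_factor**k for k in range(max_rung + 1)]
print(max_rung, budgets)  # 2 [1, 3, 9]
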
29 | 30 | Sample usage: 31 | 32 | >>> # Import Asha optimizer 33 | >>> from maggy.optimizer import Asha 34 | >>> # Instantiate the optimizer with custom arguments 35 | >>> asha = Asha(3, 1, 9) 36 | >>> experiment.lagom(..., optimizer=asha, ...) 37 | """ 38 | 39 | def __init__(self, reduction_factor=2, resource_min=1, resource_max=4): 40 | super().__init__() 41 | 42 | if reduction_factor < 2 or not isinstance(reduction_factor, int): 43 | raise Exception( 44 | "Can't initialize ASHA optimizer. 'reduction_factor' " 45 | + "has to be an integer equal to or larger than 2: {}".format( 46 | reduction_factor 47 | ) 48 | ) 49 | else: 50 | self.reduction_factor = reduction_factor 51 | 52 | if not isinstance(resource_min, int): 53 | raise Exception( 54 | "Can't initialize ASHA optimizer. 'resource_min' " 55 | + "is not of type INTEGER." 56 | ) 57 | if not isinstance(resource_max, int): 58 | raise Exception( 59 | "Can't initialize ASHA optimizer. 'resource_max' " 60 | + "is not of type INTEGER." 61 | ) 62 | if resource_min >= resource_max: 63 | raise Exception( 64 | "Can't initialize ASHA optimizer. 'resource_min' must be smaller " 65 | + "than 'resource_max'." 66 | ) 67 | 68 | self.resource_min = resource_min 69 | self.resource_max = resource_max 70 | 71 | def initialize(self): 72 | 73 | # maps rung index k to trials in that rung 74 | self.rungs = {0: []} 75 | # maps rung index k to trial ids of trials that were promoted 76 | self.promoted = {0: []} 77 | 78 | self.max_rung = int( 79 | math.floor( 80 | math.log(self.resource_max / self.resource_min, self.reduction_factor) 81 | ) 82 | ) 83 | 84 | assert self.num_trials >= self.reduction_factor ** (self.max_rung + 1) 85 | 86 | def get_suggestion(self, trial=None): 87 | 88 | if trial is not None: 89 | # stopping criterion: one trial in max rung 90 | if self.max_rung in self.rungs: 91 | # return None to signal end to experiment driver 92 | return None 93 | 94 | # for each rung 95 | for k in range(self.max_rung - 1, -1, -1): 96 | # if rung doesn't exist yet go one lower 97 | if k not in self.rungs: 98 | continue 99 | 100 | # get top_k 101 | rung_finished = len( 102 | [x for x in self.rungs[k] if x.status == Trial.FINALIZED] 103 | ) 104 | 105 | if (rung_finished // self.reduction_factor) - len( 106 | self.promoted.get(k, []) 107 | ) > 0: 108 | candidates = self._top_k( 109 | k, (rung_finished // self.reduction_factor) 110 | ) 111 | else: 112 | candidates = [] 113 | 114 | # if there are no candidates, check one rung below 115 | if not candidates: 116 | continue 117 | 118 | # select all that haven't been promoted yet 119 | promotable = [ 120 | t for t in candidates if t.trial_id not in self.promoted.get(k, []) 121 | ] 122 | 123 | nr_promotable = len(promotable) 124 | if nr_promotable >= 1: 125 | new_rung = k + 1 126 | # sorted in descending order, take highest -> index 0 127 | old_trial = promotable[0] 128 | # make copy of params to be able to change resource 129 | params = old_trial.params.copy() 130 | params["budget"] = self.resource_min * ( 131 | self.reduction_factor**new_rung 132 | ) 133 | promote_trial = Trial(params) 134 | 135 | # open a new rung if it does not exist yet 136 | if new_rung in self.rungs: 137 | self.rungs[new_rung].append(promote_trial) 138 | else: 139 | self.rungs[new_rung] = [promote_trial] 140 | 141 | # remember promoted trial 142 | if k in self.promoted: 143 | self.promoted[k].append(old_trial.trial_id) 144 | else: 145 | self.promoted[k] = [old_trial.trial_id] 146 | 147 | return promote_trial 148 | 149 | # else return random configuration in base rung 150 |
params = self.searchspace.get_random_parameter_values(1)[0] 151 | # set resource to minimum 152 | params["budget"] = self.resource_min 153 | to_return = Trial(params) 154 | # add to bottom rung 155 | self.rungs[0].append(to_return) 156 | return to_return 157 | 158 | def finalize_experiment(self, trials): 159 | return 160 | 161 | def _top_k(self, rung_k, number): 162 | """Find top-`number` trials in `rung_k`.""" 163 | if number > 0: 164 | filtered = [x for x in self.rungs[rung_k] if x.status == Trial.FINALIZED] 165 | filtered.sort(key=lambda x: x.final_metric, reverse=True) 166 | # return top k trials if finalized 167 | return filtered[:number] 168 | else: 169 | return [] 170 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer.bayes import base, gp, tpe 18 | 19 | BaseAsyncBO = base.BaseAsyncBO 20 | GP = gp.GP 21 | TPE = tpe.TPE 22 | 23 | __all__ = [ 24 | "TPE", 25 | "BaseAsyncBO", 26 | "GP", 27 | ] 28 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/acquisitions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC 18 | from abc import abstractmethod 19 | 20 | import numpy as np 21 | from skopt.acquisition import _gaussian_acquisition 22 | from skopt.acquisition import gaussian_acquisition_1D 23 | 24 | 25 | class AbstractAcquisitionFunction(ABC): 26 | @staticmethod 27 | @abstractmethod 28 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 29 | """evaluates acquisition function at given points 30 | 31 | :param X: Values where the acquisition function should be computed. shape = (n_locations, n_hparams) 32 | :type X: np.ndarray 33 | :param surrogate_model: the surrogate model of the bayesian optimizer. 34 | :type surrogate_model: GaussianProcessRegressor 35 | :param y_opt: currently best observed value 36 | :type y_opt: float 37 | :param acq_func_kwargs: additional arguments for the acquisition function 38 | :type acq_func_kwargs: dict|None 39 | :return: Acquisition function values computed at X. 
shape = (n_locations,) 40 | :rtype: np.ndarray 41 | """ 42 | pass 43 | 44 | @staticmethod 45 | @abstractmethod 46 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 47 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 48 | This is because lbfgs allows only 1-D input. 49 | 50 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 51 | :type x: np.ndarray 52 | :param surrogate_model: the surrogate model of the bayesian optimizer. 53 | :type surrogate_model: GaussianProcessRegressor 54 | :param y_opt: currently best observed value 55 | :type y_opt: float 56 | :param acq_func_kwargs: additional arguments for the acquisition function 57 | :type acq_func_kwargs: dict|None 58 | :return: tuple containing two arrays. the first holds the evaluated values of the acquisition function at value 59 | x; shape = (1,) . the second holds the gradients; shape = (n_hparams,). 60 | :rtype: tuple 61 | """ 62 | pass 63 | 64 | def name(self): 65 | return str(self.__class__.__name__) 66 | 67 | 68 | class GaussianProcess_EI(AbstractAcquisitionFunction): 69 | """xi in acq_func_kwargs""" 70 | 71 | @staticmethod 72 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 73 | return _gaussian_acquisition( 74 | X=X, 75 | model=surrogate_model, 76 | y_opt=y_opt, 77 | acq_func="EI", 78 | acq_func_kwargs=acq_func_kwargs, 79 | ) 80 | 81 | @staticmethod 82 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 83 | return gaussian_acquisition_1D( 84 | X=x, 85 | model=surrogate_model, 86 | y_opt=y_opt, 87 | acq_func="EI", 88 | acq_func_kwargs=acq_func_kwargs, 89 | ) 90 | 91 | 92 | class GaussianProcess_PI(AbstractAcquisitionFunction): 93 | @staticmethod 94 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 95 | return _gaussian_acquisition( 96 | X=X, 97 | model=surrogate_model, 98 | y_opt=y_opt, 99 | acq_func="PI", 100 | acq_func_kwargs=acq_func_kwargs, 101 | ) 102 | 103 | @staticmethod 104 | def evaluate_1_d(X, surrogate_model, y_opt, acq_func_kwargs=None): 105 | return gaussian_acquisition_1D( 106 | X=X, 107 | model=surrogate_model, 108 | y_opt=y_opt, 109 | acq_func="PI", 110 | acq_func_kwargs=acq_func_kwargs, 111 | ) 112 | 113 | 114 | class GaussianProcess_LCB(AbstractAcquisitionFunction): 115 | """kappa in acq_func_kwargs""" 116 | 117 | @staticmethod 118 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 119 | return _gaussian_acquisition( 120 | X=X, 121 | model=surrogate_model, 122 | y_opt=y_opt, 123 | acq_func="LCB", 124 | acq_func_kwargs=acq_func_kwargs, 125 | ) 126 | 127 | @staticmethod 128 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 129 | return gaussian_acquisition_1D( 130 | X=x, 131 | model=surrogate_model, 132 | y_opt=y_opt, 133 | acq_func="LCB", 134 | acq_func_kwargs=acq_func_kwargs, 135 | ) 136 | 137 | 138 | class GaussianProcess_UCB(AbstractAcquisitionFunction): 139 | @staticmethod 140 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 141 | raise NotImplementedError 142 | 143 | @staticmethod 144 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 145 | raise NotImplementedError 146 | 147 | 148 | class TPE_EI(AbstractAcquisitionFunction): 149 | @staticmethod 150 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 151 | raise NotImplementedError 152 | 153 | @staticmethod 154 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 155 | raise NotImplementedError 156 | 157 | 158 | class 
AsyTS(AbstractAcquisitionFunction): 159 | @staticmethod 160 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 161 | return surrogate_model.sample_y(X).reshape( 162 | X.shape[0], 163 | ) 164 | 165 | @staticmethod 166 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 167 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 168 | This is because lbfgs allows only 1-D input. 169 | 170 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 171 | :type x: np.ndarray 172 | :param surrogate_model: the surrogate model of the bayesian optimizer. 173 | :type surrogate_model: GaussianProcessRegressor 174 | :param y_opt: currently best observed value 175 | :type y_opt: float 176 | :param acq_func_kwargs: additional arguments for the acquisition function 177 | :type acq_func_kwargs: dict|None 178 | :return: values of the acquisition function at value x. shape = (1,) 179 | :rtype: np.ndarray 180 | """ 181 | return surrogate_model.sample_y(np.expand_dims(x, axis=0)).reshape( 182 | 1, 183 | ) 184 | 185 | 186 | class HLP(AbstractAcquisitionFunction): 187 | @staticmethod 188 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 189 | raise NotImplementedError 190 | 191 | @staticmethod 192 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 193 | raise NotImplementedError 194 | -------------------------------------------------------------------------------- /maggy/optimizer/gridsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import itertools 18 | 19 | from maggy import Searchspace 20 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 21 | 22 | 23 | class GridSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | self._validate_searchspace(self.searchspace) 30 | # create all trials ahead of time 31 | self.config_buffer = self._grid_params(self.searchspace) 32 | 33 | @classmethod 34 | def get_num_trials(cls, searchspace): 35 | """For grid search the number of trials is determined by the size of the 36 | cartesian product of the user-set parameters and their values. 37 | 38 | This method duplicates part of the code in `initialize()`, mainly to keep 39 | the flow the same as for other optimizers, where the user only sets 40 | the number of trials to evaluate. 41 | """ 42 | cls._validate_searchspace(searchspace) 43 | return len(cls._grid_params(searchspace)) 44 | 45 | def get_suggestion(self, trial=None): 46 | # trial pruning is not supported in combination with grid search 47 | if self.pruner: 48 | raise NotImplementedError( 49 | "Grid search in combination with trial pruning " 50 | "is currently not supported."
51 | ) 52 | elif self.config_buffer: 53 | run_budget = 0 54 | next_trial_params = self.config_buffer.pop() 55 | next_trial = self.create_trial( 56 | hparams=next_trial_params, 57 | sample_type="grid", 58 | run_budget=run_budget, 59 | ) 60 | 61 | self._log( 62 | "start trial {}: {}, {} \n".format( 63 | next_trial.trial_id, next_trial.params, next_trial.info_dict 64 | ) 65 | ) 66 | 67 | return next_trial 68 | else: 69 | return None 70 | 71 | def finalize_experiment(self, trials): 72 | return 73 | 74 | @staticmethod 75 | def _grid_params(searchspace): 76 | return_list = [] 77 | for hparams in itertools.product( 78 | *[item["values"] for item in searchspace.items()] 79 | ): 80 | return_list.append(searchspace.list_to_dict(hparams)) 81 | return return_list 82 | 83 | @staticmethod 84 | def _validate_searchspace(searchspace): 85 | if ( 86 | Searchspace.DOUBLE in searchspace.names().values() 87 | or Searchspace.INTEGER in searchspace.names().values() 88 | ): 89 | raise NotImplementedError( 90 | "Searchspace can only contain `discrete` or `categorical` " 91 | "hyperparameters for grid search." 92 | ) 93 | -------------------------------------------------------------------------------- /maggy/optimizer/randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import time 17 | from copy import deepcopy 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.searchspace import Searchspace 21 | 22 | 23 | class RandomSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | 30 | if ( 31 | Searchspace.DOUBLE not in self.searchspace.names().values() 32 | and Searchspace.INTEGER not in self.searchspace.names().values() 33 | ): 34 | raise NotImplementedError( 35 | "Searchspace needs at least one continuous parameter for random search." 
36 | ) 37 | 38 | self.config_buffer = self.searchspace.get_random_parameter_values( 39 | self.num_trials 40 | ) 41 | 42 | def get_suggestion(self, trial=None): 43 | self._log("### start get_suggestion ###") 44 | self.sampling_time_start = time.time() 45 | 46 | # sampling routine for randomsearch + pruner 47 | if self.pruner: 48 | next_trial_info = self.pruner.pruning_routine() 49 | if next_trial_info == "IDLE": 50 | self._log( 51 | "Worker is IDLE and has to wait until a new trial can be scheduled" 52 | ) 53 | return "IDLE" 54 | elif next_trial_info is None: 55 | # experiment is finished 56 | self._log("Experiment has finished") 57 | return None 58 | elif next_trial_info["trial_id"]: 59 | # copy hparams of given promoted trial and start new trial with it 60 | parent_trial_id = next_trial_info["trial_id"] 61 | parent_trial_hparams = deepcopy( 62 | self.get_hparams_dict(trial_ids=parent_trial_id)[parent_trial_id] 63 | ) 64 | # update trial info dict and create new trial object 65 | next_trial = self.create_trial( 66 | hparams=parent_trial_hparams, 67 | sample_type="promoted", 68 | run_budget=next_trial_info["budget"], 69 | ) 70 | self._log("use hparams from promoted trial {}".format(parent_trial_id)) 71 | else: 72 | # start sampling procedure with given budget 73 | parent_trial_id = None 74 | run_budget = next_trial_info["budget"] 75 | hparams = self.searchspace.get_random_parameter_values(1)[0] 76 | next_trial = self.create_trial( 77 | hparams=hparams, sample_type="random", run_budget=run_budget 78 | ) 79 | 80 | # report new trial id to pruner 81 | self.pruner.report_trial( 82 | original_trial_id=parent_trial_id, new_trial_id=next_trial.trial_id 83 | ) 84 | 85 | self._log( 86 | "start trial {}: {}. info_dict: {} \n".format( 87 | next_trial.trial_id, next_trial.params, next_trial.info_dict 88 | ) 89 | ) 90 | return next_trial 91 | 92 | # sampling routine for pure random search 93 | elif self.config_buffer: 94 | run_budget = 0 95 | next_trial_params = self.config_buffer.pop() 96 | next_trial = self.create_trial( 97 | hparams=next_trial_params, 98 | sample_type="random", 99 | run_budget=run_budget, 100 | ) 101 | 102 | self._log( 103 | "start trial {}: {}, {} \n".format( 104 | next_trial.trial_id, next_trial.params, next_trial.info_dict 105 | ) 106 | ) 107 | 108 | return next_trial 109 | else: 110 | return None 111 | 112 | def finalize_experiment(self, trials): 113 | return 114 | -------------------------------------------------------------------------------- /maggy/optimizer/singlerun.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
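A note on the sampling call the optimizers above rely on: RandomSearch.initialize() fills its config_buffer with one parameter dict per trial via Searchspace.get_random_parameter_values(). A minimal sketch of that call, assuming only the Searchspace API exercised elsewhere in this repository (parameter names and ranges are illustrative):

from maggy import Searchspace

# one continuous and one integer hyperparameter (names illustrative)
sp = Searchspace(learning_rate=("DOUBLE", [0.01, 0.1]), n_layers=("INTEGER", [1, 4]))

# returns a list of 5 dicts, e.g. [{"learning_rate": 0.07, "n_layers": 2}, ...]
configs = sp.get_random_parameter_values(5)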
15 | # 16 | 17 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 18 | from maggy.trial import Trial 19 | 20 | 21 | class SingleRun(AbstractOptimizer): 22 | def __init__(self): 23 | super().__init__() 24 | self.trial_buffer = [] 25 | 26 | def initialize(self): 27 | for _ in range(self.num_trials): 28 | self.trial_buffer.append(Trial({})) 29 | 30 | def get_suggestion(self, trial=None): 31 | if self.trial_buffer: 32 | return self.trial_buffer.pop() 33 | else: 34 | return None 35 | 36 | def finalize_experiment(self, trials): 37 | return 38 | -------------------------------------------------------------------------------- /maggy/pruner/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.pruner import hyperband, abstractpruner 18 | 19 | Hyperband = hyperband.Hyperband 20 | AbstractPruner = abstractpruner.AbstractPruner 21 | 22 | __all__ = ["Hyperband", "AbstractPruner"] 23 | -------------------------------------------------------------------------------- /maggy/pruner/abstractpruner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | from datetime import datetime 19 | 20 | from maggy.core.environment.singleton import EnvSing 21 | 22 | 23 | class AbstractPruner(ABC): 24 | def __init__(self, trial_metric_getter): 25 | """ 26 | :param trial_metric_getter: a function that returns a dict with `trial_id` as key and `metric` as value, 27 | with the lowest metric being the "best". 28 | Its only argument is `trial_ids`, which can be either the id of a single trial (str) or a list of trial ids 29 | :type trial_metric_getter: function 30 | """ 31 | 32 | self.trial_metric_getter = trial_metric_getter 33 | 34 | # logger variables 35 | self.log_file = None 36 | self.fd = None 37 | 38 | @abstractmethod 39 | def pruning_routine(self): 40 | """ 41 | Runs the pruning routine. This is the
42 | interface to the `optimizer`. 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def report_trial(self): 48 | """ 49 | hook for reporting trial id of created trial from optimizer to pruner 50 | """ 51 | pass 52 | 53 | @abstractmethod 54 | def finished(self): 55 | """ 56 | checks if experiment is finished 57 | """ 58 | pass 59 | 60 | @abstractmethod 61 | def num_trials(self): 62 | """ 63 | calculates the number of trials in the experiment 64 | 65 | :return: number of trials 66 | :rtype: int 67 | """ 68 | 69 | def name(self): 70 | return str(self.__class__.__name__) 71 | 72 | def initialize_logger(self, exp_dir): 73 | """Initialize the logger of the pruner 74 | 75 | :param exp_dir: path of experiment directory 76 | :type exp_dir: str 77 | """ 78 | env = EnvSing.get_instance() 79 | # configure logger 80 | self.log_file = exp_dir + "/pruner.log" 81 | 82 | if not env.exists(self.log_file): 83 | env.dump("", self.log_file) 84 | self.fd = env.open_file(self.log_file, flags="w") 85 | self._log("Initialized Pruner Logger") 86 | 87 | def _log(self, msg): 88 | if self.fd and not self.fd.closed: 89 | msg = datetime.now().isoformat() + ": " + str(msg) 90 | self.fd.write(EnvSing.get_instance().str_or_byte(msg + "\n")) 91 | 92 | def _close_log(self): 93 | if not self.fd.closed: 94 | self.fd.flush() 95 | self.fd.close() 96 | -------------------------------------------------------------------------------- /maggy/tensorboard.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Module to encapsulate functionality related to writing to the tensorboard 19 | log dir and programmatically structure the outputs. 20 | """ 21 | 22 | import tensorflow as tf 23 | from tensorboard.plugins.hparams import api as hp 24 | 25 | _tensorboard_dir = None 26 | 27 | 28 | def _register(trial_dir): 29 | global _tensorboard_dir 30 | _tensorboard_dir = trial_dir 31 | 32 | 33 | def logdir(): 34 | """Returns the path to the tensorboard log directory. 35 | 36 | Instead of hardcoding a log dir path in a training function, users should 37 | make use of this function call, which will programmatically create a folder 38 | structure for tensorboard to visualize the machine learning experiment.
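For example, a training function might fetch the directory as follows (a sketch; wiring it into a callback is left to the user):

>>> from maggy import tensorboard
>>> log_dir = tensorboard.logdir()
>>> # e.g. tf.keras.callbacks.TensorBoard(log_dir=log_dir)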
39 | 40 | :return: Path of the log directory in HOPSFS 41 | :rtype: str 42 | """ 43 | global _tensorboard_dir 44 | return _tensorboard_dir 45 | 46 | 47 | def _create_hparams_config(searchspace): 48 | hparams = [] 49 | 50 | for key, val in searchspace.names().items(): 51 | if val == "DOUBLE": 52 | hparams.append( 53 | hp.HParam( 54 | key, 55 | hp.RealInterval( 56 | float(searchspace.get(key)[0]), float(searchspace.get(key)[1]) 57 | ), 58 | ) 59 | ) 60 | elif val == "INTEGER": 61 | hparams.append( 62 | hp.HParam( 63 | key, 64 | hp.IntInterval(searchspace.get(key)[0], searchspace.get(key)[1]), 65 | ) 66 | ) 67 | elif val == "DISCRETE": 68 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 69 | elif val == "CATEGORICAL": 70 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 71 | 72 | return hparams 73 | 74 | 75 | def _write_hparams_config(log_dir, searchspace): 76 | HPARAMS = _create_hparams_config(searchspace) 77 | METRICS = [ 78 | hp.Metric( 79 | "epoch_accuracy", 80 | group="validation", 81 | display_name="accuracy (val.)", 82 | ), 83 | hp.Metric( 84 | "epoch_loss", 85 | group="validation", 86 | display_name="loss (val.)", 87 | ), 88 | hp.Metric( 89 | "epoch_accuracy", 90 | group="train", 91 | display_name="accuracy (train)", 92 | ), 93 | hp.Metric( 94 | "epoch_loss", 95 | group="train", 96 | display_name="loss (train)", 97 | ), 98 | ] 99 | 100 | with tf.summary.create_file_writer(log_dir).as_default(): 101 | hp.hparams_config(hparams=HPARAMS, metrics=METRICS) 102 | 103 | 104 | def _write_hparams(hparams, trial_id): 105 | global _tensorboard_dir 106 | with tf.summary.create_file_writer(_tensorboard_dir).as_default(): 107 | hp.hparams(hparams, trial_id) 108 | -------------------------------------------------------------------------------- /maggy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ pytest fixtures that can be reused across tests.
the filename needs to be conftest.py 18 | """ 19 | 20 | # make sure env variables are set correctly 21 | import findspark # this needs to be the first import 22 | 23 | findspark.init() 24 | 25 | import logging 26 | import pytest 27 | 28 | from pyspark import HiveContext 29 | from pyspark import SparkConf 30 | from pyspark import SparkContext 31 | from pyspark.streaming import StreamingContext 32 | 33 | 34 | def quiet_py4j(): 35 | """turn down spark logging for the test context""" 36 | logger = logging.getLogger("py4j") 37 | logger.setLevel(logging.WARN) 38 | 39 | 40 | def pytest_addoption(parser): 41 | parser.addoption( 42 | "--spark-master", 43 | action="store", 44 | default=None, 45 | help='spark-master: "spark://name.local:7077"', 46 | ) 47 | 48 | 49 | @pytest.fixture(scope="session") 50 | def sc(request): 51 | """fixture for creating a spark context 52 | Args: 53 | request: pytest.FixtureRequest object 54 | """ 55 | 56 | assert ( 57 | request.config.getoption("--spark-master") is not None 58 | ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" ' 59 | 60 | conf = ( 61 | SparkConf() 62 | .setMaster(request.config.getoption("--spark-master")) 63 | .setAppName("pytest-pyspark-local-testing") 64 | .set("spark.dynamicAllocation.maxExecutors", 2) 65 | .set("spark.executor.instances", 2) 66 | ) 67 | scont = SparkContext(conf=conf) 68 | request.addfinalizer(lambda: scont.stop()) 69 | 70 | quiet_py4j() 71 | return scont 72 | 73 | 74 | @pytest.fixture(scope="session") 75 | def hive_context(sc): 76 | """fixture for creating a Hive Context. Creating a fixture enables it to be reused across all 77 | tests in a session 78 | Args: 79 | spark_context: spark_context fixture 80 | Returns: 81 | HiveContext for tests 82 | """ 83 | return HiveContext(sc) 84 | 85 | 86 | @pytest.fixture(scope="session") 87 | def streaming_context(sc): 88 | return StreamingContext(sc, 1) 89 | -------------------------------------------------------------------------------- /maggy/tests/test_maggy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
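The Spark fixtures above refuse to run without the --spark-master option. A minimal sketch of launching the suite programmatically instead of via the CLI (the master URL and test path are placeholders):

import pytest

# equivalent to: pytest --spark-master "spark://host:7077" maggy/tests
pytest.main(["--spark-master", "spark://host:7077", "maggy/tests"])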
15 | # 16 | 17 | import pytest 18 | from maggy.searchspace import Searchspace 19 | from maggy.optimizer import RandomSearch 20 | 21 | # this allows using the fixture in all tests in this module 22 | pytestmark = pytest.mark.usefixtures("sc") 23 | 24 | 25 | def test_nr_executors(sc): 26 | 27 | executor_instances = int(sc._conf.get("spark.executor.instances")) 28 | expected_number = 2 29 | assert executor_instances == expected_number 30 | 31 | 32 | def test_random_search(sc): 33 | 34 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 35 | 36 | rs = RandomSearch() 37 | rs.searchspace = sp 38 | 39 | rs.num_trials = 5 40 | exp_result = {"argument_param": "DOUBLE"} 41 | 42 | assert sp.names() == exp_result 43 | assert rs.num_trials == 5 44 | assert rs.searchspace == sp 45 | -------------------------------------------------------------------------------- /maggy/tests/test_randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | import tensorflow as tf 22 | from tensorflow import keras 23 | import numpy as np 24 | 25 | from maggy.searchspace import Searchspace 26 | from maggy.optimizer import RandomSearch 27 | from maggy import experiment 28 | from maggy.config import HyperparameterOptConfig, TfDistributedConfig 29 | 30 | # this allows using the fixture in all tests in this module 31 | pytestmark = pytest.mark.usefixtures("sc") 32 | 33 | 34 | def test_randomsearch_init(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 37 | 38 | rs = RandomSearch(5, sp, []) 39 | 40 | assert rs.num_trials == 5 41 | assert rs.searchspace == sp 42 | 43 | 44 | def test_randomsearch_initialize(): 45 | 46 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 47 | 48 | rs = RandomSearch(5, sp, []) 49 | 50 | rs.initialize() 51 | 52 | assert len(rs.trial_buffer) == 5 53 | 54 | 55 | def test_rs_initialize2(): 56 | 57 | sp = Searchspace(argument_param=("DISCRETE", [1, 5])) 58 | 59 | rs = RandomSearch() 60 | rs.searchspace = sp 61 | 62 | with pytest.raises(NotImplementedError) as excinfo: 63 | rs.initialize() 64 | assert "Searchspace needs at least one continuous parameter" in str(excinfo.value) 65 | 66 | 67 | def test_randomsearch(sc): 68 | def train(model, train_set, test_set, hparams, reporter): 69 | 70 | if "argument_param" in hparams.keys(): 71 | print( 72 | "Entered train function with param {}".format(hparams["argument_param"]) 73 | ) 74 | 75 | for i in range(5): 76 | acc = i + random.random() 77 | reporter.broadcast(metric=acc) 78 | reporter.log("Metric: {}".format(acc)) 79 | 80 | # do something with HP. 
81 | if "argument_param" in hparams.keys(): 82 | time.sleep(hparams["argument_param"]) 83 | 84 | return acc 85 | 86 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 87 | 88 | config = HyperparameterOptConfig( 89 | searchspace=sp, 90 | optimizer="randomsearch", 91 | direction="max", 92 | num_trials=5, 93 | name="test", 94 | hb_interval=1, 95 | es_interval=10, 96 | ) 97 | 98 | result = experiment.lagom(train_fn=train, config=config) 99 | assert type(result) == type({}) 100 | 101 | test_dt_tensorflow(sc) 102 | 103 | 104 | def test_dt_tensorflow(sc): 105 | 106 | mnist = tf.keras.datasets.mnist 107 | 108 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 109 | 110 | x_train, x_test = x_train / 255.0, x_test / 255.0 111 | x_train = np.reshape(x_train, (60000, 28, 28, 1)) 112 | x_test = np.reshape(x_test, (10000, 28, 28, 1)) 113 | 114 | def training_function(model, train_set, test_set, hparams): 115 | from tensorflow import keras 116 | 117 | # Define training parameters 118 | num_epochs = 10 119 | batch_size = 256 120 | learning_rate = 0.1 121 | 122 | criterion = keras.losses.SparseCategoricalCrossentropy() 123 | optimizer = keras.optimizers.SGD( 124 | learning_rate=learning_rate, momentum=0.9, decay=1e-5 125 | ) 126 | 127 | model = model(nlayers=2) 128 | 129 | model.compile(optimizer=optimizer, loss=criterion, metrics=["accuracy"]) 130 | 131 | model.fit( 132 | x_train, 133 | y_train, 134 | # batch_size=batch_size, 135 | # epochs=num_epochs, 136 | ) 137 | 138 | print("Testing") 139 | 140 | loss = model.evaluate(x_test, y_test) 141 | 142 | return loss 143 | 144 | class NeuralNetwork(tf.keras.Model): 145 | def __init__(self, nlayers): 146 | super().__init__() 147 | self.conv1 = keras.layers.Conv2D(28, 2, activation="relu") 148 | self.flatten = keras.layers.Flatten() 149 | self.d1 = keras.layers.Dense(32, activation="relu") 150 | self.d2 = keras.layers.Dense(10, activation="softmax") 151 | 152 | def call(self, x): 153 | x = self.conv1(x) 154 | x = self.flatten(x) 155 | x = self.d1(x) 156 | return self.d2(x) 157 | 158 | model = NeuralNetwork 159 | 160 | # define the constructor parameters of your model 161 | model_parameters = { 162 | "train_batch_size": 30000, 163 | "test_batch_size": 5000, 164 | "nlayers": 2, 165 | } 166 | 167 | # pass the model parameters in the last 168 | config = TfDistributedConfig( 169 | name="tf_test", 170 | model=model, 171 | train_set=None, 172 | test_set=None, 173 | hparams=model_parameters, 174 | ) 175 | 176 | result = experiment.lagom(train_fn=training_function, config=config) 177 | 178 | assert type(result) == list 179 | -------------------------------------------------------------------------------- /maggy/tests/test_searchspace.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Searchspace 22 | 23 | 24 | def test_searchspace_init(): 25 | 26 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 27 | 28 | exp_get = [1, 5] 29 | 30 | assert sp.get("argument_param") == exp_get 31 | assert sp.argument_param == exp_get # pylint: disable=no-member 32 | 33 | 34 | def test_searchspace_add(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 37 | 38 | with pytest.raises(ValueError) as excinfo: 39 | sp.add("argument_param", ("DOUBLE", [1, 5])) 40 | assert "Hyperparameter name is reserved" in str(excinfo.value) 41 | 42 | with pytest.raises(ValueError) as excinfo: 43 | # add tuple with too many elements 44 | sp.add("param", ("DOUBLE", [1, 5], "too many")) 45 | assert "Hyperparameter tuple has to be of length two" in str(excinfo.value) 46 | 47 | with pytest.raises(ValueError) as excinfo: 48 | # add unknown type 49 | sp.add("param", ("FLOAT", [1, 5])) 50 | assert "Hyperparameter type is not of type " in str(excinfo.value) 51 | 52 | with pytest.raises(ValueError) as excinfo: 53 | # add empty region list 54 | sp.add("param", ("DOUBLE", [])) 55 | assert "Hyperparameter feasible region list" in str(excinfo.value) 56 | 57 | with pytest.raises(AssertionError) as excinfo: 58 | # add incompatible type and feasible region 59 | sp.add("param", ("DOUBLE", [1, 5, 5])) 60 | sp.add("param2", ("INTEGER", [1, 5, 5])) 61 | assert "For DOUBLE or " in str(excinfo.value) 62 | 63 | with pytest.raises(AssertionError) as excinfo: 64 | # lower bound higher than upper bound 65 | sp.add("param", ("DOUBLE", [5, 1])) 66 | sp.add("param2", ("INTEGER", [4, 1])) 67 | assert "Lower bound " in str(excinfo.value) 68 | 69 | with pytest.raises(ValueError) as excinfo: 70 | # Non integer boundaries for integer type parameter 71 | sp.add("param2", ("INTEGER", [1.5, 5])) 72 | assert "type INTEGER need to be integer:" in str(excinfo.value) 73 | 74 | with pytest.raises(ValueError) as excinfo: 75 | # Non numeric interval boundaries 76 | sp.add("param2", ("DOUBLE", ["lower", 5])) 77 | assert "type DOUBLE need to be integer or float:" in str(excinfo.value) 78 | -------------------------------------------------------------------------------- /maggy/tests/test_trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
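The assertions above all exercise invalid inputs to Searchspace.add; for contrast, a minimal sketch of calls that should pass the same validation (names and ranges are illustrative, and the bare Searchspace() constructor is assumed to be valid):

from maggy import Searchspace

sp = Searchspace()
sp.add("dropout", ("DOUBLE", [0.1, 0.5]))  # continuous interval
sp.add("batch_size", ("DISCRETE", [32, 64, 128]))  # explicit value list
sp.add("activation", ("CATEGORICAL", ["relu", "tanh"]))  # categorical choices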
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Trial 22 | 23 | 24 | def test_trial_init(): 25 | 26 | trial = Trial({"param1": 5, "param2": "ada"}) 27 | 28 | exp = {"param1": 5, "param2": "ada"} 29 | 30 | assert trial.params == exp 31 | assert trial.status == Trial.PENDING 32 | assert trial.trial_id == "3d1cc9fdb1d4d001" 33 | 34 | 35 | def test_trial_serialization(): 36 | 37 | trial = Trial({"param1": 5, "param2": "ada"}) 38 | 39 | exp = {"param1": 5, "param2": "ada"} 40 | 41 | json_str = trial.to_json() 42 | 43 | new_trial = Trial.from_json(json_str) 44 | 45 | assert isinstance(new_trial, Trial) 46 | assert new_trial.params == exp 47 | assert new_trial.status == Trial.PENDING 48 | assert new_trial.trial_id == "3d1cc9fdb1d4d001" 49 | -------------------------------------------------------------------------------- /maggy/tests/test_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | from operator import add 19 | 20 | # this allows using the fixture in all tests in this module 21 | pytestmark = pytest.mark.usefixtures("sc") 22 | 23 | # Can also use a decorator such as this to use specific fixtures in specific functions 24 | # @pytest.mark.usefixtures("spark_context", "hive_context") 25 | 26 | 27 | def do_word_counts(lines): 28 | """count of words in an rdd of lines""" 29 | 30 | counts = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(add) 31 | results = {word: count for word, count in counts.collect()} 32 | return results 33 | 34 | 35 | # start function with test_ so pytest can discover them 36 | def test_do_word_counts(sc): 37 | """test that a single event is parsed correctly 38 | Args: 39 | spark_context: test fixture SparkContext 40 | hive_context: test fixture HiveContext 41 | """ 42 | 43 | test_input = [" hello spark ", " hello again spark spark"] 44 | 45 | input_rdd = sc.parallelize(test_input, 1) 46 | results = do_word_counts(input_rdd) 47 | 48 | expected_results = {"hello": 2, "spark": 3, "again": 1} 49 | assert results == expected_results 50 | -------------------------------------------------------------------------------- /maggy/trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import threading 19 | import hashlib 20 | 21 | from maggy import util 22 | 23 | 24 | class Trial(object): 25 | """A Trial object contains all relevant information about the evaluation 26 | of a hyperparameter combination. 27 | 28 | It is used as shared memory between 29 | the worker thread and rpc server thread. The server thread performs only 30 | lookups on the `early_stop` and `params` attributes. 31 | """ 32 | 33 | PENDING = "PENDING" 34 | SCHEDULED = "SCHEDULED" 35 | RUNNING = "RUNNING" 36 | ERROR = "ERROR" 37 | FINALIZED = "FINALIZED" 38 | 39 | def __init__(self, params, trial_type="optimization", info_dict=None): 40 | """Create a new trial object from a hyperparameter combination 41 | ``params``. 42 | 43 | :param params: A dictionary of Hyperparameters as key value pairs. 44 | :type params: dict 45 | :param info_dict: dict containing additional information about the trial including 46 | - sample_type 47 | - sampling_time 48 | - run_budget 49 | - model_budget (optionally) 50 | see `create_trial()` method of base.py for further reference 51 | :type info_dict: dict 52 | """ 53 | # XXX before merge, we should remove the default value for trial_type 54 | # and make sure everywhere Trial() is called (e.g. in all optimizers) 55 | # trial_type is passed 56 | # @Moritz 57 | 58 | self.trial_type = trial_type 59 | # XXX temp fix, have to come up with abstractions 60 | if self.trial_type == "optimization": 61 | self.trial_id = Trial._generate_id(params) 62 | elif self.trial_type == "ablation": 63 | serializable_params = { 64 | "ablated_feature": params.get("ablated_feature", None), 65 | "ablated_layer": params.get("ablated_layer", None), 66 | } 67 | self.trial_id = Trial._generate_id(serializable_params) 68 | self.params = params 69 | self.status = Trial.PENDING 70 | self.early_stop = False 71 | self.final_metric = None 72 | self.metric_history = [] 73 | self.step_history = [] 74 | self.metric_dict = {} 75 | self.start = None 76 | self.duration = None 77 | self.lock = threading.RLock() 78 | if info_dict is None: 79 | self.info_dict = {} 80 | else: 81 | self.info_dict = info_dict 82 | 83 | def get_early_stop(self): 84 | """Return the early stopping flag of the trial.""" 85 | with self.lock: 86 | return self.early_stop 87 | 88 | def set_early_stop(self): 89 | """Set the early stopping flag of the trial to true.""" 90 | with self.lock: 91 | self.early_stop = True 92 | 93 | def append_metric(self, metric_data): 94 | """Append a metric from the heartbeats to the history.""" 95 | with self.lock: 96 | # from python 3.7 dicts are insertion ordered, 97 | # so two of these data structures can be removed 98 | if ( 99 | metric_data["step"] not in self.metric_dict 100 | and metric_data["value"] is not None 101 | ): 102 | self.metric_dict[metric_data["step"]] = metric_data["value"] 103 | self.metric_history.append(metric_data["value"]) 104 | self.step_history.append(metric_data["step"]) 105 | # return step number to indicate that it was a new unique step 106 | return metric_data["step"] 107 | # return None to indicate that no new step has finished 108 | return None 109 | 110 | @classmethod 111 | def _generate_id(cls, params): 112 | """ 113 | Class method to generate a hash from a hyperparameter dictionary. 114 | 115 | All keys in the dictionary have to be strings. The hash is an md5 hash 116 | truncated to 16 characters and is stable across processes.
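For illustration: since the dictionary is serialized with json.dumps(params, sort_keys=True) before hashing, key order does not matter, e.g. Trial._generate_id({"a": 1, "b": 2}) and Trial._generate_id({"b": 2, "a": 1}) return the same sixteen-character id.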
117 | 118 | :param params: Hyperparameters 119 | :type params: dictionary 120 | :raises ValueError: All hyperparameter names have to be strings. 121 | :raises ValueError: Hyperparameters need to be a dictionary. 122 | :return: Sixteen character truncated md5 hash 123 | :rtype: str 124 | """ 125 | 126 | # ensure params is a dictionary 127 | if isinstance(params, dict): 128 | # check that all keys are strings 129 | if False in set(isinstance(k, str) for k in params.keys()): 130 | raise ValueError("All hyperparameter names have to be strings.") 131 | 132 | return hashlib.md5( 133 | json.dumps(params, sort_keys=True).encode("utf-8") 134 | ).hexdigest()[:16] 135 | 136 | raise ValueError("Hyperparameters need to be a dictionary.") 137 | 138 | def to_json(self): 139 | return json.dumps(self.to_dict(), default=util.json_default_numpy) 140 | 141 | def to_dict(self): 142 | obj_dict = {"__class__": self.__class__.__name__} 143 | 144 | temp_dict = self.__dict__.copy() 145 | temp_dict.pop("lock") 146 | temp_dict.pop("start") 147 | 148 | obj_dict.update(temp_dict) 149 | 150 | return obj_dict 151 | 152 | @classmethod 153 | def from_json(cls, json_str): 154 | """Creates a Trial instance from a previously json serialized Trial 155 | object instance. 156 | 157 | :param json_str: String containing the object. 158 | :type json_str: str 159 | :raises ValueError: json_str is not a Trial object. 160 | :return: Instantiated object instance of Trial. 161 | :rtype: Trial 162 | """ 163 | 164 | temp_dict = json.loads(json_str) 165 | if temp_dict.get("__class__", None) != "Trial": 166 | raise ValueError("json_str is not a Trial object.") 167 | if temp_dict.get("params", None) is not None: 168 | instance = cls(temp_dict.get("params")) 169 | instance.trial_id = temp_dict["trial_id"] 170 | instance.status = temp_dict["status"] 171 | instance.early_stop = temp_dict.get("early_stop", False) 172 | instance.final_metric = temp_dict["final_metric"] 173 | instance.metric_history = temp_dict["metric_history"] 174 | instance.duration = temp_dict["duration"] 175 | 176 | return instance 177 | -------------------------------------------------------------------------------- /maggy/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | __version__ = "1.1.2" 18 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "MAGGY" 2 | site_description: "Official website and documentation for MAGGY - Distribution transparent Machine Learning experiments on Apache Spark." 
3 | site_author: "Logical Clocks" 4 | site_url: "https://maggy.ai" 5 | 6 | # Repository 7 | repo_name: logicalclocks/maggy 8 | repo_url: https://github.com/logicalclocks/maggy 9 | edit_uri: "" 10 | 11 | nav: 12 | - Home: 13 | - Introduction: README.md 14 | - Blogs: blogs.md 15 | - Publications: publications.md 16 | - Releases: releases.md 17 | - Contributing: CONTRIBUTING.md 18 | - Issues: https://github.com/logicalclocks/maggy/issues 19 | - Hopsworks.ai: https://hopsworks.ai/ 20 | - Getting Started: 21 | - Installation: start/install.md 22 | - Quickstart: start/quickstart.md 23 | - Hyperparameter Optimization: 24 | - Introduction: hpo/intro.md 25 | - Strategies: hpo/strategies.md 26 | - Ablation Studies: 27 | - Introduction: ablation/intro.md 28 | - Distributed Training: 29 | - Introduction: dist_training/intro.md 30 | - TensorFlow: dist_training/tensorflow.md 31 | - PyTorch: dist_training/torch.md 32 | 33 | theme: 34 | name: material 35 | favicon: assets/images/maggyfav.png 36 | logo: assets/images/whitemaggy-eye.svg 37 | icon: 38 | repo: fontawesome/brands/github 39 | font: 40 | text: "Roboto" 41 | palette: 42 | accent: orange 43 | features: 44 | - navigation.tabs 45 | - navigation.tabs.sticky 46 | 47 | extra: 48 | generator: false 49 | social: 50 | - icon: fontawesome/brands/twitter 51 | link: https://twitter.com/logicalclocks 52 | - icon: fontawesome/brands/github 53 | link: https://github.com/logicalclocks 54 | - icon: fontawesome/brands/discourse 55 | link: https://community.hopsworks.ai/ 56 | - icon: fontawesome/brands/linkedin 57 | link: https://www.linkedin.com/company/logicalclocks/ 58 | analytics: 59 | provider: google 60 | property: G-J3F4GSLKE8 61 | 62 | extra_css: 63 | - assets/css/custom.css 64 | - assets/css/version-select.css 65 | 66 | extra_javascript: 67 | - assets/javascript/version-select.js 68 | 69 | plugins: 70 | - search 71 | 72 | markdown_extensions: 73 | - admonition 74 | - codehilite 75 | - footnotes 76 | - pymdownx.tabbed: 77 | alternate_style: true 78 | - pymdownx.arithmatex 79 | - pymdownx.superfences 80 | - pymdownx.details 81 | - pymdownx.caret 82 | - pymdownx.mark 83 | - pymdownx.tilde 84 | - pymdownx.critic 85 | - toc: 86 | permalink: "#" 87 | toc_depth: 3 88 | - pymdownx.tasklist: 89 | custom_checkbox: true 90 | - markdown_include.include: 91 | base_path: docs 92 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = maggy/tests 3 | max-line-length = 80 4 | select = C,E,F,W,B,B950 5 | ignore = E203, E501, W503 6 | per-file-ignores = 7 | maggy/experiment/experiment_python.py:F403, F405 8 | maggy/experiment/experiment_pyspark.py:F403, F405 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from setuptools import setup, find_packages 19 | from importlib.machinery import SourceFileLoader 20 | 21 | 22 | version = ( 23 | SourceFileLoader("maggy.version", os.path.join("maggy", "version.py")).load_module().__version__ 24 | ) 25 | 26 | 27 | def read(fname): 28 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 29 | 30 | 31 | setup( 32 | name='maggy', 33 | version=version, 34 | install_requires=[ 35 | 'numpy>=1.19.2', 'scikit-optimize==0.9.0', 'statsmodels==0.12.2', 'scipy==1.10.0' 36 | ], 37 | extras_require={ 38 | 'pydoop': ['pydoop'], 39 | 'tf': ['tensorflow==2.4.1'], 40 | 'torch': ['torch==1.7.1'], # Should be 1.8.1 if we want to support PyTorch's ZeRO. 41 | 'zero': ['deepspeed==0.3.13', 42 | 'fairscale==0.3.0'], 43 | 'docs': [ 44 | 'mkdocs==1.5.3', 45 | 'mike==2.0.0', 46 | 'mkdocs-material==9.5.10', 47 | 'markdown-include==0.8.1', 48 | ], 49 | 'dev': [ 50 | 'black==20.8b1', 51 | 'flake8==3.9.0', 52 | 'pre-commit==2.11.1', 53 | ], 54 | 'spark': ['pyspark==2.4.3'] 55 | }, 56 | author='Moritz Meister', 57 | author_email='moritz@logicalclocks.com', 58 | description='Distribution transparent Machine Learning experiments on Apache Spark ', 59 | license='Apache License 2.0', 60 | keywords='Hyperparameter, Optimization, Distributed, Training, Keras, PyTorch, TensorFlow, Spark', 61 | url='https://github.com/logicalclocks/maggy', 62 | download_url='', 63 | packages=find_packages(), 64 | long_description=read('README.md'), 65 | long_description_content_type="text/markdown", 66 | python_requires=">=3.7", 67 | classifiers=[ 68 | 'Development Status :: 5 - Production/Stable', 69 | 'Topic :: Utilities', 70 | 'License :: OSI Approved :: Apache Software License', 71 | 'Programming Language :: Python :: 3', 72 | 'Programming Language :: Python :: 3.7', 73 | 'Intended Audience :: Developers', 74 | ] 75 | ) 76 | --------------------------------------------------------------------------------
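A closing note on the extras_require section of setup.py above: the heavier dependencies are opt-in at install time. A sketch of typical installs, written as comments to stay in the repository's language (assuming the package is published under the name given in setup.py):

# pip install maggy                # core deps: numpy, scikit-optimize, statsmodels, scipy
# pip install "maggy[tf]"          # adds tensorflow==2.4.1
# pip install "maggy[torch,zero]"  # adds torch plus deepspeed/fairscale for ZeRO support
# pip install "maggy[spark]"       # adds pyspark==2.4.3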