├── .github └── workflows │ ├── mkdocs.yml │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── CONTRIBUTING.md ├── README.md ├── ablation │ └── intro.md ├── assets │ ├── css │ │ ├── custom.css │ │ └── version-select.css │ ├── images │ │ ├── databricks_installation.png │ │ ├── firstgraph.png │ │ ├── hopsworks_installation.png │ │ ├── maggy.png │ │ ├── maggy_dt_video.png │ │ ├── maggy_hpo_video.png │ │ ├── maggyfav.png │ │ ├── scdgraph.png │ │ └── whitemaggy-eye.svg │ └── javascript │ │ └── version-select.js ├── blogs.md ├── dist_training │ ├── intro.md │ ├── tensorflow.md │ └── torch.md ├── hpo │ ├── intro.md │ └── strategies.md ├── publications.md ├── releases.md └── start │ ├── install.md │ └── quickstart.md ├── examples ├── Databricks │ ├── maggy-databricks-iris.ipynb │ └── maggy-databricks-mnist-example.ipynb └── README.md ├── maggy ├── __init__.py ├── ablation │ ├── __init__.py │ ├── ablationstudy.py │ └── ablator │ │ ├── __init__.py │ │ ├── abstractablator.py │ │ └── loco.py ├── callbacks.py ├── config │ ├── __init__.py │ ├── ablation.py │ ├── base_config.py │ ├── hyperparameter_optimization.py │ ├── lagom.py │ ├── tf_distributed.py │ └── torch_distributed.py ├── constants.py ├── core │ ├── __init__.py │ ├── config.py │ ├── environment │ │ ├── __init__.py │ │ ├── base.py │ │ ├── databricks.py │ │ ├── hopsworks.py │ │ └── singleton.py │ ├── exceptions.py │ ├── executors │ │ ├── __init__.py │ │ ├── base_executor.py │ │ ├── tf_dist_executor.py │ │ ├── torch_dist_executor.py │ │ └── trial_executor.py │ ├── experiment_driver │ │ ├── __init__.py │ │ ├── ablation_driver.py │ │ ├── base_driver.py │ │ ├── optimization_driver.py │ │ ├── python_driver.py │ │ ├── spark_driver.py │ │ ├── tf_distributed_training_driver.py │ │ └── torch_distributed_training_driver.py │ ├── patching │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── modules.py │ │ └── optim.py │ ├── reporter.py │ ├── rpc.py │ └── tf_patching │ │ ├── __init__.py │ │ └── tf_modules.py ├── earlystop │ ├── __init__.py │ ├── abstractearlystop.py │ ├── medianrule.py │ └── nostop.py ├── experiment │ ├── __init__.py │ ├── experiment.py │ ├── experiment_pyspark.py │ └── experiment_python.py ├── optimizer │ ├── __init__.py │ ├── abstractoptimizer.py │ ├── asha.py │ ├── bayes │ │ ├── __init__.py │ │ ├── acquisitions.py │ │ ├── base.py │ │ ├── gp.py │ │ └── tpe.py │ ├── gridsearch.py │ ├── randomsearch.py │ └── singlerun.py ├── pruner │ ├── __init__.py │ ├── abstractpruner.py │ └── hyperband.py ├── searchspace.py ├── tensorboard.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_maggy.py │ ├── test_randomsearch.py │ ├── test_searchspace.py │ ├── test_trial.py │ └── test_wordcount.py ├── trial.py ├── util.py └── version.py ├── mkdocs.yml ├── setup.cfg └── setup.py /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: mkdocs 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | publish-master: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.8' 18 | - name: install deps 19 | run: pip install .[dev,docs] 20 | 21 | - name: copy files 22 | run: | 23 | rm docs/CONTRIBUTING.md docs/README.md 24 | cp -f CONTRIBUTING.md docs/ 25 | cp -f README.md docs/ 26 | 27 | - name: setup git 28 | run: | 29 | git config --global user.name Mike 30 | git config --global user.email 
mike@maggy.ai 31 | 32 | - name: mike deploy master 33 | run: mike deploy --push --update-aliases master dev 34 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | stylecheck: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions/setup-python@v2 12 | with: 13 | python-version: '3.8' 14 | - name: install deps 15 | run: pip install flake8==3.9.0 black==22.3.0 pre-commit-hooks==2.4.0 16 | 17 | - name: black 18 | run: black --check maggy 19 | 20 | - name: flake8 21 | run: flake8 maggy 22 | 23 | - name: trailing-whitespace-fixer 24 | run: trailing-whitespace-fixer $(find maggy -type f) || exit 1 25 | 26 | - name: end-of-file-fixer 27 | run: end-of-file-fixer $(find maggy -type f) || exit 1 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .vscode 3 | .idea 4 | scripts/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: (^setup.py|^maggy/tests/|^docs/) 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 22.3.0 5 | hooks: 6 | - id: black 7 | language_version: python3 8 | - repo: https://gitlab.com/pycqa/flake8 9 | rev: 3.9.0 10 | hooks: 11 | - id: flake8 12 | language_version: python3 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v2.4.0 15 | hooks: 16 | - id: trailing-whitespace 17 | - id: end-of-file-fixer 18 | 
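19 | # Note: once installed with `pre-commit install`, the hooks above run on every
20 | # `git commit`; to check the whole tree manually, run: pre-commit run --all-files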
-------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Contributions are welcome! Not familiar with the codebase yet? No problem! 4 | There are many ways to contribute to open source projects: reporting bugs, 5 | helping with the documentation, spreading the word and of course, adding 6 | new features and patches. 7 | 8 | ## Reporting issues 9 | 10 | - Describe what you expected to happen. 11 | - If possible, include a [minimal, complete, and verifiable example](https://stackoverflow.com/help/mcve) to help 12 | us identify the issue. This also helps to check that the issue is not with 13 | your own code. 14 | - Describe what actually happened. Include the full traceback if there was an 15 | exception. 16 | - List your Python, Hopsworks and Maggy versions. If possible, check if this 17 | issue is already fixed in the repository. 18 | 19 | ## Contributing Code 20 | 21 | Code contributions, in the form of patches or features are welcome. In order to 22 | start developing, please follow the instructions below, to enable [pre-commit](https://pre-commit.com/) and 23 | ensure style and codechecks. 24 | 25 | ### Python Setup 26 | 27 | - Fork Maggy to your GitHub account by clicking the `Fork` button. 28 | 29 | - Clone your fork locally: 30 | 31 | ```bash 32 | git clone https://github.com/[username]/maggy.git 33 | cd maggy 34 | ``` 35 | 36 | - Add the upstream repository as a remote to update later:: 37 | 38 | ```bash 39 | git remote add upstream https://github.com/logicalclocks/maggy.git 40 | git fetch upstream 41 | ``` 42 | 43 | - Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda: 44 | 45 | ```bash 46 | python3 -m venv env 47 | . env/bin/activate 48 | # or "env\Scripts\activate" on Windows 49 | ``` 50 | 51 | or with conda: 52 | 53 | ```bash 54 | conda create --name maggy python=3.8 55 | conda activate maggy 56 | ``` 57 | 58 | verify your python version - we are using Python 3.8: 59 | 60 | ```bash 61 | python --version 62 | ``` 63 | 64 | - Install Maggy in editable mode with development dependencies:: 65 | 66 | ```bash 67 | pip install -e ".[dev]" 68 | ``` 69 | 70 | - Install pre-commit_ and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. Maggy uses pre-commit to ensure code-style and code formatting through [black](https://github.com/psf/black) and [flake8](https://gitlab.com/pycqa/flake8): 71 | 72 | ```bash 73 | pip install --user pre-commit 74 | pre-commit install 75 | ``` 76 | 77 | Afterwards, pre-commit will run whenever you commit. 78 | 79 | - To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use black and flake8, or run them via the command line: 80 | 81 | ```bash 82 | flake8 maggy 83 | black maggy 84 | ``` 85 | 86 | ### Start coding 87 | 88 | - Create a branch to identify the issue or feature you would like to work on. 89 | - Using your favorite editor, make your changes, committing as you go. 90 | - Follow [PEP8](https://pep8.org/). 91 | - Push your commits to GitHub and [create a pull request](https://help.github.com/articles/creating-a-pull-request/). 92 | - Celebrate 🎉 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 | [Badge links: Hopsworks Community, Maggy Documentation, PyPiStatus, Downloads, CodeStyle, License] 32 | 

33 | 34 | Maggy is a framework for **distribution transparent** machine learning experiments on [Apache Spark](https://spark.apache.org/). 35 | In this post, we introduce a new unified framework for writing core ML training logic as **oblivious training functions**. 36 | Maggy enables you to reuse the same training code whether training small models on your laptop or reusing the same code to scale out hyperparameter tuning or distributed deep learning on a cluster. 37 | Maggy enables the replacement of the current waterfall development process for distributed ML applications, where code is rewritten at every stage to account for the different distribution context. 38 | 39 |

40 |

41 | 42 | Maggy 43 | 44 |
Maggy uses the same distribution transparent training function in all steps of the machine learning development process.
45 |
46 |

47 | 48 | ## Quick Start 49 | 50 | Maggy uses PySpark as an engine to distribute the training processes. To get started, install Maggy in the Python environment used by your Spark Cluster, or install Maggy in your local Python environment with the `'spark'` extra, to run on Spark in local mode: 51 | 52 | ```python 53 | pip install maggy 54 | ``` 55 | 56 | The programming model consists of wrapping the code containing the model training 57 | inside a function. Inside that wrapper function provide all imports and 58 | parts that make up your experiment. 59 | 60 | Single run experiment: 61 | 62 | ```python 63 | def train_fn(): 64 | # This is your training iteration loop 65 | for i in range(number_iterations): 66 | ... 67 | # add the maggy reporter to report the metric to be optimized 68 | reporter.broadcast(metric=accuracy) 69 | ... 70 | # Return metric to be optimized or any metric to be logged 71 | return accuracy 72 | 73 | from maggy import experiment 74 | result = experiment.lagom(train_fn=train_fn, name='MNIST') 75 | ``` 76 | 77 | **lagom** is a Swedish word meaning "just the right amount". This is how MAggy 78 | uses your resources. 79 | 80 | 81 | ## Documentation 82 | 83 | Full documentation is available at [maggy.ai](https://maggy.ai/) 84 | 85 | ## Contributing 86 | 87 | There are various ways to contribute, and any contribution is welcome, please follow the 88 | CONTRIBUTING guide to get started. 89 | 90 | ## Issues 91 | 92 | Issues can be reported on the official [GitHub repo](https://github.com/logicalclocks/maggy/issues) of Maggy. 93 | 94 | ## Citation 95 | 96 | Please see our publications on [maggy.ai](https://maggy.ai/publications) to find out how to cite our work. 97 | 98 | ## Acknowledgements 99 | 100 | The development of Maggy is supported by the EU H2020 Deep Cube Project (Grant agreement ID: 101004188). 101 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Contributions are welcome! Not familiar with the codebase yet? No problem! 4 | There are many ways to contribute to open source projects: reporting bugs, 5 | helping with the documentation, spreading the word and of course, adding 6 | new features and patches. 7 | 8 | ## Reporting issues 9 | 10 | - Describe what you expected to happen. 11 | - If possible, include a [minimal, complete, and verifiable example](https://stackoverflow.com/help/mcve) to help 12 | us identify the issue. This also helps to check that the issue is not with 13 | your own code. 14 | - Describe what actually happened. Include the full traceback if there was an 15 | exception. 16 | - List your Python, Hopsworks and Maggy versions. If possible, check if this 17 | issue is already fixed in the repository. 18 | 19 | ## Contributing Code 20 | 21 | Code contributions, in the form of patches or features are welcome. In order to 22 | start developing, please follow the instructions below, to enable [pre-commit](https://pre-commit.com/) and 23 | ensure style and codechecks. 24 | 25 | ### Python Setup 26 | 27 | - Fork Maggy to your GitHub account by clicking the `Fork` button. 
28 | 29 | - Clone your fork locally: 30 | 31 | ```bash 32 | git clone https://github.com/[username]/maggy.git 33 | cd maggy 34 | ``` 35 | 36 | - Add the upstream repository as a remote to update later:: 37 | 38 | ```bash 39 | git remote add upstream https://github.com/logicalclocks/maggy.git 40 | git fetch upstream 41 | ``` 42 | 43 | - Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda: 44 | 45 | ```bash 46 | python3 -m venv env 47 | . env/bin/activate 48 | # or "env\Scripts\activate" on Windows 49 | ``` 50 | 51 | or with conda: 52 | 53 | ```bash 54 | conda create --name maggy python=3.8 55 | conda activate maggy 56 | ``` 57 | 58 | verify your python version - we are using Python 3.8: 59 | 60 | ```bash 61 | python --version 62 | ``` 63 | 64 | - Install Maggy in editable mode with development dependencies:: 65 | 66 | ```bash 67 | pip install -e ".[dev]" 68 | ``` 69 | 70 | - Install pre-commit_ and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. Maggy uses pre-commit to ensure code-style and code formatting through [black](https://github.com/psf/black) and [flake8](https://gitlab.com/pycqa/flake8): 71 | 72 | ```bash 73 | pip install --user pre-commit 74 | pre-commit install 75 | ``` 76 | 77 | Afterwards, pre-commit will run whenever you commit. 78 | 79 | - To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use black and flake8, or run them via the command line: 80 | 81 | ```bash 82 | flake8 maggy 83 | black maggy 84 | ``` 85 | 86 | ### Start coding 87 | 88 | - Create a branch to identify the issue or feature you would like to work on. 89 | - Using your favorite editor, make your changes, committing as you go. 90 | - Follow [PEP8](https://pep8.org/). 91 | - Push your commits to GitHub and [create a pull request](https://help.github.com/articles/creating-a-pull-request/). 92 | - Celebrate 🎉 93 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 | [Badge links: Hopsworks Community, Maggy Documentation, PyPiStatus, Downloads, CodeStyle, License] 32 | 

33 | 34 | Maggy is a framework for **distribution transparent** machine learning experiments on [Apache Spark](https://spark.apache.org/). 35 | In this post, we introduce a new unified framework for writing core ML training logic as **oblivious training functions**. 36 | Maggy enables you to reuse the same training code whether training small models on your laptop or reusing the same code to scale out hyperparameter tuning or distributed deep learning on a cluster. 37 | Maggy enables the replacement of the current waterfall development process for distributed ML applications, where code is rewritten at every stage to account for the different distribution context. 38 | 39 |

40 |

41 | 42 | Maggy 43 | 44 |
Maggy uses the same distribution transparent training function in all steps of the machine learning development process.
45 |
46 |

47 | 48 | ## Quick Start 49 | 50 | Maggy uses PySpark as an engine to distribute the training processes. To get started, install Maggy in the Python environment used by your Spark Cluster, or install Maggy in your local Python environment with the `'spark'` extra, to run on Spark in local mode: 51 | 52 | ```python 53 | pip install maggy 54 | ``` 55 | 56 | The programming model consists of wrapping the code containing the model training 57 | inside a function. Inside that wrapper function provide all imports and 58 | parts that make up your experiment. 59 | 60 | Single run experiment: 61 | 62 | ```python 63 | def train_fn(): 64 | # This is your training iteration loop 65 | for i in range(number_iterations): 66 | ... 67 | # add the maggy reporter to report the metric to be optimized 68 | reporter.broadcast(metric=accuracy) 69 | ... 70 | # Return metric to be optimized or any metric to be logged 71 | return accuracy 72 | 73 | from maggy import experiment 74 | result = experiment.lagom(train_fn=train_fn, name='MNIST') 75 | ``` 76 | 77 | **lagom** is a Swedish word meaning "just the right amount". This is how MAggy 78 | uses your resources. 79 | 80 | 81 | ## Documentation 82 | 83 | Full documentation is available [here](https://maggy.readthedocs.io/en/latest/). 84 | 85 | ## Contributing 86 | 87 | There are various ways to contribute, and any contribution is welcome, please follow the 88 | CONTRIBUTING guide to get started. 89 | 90 | ## Issues 91 | 92 | Issues can be reported on the official [GitHub repo](https://github.com/logicalclocks/maggy/issues) of Maggy. 93 | 94 | ## Citation 95 | 96 | Please see our publications on [maggy.ai](https://maggy.ai/publications) to find out how to cite our work. 97 | 98 | ## Acknowledgements 99 | 100 | The development of Maggy is supported by the EU H2020 Deep Cube Project (Grant agreement ID: 101004188). 
101 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #F15A24; 3 | --md-secondary-fg-color: #333333; 4 | } 5 | 6 | .md-header__button.md-logo { 7 | margin: .1rem; 8 | padding: .1rem; 9 | } 10 | 11 | .md-header__button.md-logo img, .md-header__button.md-logo svg { 12 | display: block; 13 | width: 5.2rem; 14 | height: 2rem; 15 | fill: currentColor; 16 | } 17 | 18 | .md-tabs { 19 | width: 100%; 20 | overflow: auto; 21 | color: var(--md-primary-bg-color); 22 | background-color: var(--md-secondary-fg-color); 23 | transition: background-color 250ms; 24 | } 25 | -------------------------------------------------------------------------------- /docs/assets/css/version-select.css: -------------------------------------------------------------------------------- 1 | @media only screen and (max-width:76.1875em) { 2 | } 3 | 4 | #version-selector select.form-control { 5 | appearance: none; 6 | -webkit-appearance: none; 7 | -moz-appearance: none; 8 | 9 | background-color: #F5F5F5; 10 | 11 | background-position: center right; 12 | background-repeat: no-repeat; 13 | border: 0px; 14 | border-radius: 2px; 15 | /* box-shadow: 0px 1px 3px rgb(0 0 0 / 10%); */ 16 | color: inherit; 17 | width: -webkit-fill-available; 18 | width: -moz-available; 19 | max-width: 200px; 20 | font-size: inherit; 21 | /* font-weight: 600; */ 22 | margin: 10px; 23 | overflow: hidden; 24 | padding: 7px 10px; 25 | text-overflow: ellipsis; 26 | white-space: nowrap; 27 | } 28 | 29 | #version-selector::after { 30 | content: '⌄'; 31 | font-family: inherit; 32 | font-size: 22px; 33 | margin: -35px; 34 | vertical-align: 7%; 35 | padding-bottom: 10px; 36 | } 37 | -------------------------------------------------------------------------------- /docs/assets/images/databricks_installation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/databricks_installation.png -------------------------------------------------------------------------------- /docs/assets/images/firstgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/firstgraph.png -------------------------------------------------------------------------------- /docs/assets/images/hopsworks_installation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/hopsworks_installation.png -------------------------------------------------------------------------------- /docs/assets/images/maggy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy.png -------------------------------------------------------------------------------- /docs/assets/images/maggy_dt_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy_dt_video.png 
-------------------------------------------------------------------------------- /docs/assets/images/maggy_hpo_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy_hpo_video.png -------------------------------------------------------------------------------- /docs/assets/images/maggyfav.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggyfav.png -------------------------------------------------------------------------------- /docs/assets/images/scdgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/scdgraph.png -------------------------------------------------------------------------------- /docs/assets/images/whitemaggy-eye.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | 17 | 19 | 21 | 22 | 24 | 26 | 27 | 28 | 43 | 44 | 45 | 46 | 47 | 49 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /docs/assets/javascript/version-select.js: -------------------------------------------------------------------------------- 1 | window.addEventListener("DOMContentLoaded", function() { 2 | // This is a bit hacky. Figure out the base URL from a known CSS file the 3 | // template refers to... 4 | var ex = new RegExp("/?assets/css/version-select.css$"); 5 | var sheet = document.querySelector('link[href$="version-select.css"]'); 6 | 7 | var ABS_BASE_URL = sheet.href.replace(ex, ""); 8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); 9 | 10 | function makeSelect(options, selected) { 11 | var select = document.createElement("select"); 12 | select.classList.add("form-control"); 13 | 14 | options.forEach(function(i) { 15 | var option = new Option(i.text, i.value, undefined, 16 | i.value === selected); 17 | select.add(option); 18 | }); 19 | 20 | return select; 21 | } 22 | 23 | var xhr = new XMLHttpRequest(); 24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json"); 25 | xhr.onload = function() { 26 | var versions = JSON.parse(this.responseText); 27 | 28 | var realVersion = versions.find(function(i) { 29 | return i.version === CURRENT_VERSION || 30 | i.aliases.includes(CURRENT_VERSION); 31 | }).version; 32 | 33 | var select = makeSelect(versions.map(function(i) { 34 | if (i.aliases.length > 0) { 35 | var aliasString = " [" + i.aliases.join(", ") + "]"; 36 | } else { 37 | var aliasString = ""; 38 | } 39 | return {text: i.title + aliasString, value: i.version}; 40 | }), realVersion); 41 | select.addEventListener("change", function(event) { 42 | window.location.href = ABS_BASE_URL + "/../" + this.value; 43 | }); 44 | 45 | var container = document.createElement("div"); 46 | container.id = "version-selector"; 47 | // container.className = "md-nav__item"; 48 | container.appendChild(select); 49 | 50 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); 51 | sidebar.parentNode.insertBefore(container, sidebar.nextSibling); 52 | }; 53 | xhr.send(); 54 | }); 55 | 
-------------------------------------------------------------------------------- /docs/blogs.md: -------------------------------------------------------------------------------- 1 | # Blogs 2 | -------------------------------------------------------------------------------- /docs/dist_training/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Distributed training is useful for big models that can't fit in a single machine and for very big datasets. 4 | There are several techniques available. As an example, the Mirrored Strategies replicate the models over the workers and 5 | train them using splits of the data. 6 | 7 | With Maggy, you can train a Machine Learning model in a distributed fashion without rewriting the code of the training. 8 | Distributed Training with Maggy is available on TensorFlow and PyTorch. 9 | 10 | If you want to know more on how to use Maggy for Distributed Training, you can watch the presentation in the next section. 11 | 12 | When you are ready, you can inspect an example on [TensorFlow](tensorflow.md) or [PyTorch](torch.md). 13 | 14 | ## Maggy Distributed Model Training 15 | [![Maggy Distributed Model Training](../assets/images/maggy_dt_video.png)]( 16 | https://www.youtube.com/watch?v=1SHOwl37I5c) 17 | -------------------------------------------------------------------------------- /docs/dist_training/tensorflow.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Using maggy for Distributed Training works as follows: 4 | 5 | * Optionally, define a model generator object, similarly to what is done for Ablation Studies. 6 | ```py 7 | class MyModel(tf.keras.Model): 8 | 9 | def __init__(self, ...): 10 | super().__init__() 11 | ... 12 | 13 | def call(self, ...): 14 | ... 15 | 16 | ... 17 | ``` 18 | * Optionally, define your train and test datasets, these will be sharded by Maggy. 19 | ```py 20 | # Extract the data 21 | (x_train, y_train),(x_test, y_test) = split_dataset(dataset) 22 | 23 | # Do some preprocessing operations 24 | ... 25 | ``` 26 | * Define a training function containing the training logic. 27 | ```py 28 | def training_function(model, train_set, test_set, hparams): 29 | #training and testing logic 30 | ... 31 | ``` 32 | 33 | * Create the configuration object and run the optimization. 34 | ```py 35 | config = TfDistributedConfig(name="tf_test", 36 | model=model, 37 | train_set=(x_train, y_train), 38 | test_set=(x_test, y_test), 39 | hparams=model_parameters), 40 | ... 41 | ) 42 | 43 | experiment.lagom(train_fn=training_function, config=config) 44 | ``` 45 | There are many parameters for the configuration object: 46 | * model: A tf.keras.Model superclass or list of them. 47 | Note that this has to be the class itself, not an instance. 48 | * train_set: The training set for the training function. If you want to load the set 49 | inside the training function, this can be disregarded. 50 | * test_set: The test set for the training function. If you want to load the set 51 | inside the training function, this can be disregarded. 52 | * process_data: The function for processing the data 53 | * hparams: model parameters that should be used during model initialization. Primarily 54 | used to give an interface for hp optimization. 55 | * name: Experiment name. 56 | * hb_interval: Heartbeat interval with which the server is polling. 57 | * description: A description of the experiment. 
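58 | 
59 | Putting these pieces together, the following is a minimal end-to-end sketch rather than a definitive recipe: the Keras model, the MNIST preprocessing and the `nlayers` hyperparameter are illustrative assumptions, and the sketch assumes the training function instantiates the model class it receives with the supplied `hparams`.
60 | ```py
61 | import tensorflow as tf
62 | 
63 | from maggy import experiment
64 | from maggy.config import TfDistributedConfig
65 | 
66 | 
67 | class MyModel(tf.keras.Model):
68 |     def __init__(self, nlayers=2):
69 |         super().__init__()
70 |         self.hidden = [tf.keras.layers.Dense(64, activation="relu") for _ in range(nlayers)]
71 |         self.out = tf.keras.layers.Dense(10)
72 | 
73 |     def call(self, inputs):
74 |         x = inputs
75 |         for layer in self.hidden:
76 |             x = layer(x)
77 |         return self.out(x)
78 | 
79 | 
80 | def training_function(model, train_set, test_set, hparams):
81 |     # Assumption: maggy hands the model class through unchanged, so build an
82 |     # instance with the hyperparameters from the config.
83 |     keras_model = model(**hparams)
84 |     keras_model.compile(
85 |         optimizer="adam",
86 |         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
87 |         metrics=["accuracy"],
88 |     )
89 |     x_train, y_train = train_set
90 |     x_test, y_test = test_set
91 |     keras_model.fit(x_train, y_train, epochs=2, batch_size=64)
92 |     _, accuracy = keras_model.evaluate(x_test, y_test)
93 |     # Return the metric to be logged for this run
94 |     return accuracy
95 | 
96 | 
97 | # Illustrative preprocessing: flatten and normalize MNIST images
98 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
99 | x_train = x_train.reshape(-1, 784).astype("float32") / 255.0
100 | x_test = x_test.reshape(-1, 784).astype("float32") / 255.0
101 | 
102 | config = TfDistributedConfig(
103 |     name="tf_test",
104 |     model=MyModel,
105 |     train_set=(x_train, y_train),
106 |     test_set=(x_test, y_test),
107 |     hparams={"nlayers": 2},
108 | )
109 | 
110 | result = experiment.lagom(train_fn=training_function, config=config)
111 | ```
112 | 
113 | Maggy shards the supplied train and test sets across the workers, so the same training function can run unchanged whether it is executed locally or on a cluster.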
-------------------------------------------------------------------------------- /docs/dist_training/torch.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Maggy enables you to train with Microsoft’s DeepSpeed ZeRO optimizer. Since DeepSpeed does not follow the common 4 | PyTorch programming model, Maggy is unable to provide full distribution transparency to the user. 5 | This means that if you want to use DeepSpeed for your training, you will have to make small changes 6 | to your code. In this notebook, we will show you what exactly you have to change in order to make 7 | DeepSpeed run with Maggy. 8 | 9 | * First off, we have to define our model as we did for TensorFlow and Ablation studies. 10 | ```py 11 | class MyModel(torch.nn.Module): 12 | 13 | def __init__(self, ...): 14 | super().__init__(...) 15 | ... 16 | 17 | def forward(self, ...): 18 | ... 19 | ``` 20 | 21 | * There are a few minor changes that have to be done in order to train with DeepSpeed: - There is no need for an 22 | optimizer anymore. You can configure your optimizer later in the DeepSpeed config. - DeepSpeed’s ZeRO requires you to 23 | use FP16 training. Therefore, convert your data to half precision! - The backward call is not executed on the loss, 24 | but on the model (```model.backward(loss)``` instead of ```loss.backward()```). - 25 | The step call is not executed on the optimizer, 26 | but also on the model (```model.step()``` instead of ```optimizer.step()```). - 27 | As we have no optimizer anymore, there is also 28 | no need to call ```optimizer.zero_grad()```. 29 | You do not have to worry about the implementation of these calls, 30 | Maggy configures your model at runtime to act as a DeepSpeed engine. 31 | ```py 32 | def train_fn(...): 33 | ... 34 | ``` 35 | 36 | * In order to use DeepSpeed’s ZeRO, the deepspeed backend has to be chosen. This 37 | backend also requires its own config. You can read a full specification of the possible settings 38 | [here](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training). 39 | ```py 40 | ds_config = {"train_micro_batch_size_per_gpu": 1, 41 | "gradient_accumulation_steps": 1, 42 | "optimizer": {"type": "Adam", "params": {"lr": 0.1}}, 43 | "fp16": {"enabled": True}, 44 | "zero_optimization": {"stage": 2}, 45 | } 46 | 47 | config = TorchDistributedConfig(module=MyModel, backend="deepspeed", deepspeed_config=ds_config, ...) 48 | ``` 49 | 50 | * Start the training with ```lagom()``` 51 | ```py 52 | result = experiment.lagom(train_fn, config) 53 | ``` -------------------------------------------------------------------------------- /docs/hpo/intro.md: -------------------------------------------------------------------------------- 1 | 2 | # Introduction 3 | 4 | Maggy is a framework for asynchronous trials and early-stopping with global knowledge, guided by an Optimizer. 5 | Developers can use an existing Optimizer, such as asynchronous successive halving (ASHA), or provide their own one. 6 | The basic approach we followed was to add support for the Driver and Executors to communicate via RPCs. 7 | The Optimizer that guides hyperparameter search is located on the Driver, and it assigns trials to Executors. 8 | Executors periodically send back to the Driver the current performance of their trial, 9 | and the Optimizer can decide to early-stop its ongoing trial, followed by sending the Executor with a new trial. 
10 | Because of the impedance mismatch between trials and the stage-/task-based execution model of Spark, 11 | we are blocking Executors with long-running tasks to run multiple trials per task. 12 | In this way, Executors are always kept busy running trials, and global information needed for efficient 13 | early-stopping is aggregated in the Optimizer. 14 | If you want to know more about Maggy for Hyperparameter Optimization (HPO), you can watch the presentation in the video posted below. 15 | Otherwise, if you feel ready to explore more details, you can jump to the [strategies](strategies.md) section. 16 | 17 | 18 | ## Spark/AI summit presentation of Maggy for HPO 19 | [![Maggy Parallel Hyperparameter Optimization](../assets/images/maggy_hpo_video.png)](https://www.youtube.com/watch?v=0Hd1iYEL03w) 20 | -------------------------------------------------------------------------------- /docs/hpo/strategies.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Using maggy for Hyperparameter Optimization (HPO) works as follows: 4 | 5 | * Define a training function containing the training logic. 6 | ```py 7 | def training_function(model, train_set, test_set, hparams): 8 | #training and testing logic 9 | ... 10 | ``` 11 | 12 | * Define a search space, containing the hparams we want to optimize, their type and range. 13 | ```py 14 | #define the hyperparemeters to optimize, together with their possible values 15 | sp = Searchspace(kernel=('DISCRETE', [2, 8]), pool=('DISCRETE', [2, 8]), dropout=('DISCRETE', [0.01, 0.99])) 16 | ``` 17 | 18 | * Create the configuration object and run the optimization. 19 | ```py 20 | config = OptimizationConfig(num_trials=4, 21 | optimizer="randomsearch", 22 | searchspace=sp, 23 | direction="max", 24 | es_interval=1, 25 | es_min=5, 26 | name="hp_tuning_test") 27 | 28 | experiment.lagom(train_fn=training_function, config=config) 29 | ``` 30 | There are many parameters for the configuration object: 31 | * num_trials: Controls how many seperate runs are conducted during the hp search. 32 | * optimizer: Optimizer type for searching the hp searchspace. 33 | * searchspace: A Searchspace object configuring the names, types and ranges of hps. 34 | * optimization_key: Name of the metric to use for hp search evaluation. 35 | * direction: Direction of optimization. 36 | * es_interval: Early stopping polling frequency during an experiment run. 37 | * es_min: Minimum number of experiments to conduct before starting the early stopping 38 | mechanism. Useful to establish a baseline for performance estimates. 39 | * es_policy: Early stopping policy which formulates a rule for triggering aborts. 40 | * name: Experiment name. 41 | * description: A description of the experiment. 42 | * hb_interval: Heartbeat interval with which the server is polling. 43 | * model: The class of the model to be used in the training function. 44 | * train_set: The train_set to be used in the training function. 45 | * test_set: The test_set to be used in the training function. 46 | 47 | # Strategies 48 | 49 | ### Random Search 50 | 51 | With Random Search, the hparams are selected randomly within the search space defined. The search space is defined 52 | depending on how many trials (_num_trials_) you choose. 53 | 54 | In the following example, _num_trials_ is set to 4, therefore, Maggy will choose randomly 4 combinations of kernel, 55 | pool and dropout values. 56 | ```py 57 | def training_function(hparams): 58 | #training and testing logic 59 | ... 
60 | #define the hyperparemeters to optimize, together with their possible values 61 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 62 | 63 | config = OptimizationConfig(num_trials=4, 64 | optimizer="randomsearch", 65 | searchspace=sp, 66 | direction="max", 67 | es_interval=1, 68 | es_min=5, 69 | name="hp_tuning_test") 70 | 71 | #run optimization 72 | result = experiment.lagom(train_fn=training_function, config=config) 73 | ``` 74 | ### Grid Search 75 | 76 | ```py 77 | def training_function(): 78 | #training and testing logic 79 | ... 80 | #define the hyperparemeters to optimize, together with their possible values 81 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 82 | 83 | config = OptimizationConfig(num_trials=4, 84 | optimizer="gridsearch", 85 | searchspace=sp, 86 | direction="max", 87 | es_interval=1, 88 | es_min=5, 89 | name="hp_tuning_test") 90 | 91 | #run optimization 92 | result = experiment.lagom(train_fn=training_function, config=config) 93 | ``` 94 | 95 | ### Asynchronous Successive Halving Algorithm (ASHA) 96 | 97 | This strategy is a combination of random search and early stopping. 98 | ASHA tackles large-scale hyperparameter optimization problems, and it is especially useful for challenges that need a 99 | high number of parallelism (i.e. there are a lot of hparams and a lot of workers are available). 100 | 101 | ```py 102 | def training_function(): 103 | #training and testing logic 104 | ... 105 | #define the hyperparemeters to optimize, together with their possible values 106 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 107 | 108 | config = OptimizationConfig(num_trials=4, 109 | optimizer='asha', 110 | searchspace=sp, 111 | direction="max", 112 | es_interval=1, 113 | es_min=5, 114 | name="hp_tuning_test") 115 | 116 | experiment.lagom(train_fn=training_function, config=config) 117 | ``` 118 | 119 | you can define custom ASHA optimizers by setting 3 parameters: _reduction_factor, resource_min_ and _resource_max_. 120 | The standard values are 2, 1, and 4, respectively. 121 | To use custom values, import the class _Asha_ from _maggy.optimizer_ and create the object with custom 122 | parameters. 123 | 124 | ```py 125 | from maggy.optimizer import Asha 126 | 127 | asha = Asha(3,1,10) 128 | config = OptimizationConfig(..., 129 | optimizer=asha, 130 | ...) 131 | ``` 132 | 133 | ### Bayesian Optimization 134 | 135 | WIth Bayesian Optimization (BO), the hparams are chosen based on the space of the hparams. 136 | In order to do that, the algorithm infer a function of the hparams in order to optimize the cost function of a given model. 137 | 138 | There are two different BO methods available in Maggy, namely Gaussian Process (GP) and Tree Parzen Estimators (TPE). 139 | The GP is a tool used to infer the value of a function in which predictions follow a normal distribution. 140 | We use that set of predictions and pick new points where we should evaluate next. From that new point, we add it to 141 | the samples and re-build the Gaussian Process with that new information… 142 | We keep doing this until we reach the maximal number of iterations or the limit time for example. 143 | TPE is an iterative process that uses history of evaluated hyperparameters to create probabilistic model, 144 | which is used to suggest next set of hyperparameters to evaluate. 
145 | 146 | 147 | ```py 148 | def training_function(): 149 | #training and testing logic 150 | ... 151 | #define the hyperparemeters to optimize, together with their possible values 152 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 153 | 154 | config = OptimizationConfig(num_trials=4, 155 | optimizer='gp', #or 'tpe' 156 | searchspace=sp, 157 | direction="max", 158 | es_interval=1, 159 | es_min=5, 160 | name="hp_tuning_test") 161 | 162 | experiment.lagom(train_fn=training_function, config=config) 163 | ``` -------------------------------------------------------------------------------- /docs/publications.md: -------------------------------------------------------------------------------- 1 | # Publications 2 | 3 | If you use Maggy for research, or write about Maggy please cite the following papers: 4 | 5 | ## Maggy Hyperparameter Optimization 6 | 7 | ### Maggy: Scalable Asynchronous Parallel Hyperparameter Search 8 | 9 | #### Authors 10 | 11 | Moritz Meister, Sina Sheikholeslami, Amir H. Payberah, Vladimir Vlassov, Jim Dowling 12 | 13 | #### Abstract 14 | 15 | Running extensive experiments is essential for building Machine Learning (ML) models. Such experiments usually require iterative execution of many trials with varying run times. In recent years, Apache Spark has become the de-facto standard for parallel data processing in the industry, in which iterative processes are im- plemented within the bulk-synchronous parallel (BSP) execution model. The BSP approach is also being used to parallelize ML trials in Spark. However, the BSP task synchronization barriers prevent asynchronous execution of trials, which leads to a reduced number of trials that can be run on a given computational budget. In this paper, we introduce Maggy, an open-source framework based on Spark, to execute ML trials asynchronously in parallel, with the ability to early stop poorly performing trials. In the experiments, we compare Maggy with the BSP execution of parallel trials in Spark and show that on random hyperparameter search on a con- volutional neural network for the Fashion-MNIST dataset Maggy reduces the required time to execute a fixed number of trials by 33% to 58%, without any loss in the final model accuracy. 16 | 17 | [Download Paper](https://content.logicalclocks.com/hubfs/Maggy%20Scalable%20Asynchronous%20Parallel%20Hyperparameter%20Search.pdf) 18 | 19 | #### Cite 20 | 21 | ``` 22 | @inproceedings{10.1145/3426745.3431338, 23 | author = {Meister, Moritz and Sheikholeslami, Sina and Payberah, Amir H. and Vlassov, Vladimir and Dowling, Jim}, 24 | title = {Maggy: Scalable Asynchronous Parallel Hyperparameter Search}, 25 | year = {2020}, 26 | isbn = {9781450381826}, 27 | publisher = {Association for Computing Machinery}, 28 | address = {New York, NY, USA}, 29 | url = {https://doi.org/10.1145/3426745.3431338}, 30 | doi = {10.1145/3426745.3431338}, 31 | booktitle = {Proceedings of the 1st Workshop on Distributed Machine Learning}, 32 | pages = {28–33}, 33 | numpages = {6}, 34 | keywords = {Scalable Hyperparameter Search, Machine Learning, Asynchronous Hyperparameter Optimization}, 35 | location = {Barcelona, Spain}, 36 | series = {DistributedML'20} 37 | } 38 | ``` 39 | 40 | ## Oblivious Training Functions 41 | 42 | ### Towards Distribution Transparency for Supervised ML With Oblivious Training Functions 43 | 44 | #### Authors 45 | 46 | Moritz Meister, Sina Sheikholeslami, Robin Andersson, Alexandru A. 
Ormenisan, Jim Dowling 47 | 48 | #### Abstract 49 | 50 | Building and productionizing Machine Learning (ML) models is a process of interdependent steps of iterative code updates, including exploratory model design, hyperparameter tuning, ablation experiments, and model training. Industrial-strength ML involves doing this at scale, using many compute resources, and this requires rewriting the training code to account for distribution. The result is that moving from a single host program to a cluster hinders iterative development of the software, as iterative development would require multiple versions of the software to be maintained and kept consistent. In this paper, we introduce the distribution oblivious training function as an abstraction for ML development in Python, whereby developers can reuse the same training function when running a notebook on a laptop or performing scale-out hyperparameter search and distributed training on clusters. Programs written in our framework look like industry-standard ML programs as we factor out dependencies using best-practice programming idioms (such as functions to generate models and data batches). We believe that our approach takes a step towards unifying single-host and distributed ML development. 51 | 52 | [Download Paper](https://content.logicalclocks.com/hubfs/research/oblivious-training_mlsys20.pdf) 53 | 54 | #### Cite 55 | 56 | ``` 57 | @inproceedings{oblivious-mlops, 58 | author = {Meister, Moritz and Sheikholeslami, Sina and Andersson, Robin and Ormenisan, Alexandru A. and Dowling, Jim}, 59 | title = {Towards Distribution Transparency for Supervised ML With Oblivious Training Functions}, 60 | year = {2020}, 61 | booktitle = {MLSys ’20: Workshop on MLOps Systems, March 02–04}, 62 | location = {Austin, Texas, USA} 63 | } 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/releases.md -------------------------------------------------------------------------------- /docs/start/install.md: -------------------------------------------------------------------------------- 1 | # Installing Maggy in your laptop 2 | 3 | Maggy is available via pip. 4 | 5 | Simply run the following commnad in your terminal or conda environment. 6 | 7 | ``` 8 | pip install maggy 9 | ``` 10 | 11 | If you want to use another version of Maggy, you can run the following command. 12 | 13 | ``` 14 | pip install maggy==x.y.z 15 | ``` 16 | 17 | The available versions are listed in PyPi https://pypi.org/project/maggy/ . 18 | 19 | # Installing Maggy in Hopsworks 20 | 21 | If you are using Hopsworks, Maggy should be already installed and ready to be used. 22 | 23 | However, it is possible to check the installation from the platform by entering a project, 24 | then navigate to the "Python" section from the sidebar and click on "Manage Environment" 25 | on the top bar. Finally, search for "Maggy". 26 | 27 | If you want to change the version of Maggy, click on "Install" in the top bar and type "Maggy" 28 | on the search input. Finally, select the version you want to install and click. 29 | The progress of the installation is displayed in the "Ongoing Operations" section. 
30 | 31 | ![](../assets/images/hopsworks_installation.png) 32 | 33 | 34 | 35 | # Installing Maggy in Databricks 36 | 37 | 38 | It is very simple to install Maggy in your Databricks cluster. 39 | From your project, click on Libraries in the navigation bar and Install New, at this point it is possible to install 40 | the latest release of Maggy in the PyPi section. In order to do that, just write "maggy" in the Package input section. 41 | 42 | You can install other version of Maggy by uploading the wheel on the Upload section. 43 | 44 | ![It is easy to install Maggy in Databricks, just click on Libraries in the navigation bar and then click 45 | on Install New. Finally, write "maggy" on the Package input in the PyPi section. 46 | ](../assets/images/databricks_installation.png "Maggy Installation in Databricks") 47 | 48 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 |

9 | In this folder you will find example notebooks for Maggy on Databricks environments. 10 |

11 |

12 | If you are interested in using Maggy on Hopsworks or in local environments, please check the example notebooks at the following link: 13 | 

14 |

15 | 16 | Maggy Examples 17 | 18 |

19 |

20 | -------------------------------------------------------------------------------- /maggy/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy import searchspace 18 | 19 | Searchspace = searchspace.Searchspace 20 | 21 | __all__ = ["Searchspace"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation import ablationstudy 18 | 19 | AblationStudy = ablationstudy.AblationStudy 20 | 21 | __all__ = ["AblationStudy"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation.ablator import abstractablator 18 | 19 | AbstractAblator = abstractablator.AbstractAblator 20 | 21 | __all__ = ["AbstractAblator"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/abstractablator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | 19 | 20 | class AbstractAblator(ABC): 21 | def __init__(self, ablation_study, final_store): 22 | self.ablation_study = ablation_study 23 | self.final_store = final_store 24 | self.trial_buffer = [] 25 | 26 | @abstractmethod 27 | def get_number_of_trials(self): 28 | """ 29 | If applicable, calculate and return the total number of trials of the ablation experiment. 30 | Make sure to also include the base (reference) trial in the count. 31 | 32 | :return: total number of trials of the ablation study experiment 33 | :rtype: int 34 | """ 35 | pass 36 | 37 | @abstractmethod 38 | def get_dataset_generator(self, ablated_feature, dataset_type="tfrecord"): 39 | """ 40 | Create and return a dataset generator function based on the ablation policy to be used in a trial. 41 | The returned function will be executed on the executor per each trial. 42 | 43 | :param ablated_feature: the name of the feature to be excluded from the training dataset. 44 | Must match a feature name in the corresponding feature group in the feature store. 45 | :type ablated_feature: str 46 | :param dataset_type: type of the dataset. For now, we only support 'tfrecord'. 47 | :return: A function that generates a TFRecordDataset 48 | :rtype: function 49 | """ 50 | pass 51 | 52 | @abstractmethod 53 | def get_model_generator(self, ablated_layer): 54 | pass 55 | 56 | @abstractmethod 57 | def initialize(self): 58 | """ 59 | Initialize the ablation study experiment by generating a number of trials. Depending on the ablation policy, 60 | this method might generate all the trials (e.g. as in LOCO), or generate a number of trials to warm-start the 61 | experiment. The trials should be added to `trial_buffer` in form of `Trial` objects. 62 | """ 63 | pass 64 | 65 | @abstractmethod 66 | def get_trial(self, ablation_trial=None): 67 | """ 68 | Return a `Trial` to be assigned to an executor, or `None` if there are no trials remaining in the experiment. 69 | The trial should contain a dataset generator and a model generator. 70 | Depending on the ablator policy, the trials could come from a list (buffer) of pre-made trials, 71 | or generated on the fly. 72 | 73 | :rtype: `Trial` or `None` 74 | """ 75 | pass 76 | 77 | @abstractmethod 78 | def finalize_experiment(self, trials): 79 | """ 80 | This method will be called before finishing the experiment. Developers can implement this method 81 | e.g. for cleanup or extra logging. 82 | """ 83 | pass 84 | 85 | def name(self): 86 | return str(self.__class__.__name__) 87 | -------------------------------------------------------------------------------- /maggy/callbacks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import tensorflow as tf 18 | 19 | 20 | class KerasBatchEnd(tf.keras.callbacks.Callback): 21 | """A Keras callback reporting a specified `metric` at the end of the batch 22 | to the maggy experiment driver. 23 | 24 | `loss` is always available as a metric, and optionally `acc` (if accuracy 25 | monitoring is enabled, that is, accuracy is added to keras model metrics). 26 | Validation metrics are not available for the BatchEnd callback. Validation 27 | after every batch would be too expensive. 28 | Default is training loss (`loss`). 29 | 30 | Example usage: 31 | 32 | >>> from maggy.callbacks import KerasBatchEnd 33 | >>> callbacks = [KerasBatchEnd(reporter, metric='acc')] 34 | """ 35 | 36 | def __init__(self, reporter, metric="loss"): 37 | super().__init__() 38 | self.metric_name = metric 39 | self.reporter = reporter 40 | 41 | def on_batch_end(self, batch, logs={}): 42 | self.reporter.broadcast(logs.get(self.metric_name, 0)) 43 | 44 | 45 | class KerasEpochEnd(tf.keras.callbacks.Callback): 46 | """A Keras callback reporting a specified `metric` at the end of an epoch 47 | to the maggy experiment driver. 48 | 49 | `val_loss` is always available as a metric, and optionally `val_acc` (if 50 | accuracy monitoring is enabled, that is, accuracy is added to keras model 51 | metrics). Training metrics are available under the names `loss` and `acc`. 52 | Default is validation loss (`val_loss`). 53 | 54 | Example usage: 55 | 56 | >>> from maggy.callbacks import KerasBatchEnd 57 | >>> callbacks = [KerasBatchEnd(reporter, metric='val_acc')] 58 | """ 59 | 60 | def __init__(self, reporter, metric="val_loss"): 61 | super().__init__() 62 | self.metric_name = metric 63 | self.reporter = reporter 64 | 65 | def on_epoch_end(self, epoch, logs={}): 66 | self.reporter.broadcast(logs.get(self.metric_name, 0), epoch) 67 | -------------------------------------------------------------------------------- /maggy/config/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.config.lagom import LagomConfig 18 | from maggy.config.base_config import BaseConfig 19 | from maggy.config.ablation import AblationConfig 20 | from maggy.config.hyperparameter_optimization import HyperparameterOptConfig 21 | from maggy.config.torch_distributed import TorchDistributedConfig 22 | from maggy.config.tf_distributed import TfDistributedConfig 23 | 24 | __all__ = [ 25 | "LagomConfig", 26 | "BaseConfig", 27 | "AblationConfig", 28 | "HyperparameterOptConfig", 29 | "TfDistributedConfig", 30 | "TorchDistributedConfig", 31 | ] 32 | -------------------------------------------------------------------------------- /maggy/config/ablation.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from typing import Union, List 20 | 21 | from maggy.ablation.ablationstudy import AblationStudy 22 | from maggy.ablation.ablator import AbstractAblator 23 | from maggy.config import LagomConfig 24 | import tensorflow as tf 25 | from maggy.core import config as mc 26 | 27 | 28 | class AblationConfig(LagomConfig): 29 | """Config class for ablation study experiments.""" 30 | 31 | def __init__( 32 | self, 33 | ablation_study: AblationStudy, 34 | ablator: Union[str, AbstractAblator] = "loco", 35 | direction: str = "max", 36 | name: str = "ablationStudy", 37 | description: str = "", 38 | hb_interval: int = 1, 39 | model: tf.keras.Model = None, 40 | dataset: List[Union[str, tf.data.Dataset]] = None, 41 | ): 42 | """Initializes ablation study experiment parameters. 43 | 44 | :param ablation_study: Ablation study object that defines the entry point into the 45 | experiment. 46 | :param ablator: An instance of `AbstractAblator` or a supported ablator name that controls 47 | the manner in which parts of the model are ablated. 48 | :param direction: Optimization direction to evaluate the experiments. 49 | :param name: Experiment name. 50 | :param description: A description of the experiment. 51 | :param hb_interval: Heartbeat interval with which the server is polling. 52 | :param model: The class of the model to be used in the training function. 53 | :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset. 54 | These datasets represent the ones you are going to use in the training function. 55 | For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and 56 | extract them in the training function. If you want to load the set inside the training function, this can be 57 | disregarded. 
58 | """ 59 | super().__init__(name, description, hb_interval) 60 | mc.initialize() 61 | if not mc.is_spark_available(): 62 | raise NotImplementedError("Ablation Study can run only on a Spark kernel.") 63 | self.ablator = ablator 64 | self.ablation_study = ablation_study 65 | self.direction = direction 66 | self.model = model 67 | self.dataset = dataset 68 | -------------------------------------------------------------------------------- /maggy/config/base_config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from maggy.config import LagomConfig 20 | from maggy.core import config as mc 21 | 22 | 23 | class BaseConfig(LagomConfig): 24 | def __init__( 25 | self, 26 | name: str = "base", 27 | hb_interval: int = 1, 28 | description: str = "", 29 | ): 30 | 31 | """Initializes Base configuration. 32 | 33 | :param name: Experiment name. 34 | :param hb_interval: Heartbeat interval with which the server is polling. 35 | :param description: A description of the experiment. 36 | """ 37 | super().__init__(name, description, hb_interval) 38 | mc.initialize() 39 | -------------------------------------------------------------------------------- /maggy/config/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | import tensorflow as tf
22 |
23 | if typing.TYPE_CHECKING:
24 |     import torch
25 |
26 | from maggy import Searchspace
27 | from maggy.earlystop import AbstractEarlyStop
28 | from maggy.optimizer import AbstractOptimizer
29 | from maggy.config import LagomConfig
30 | from maggy.core import config as mc
31 |
32 |
33 | class HyperparameterOptConfig(LagomConfig):
34 |     """Config class for hyperparameter optimization experiments."""
35 |
36 |     def __init__(
37 |         self,
38 |         num_trials: int,
39 |         optimizer: Union[str, AbstractOptimizer],
40 |         searchspace: Searchspace,
41 |         optimization_key: str = "Metric",
42 |         direction: str = "max",
43 |         es_interval: int = 1,
44 |         es_min: int = 10,
45 |         es_policy: Union[str, AbstractEarlyStop] = "median",
46 |         name: str = "HPOptimization",
47 |         description: str = "",
48 |         hb_interval: int = 1,
49 |         model: Union[
50 |             tf.keras.Model, Type[torch.nn.Module], List[Type[torch.nn.Module]]
51 |         ] = None,
52 |         dataset: List[
53 |             Optional[Union[str, tf.data.Dataset, torch.utils.data.Dataset]]
54 |         ] = None,
55 |     ):
56 |         """Initializes HP optimization experiment parameters.
57 |
58 |         :param num_trials: Controls how many separate runs are conducted during the hp search.
59 |         :param optimizer: Optimizer type for searching the hp searchspace.
60 |         :param searchspace: A Searchspace object configuring the names, types and ranges of hps.
61 |         :param optimization_key: Name of the metric to use for hp search evaluation.
62 |         :param direction: Direction of optimization.
63 |         :param es_interval: Early stopping polling frequency during an experiment run.
64 |         :param es_min: Minimum number of experiments to conduct before starting the early stopping
65 |             mechanism. Useful to establish a baseline for performance estimates.
66 |         :param es_policy: Early stopping policy which formulates a rule for triggering aborts.
67 |         :param name: Experiment name.
68 |         :param description: A description of the experiment.
69 |         :param hb_interval: Heartbeat interval with which the server is polling.
70 |         :param model: The class of the model to be used in the training function.
71 |         :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset or
72 |             torch.utils.data.Dataset. These datasets represent the ones you are going to use in the training function.
73 |             For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and
74 |             extract them in the training function. If you want to load the set inside the training function, this can be
75 |             disregarded.
76 |         """
77 |         super().__init__(name, description, hb_interval)
78 |         if not mc.is_spark_available():
79 |             raise NotImplementedError(
80 |                 "Hyperparameter Optimization can run only on a Spark kernel."
81 |             )
82 |         if not num_trials > 0:
83 |             raise ValueError("Number of trials should be greater than zero!")
84 |         self.num_trials = num_trials
85 |         self.optimizer = optimizer
86 |         self.optimization_key = optimization_key
87 |         self.searchspace = searchspace
88 |         self.direction = direction
89 |         self.es_policy = es_policy
90 |         self.es_interval = es_interval
91 |         self.es_min = es_min
92 |         self.model = model
93 |         self.dataset = dataset
94 |
--------------------------------------------------------------------------------
/maggy/config/lagom.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | from abc import ABC
20 |
21 |
22 | class LagomConfig(ABC):
23 |     """Base class for lagom configuration classes."""
24 |
25 |     def __init__(self, name: str, description: str, hb_interval: int):
26 |         """Initializes basic experiment info.
27 |
28 |         :param name: Experiment name.
29 |         :param description: A description of the experiment.
30 |         :param hb_interval: Heartbeat interval with which the server is polling.
31 |         """
32 |         self.name = name
33 |         self.description = description
34 |         self.hb_interval = hb_interval
35 |
--------------------------------------------------------------------------------
/maggy/config/tf_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | from typing import Union, Callable, List, Optional
20 |
21 | from maggy.config import LagomConfig
22 |
23 | import tensorflow as tf
24 |
25 |
26 | class TfDistributedConfig(LagomConfig):
27 |     def __init__(
28 |         self,
29 |         model: tf.keras.Model = None,
30 |         dataset: List[Optional[Union[str, tf.data.Dataset]]] = None,
31 |         process_data: Callable = None,
32 |         mixed_precision: bool = False,
33 |         name: str = "tfDist",
34 |         hb_interval: int = 1,
35 |         description: str = "",
36 |         hparams: dict = None,
37 |     ):
38 |
39 |         """Initializes TensorFlow distributed training parameters.
40 |
41 |         :param model: A tf.keras.Model subclass or list of them.
42 |             Note that this has to be the class itself, not an instance.
43 |         :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset.
44 |             These datasets represent the ones you are going to use in the training function. For example,
45 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
46 |             the training function. If you want to load the set inside the training function, this can be disregarded.
47 |         :param process_data: The function for processing the data.
48 |         :param hparams: model parameters that should be used during model initialization. Primarily
49 |             used to give an interface for hp optimization.
50 |         :param name: Experiment name.
51 |         :param hb_interval: Heartbeat interval with which the server is polling.
52 |         :param description: A description of the experiment.
53 |         """
54 |         super().__init__(name, description, hb_interval)
55 |         self.model = model
56 |         self.dataset = dataset
57 |         self.process_data = process_data
58 |         self.mixed_precision = mixed_precision
59 |         self.hparams = hparams if hparams else {}
60 |
--------------------------------------------------------------------------------
/maggy/config/torch_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | from maggy.config import LagomConfig
22 | from maggy.core import config as mc
23 |
24 | if typing.TYPE_CHECKING:
25 |     import torch
26 |
27 |
28 | class TorchDistributedConfig(LagomConfig):
29 |     """LagomConfig class for running distributed PyTorch training."""
30 |
31 |     BACKENDS = ["torch", "deepspeed"]
32 |
33 |     def __init__(
34 |         self,
35 |         module: Union[Type[torch.nn.Module], List[Type[torch.nn.Module]]],
36 |         dataset: List[Optional[Union[str, torch.utils.data.Dataset]]] = None,
37 |         hparams: dict = None,
38 |         backend: str = "torch",
39 |         mixed_precision: bool = False,
40 |         zero_lvl: int = 0,
41 |         deepspeed_config: dict = None,
42 |         name: str = "torchDist",
43 |         hb_interval: int = 1,
44 |         description: str = "",
45 |     ):
46 |         """Initializes PyTorch distributed training parameters.
47 |
48 |         :param module: A PyTorch module class or list of PyTorch module classes.
49 |             Note that this has to be the class itself, not an instance.
50 |         :param dataset: A List of strings containing the dataset path or list of torch.utils.data.Dataset.
51 |             These datasets represent the ones you are going to use in the training function. For example,
52 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
53 |             the training function. If you want to load the set inside the training function, this can be disregarded.
54 |         :param hparams: Hyperparameters that should be used during model initialization. Primarily
55 |             used to give an interface for hp optimization.
56 |         :param backend: The backend framework used for training.
Note that `deepspeed` needs syntax 57 | changes to a normal PyTorch script! 58 | :param mixed_precision: Used to control the use of mixed precision training in `torch` 59 | backend mode with model sharding (`zero_lvl` 3). 60 | :param zero_lvl: Sets the ZeRO optimization stages for `torch`. Note: When using `deepspeed` 61 | backend, overwrites `deepspeed_config` zero level! 62 | :param deepspeed_config: A dictionary that represents a valid deepspeed ZeRO optimizer 63 | config. For information on the config, see https://www.deepspeed.ai/docs/config-json/. 64 | :param name: Experiment name. 65 | :param hb_interval: Heartbeat interval with which the server is polling. 66 | :param description: A description of the experiment. 67 | """ 68 | super().__init__(name, description, hb_interval) 69 | mc.initialize() 70 | if not mc.is_spark_available(): 71 | raise NotImplementedError( 72 | "Torch Distributed Training can run only on a Spark kernel." 73 | ) 74 | self.module = module 75 | self.dataset = dataset 76 | if backend not in self.BACKENDS: 77 | raise ValueError( 78 | """Backend {} not supported by Maggy. 79 | Supported types are: {}""".format( 80 | backend, self.BACKENDS 81 | ) 82 | ) 83 | self.backend = backend 84 | self.mixed_precision = mixed_precision 85 | self.hparams = hparams if hparams else {} 86 | self.zero_lvl = zero_lvl 87 | self.ds_config = deepspeed_config 88 | -------------------------------------------------------------------------------- /maggy/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Constants used in Maggy: Allowed datatypes etc. 19 | """ 20 | import numpy as np 21 | 22 | 23 | class USER_FCT: 24 | """User training function specifics.""" 25 | 26 | RETURN_TYPES = (float, int, np.number, dict) 27 | NUMERIC_TYPES = (float, int, np.number) 28 | -------------------------------------------------------------------------------- /maggy/core/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
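# --- Illustrative sketch (not part of the maggy source) ---------------------
# Configuring distributed PyTorch training with the TorchDistributedConfig
# defined above. The module, the dataset paths and the hyperparameter values
# are assumptions for illustration; note that the class itself is passed, not
# an instance.
#
# import torch
# from maggy.config import TorchDistributedConfig
#
# class MyModule(torch.nn.Module):
#     def __init__(self, hidden=128):
#         super().__init__()
#         self.net = torch.nn.Sequential(
#             torch.nn.Linear(784, hidden),
#             torch.nn.ReLU(),
#             torch.nn.Linear(hidden, 10),
#         )
#
#     def forward(self, x):
#         return self.net(x)
#
# config = TorchDistributedConfig(
#     module=MyModule,                                   # the class, not MyModule()
#     dataset=["/path/train.parquet", "/path/test.parquet"],
#     hparams={"hidden": 256},
#     backend="torch",
#     zero_lvl=0,
#     name="torch-dist-example",
# )
# -----------------------------------------------------------------------------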
15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import tensorflow as tf 18 | 19 | SPARK_AVAILABLE = None 20 | try: 21 | from pyspark.sql import SparkSession # noqa: F401 22 | 23 | SPARK_AVAILABLE = True 24 | except ModuleNotFoundError: 25 | SPARK_AVAILABLE = False 26 | 27 | MODE = None 28 | TF_VERSION = None 29 | 30 | 31 | def initialize(): 32 | tf_full = tf.__version__.split(".")[0] 33 | # for building the docs since mock object doesn't mock int() 34 | global TF_VERSION 35 | global MODE 36 | if not isinstance(tf_full, str): 37 | TF_VERSION = 2 38 | else: 39 | TF_VERSION = int(tf_full) 40 | 41 | print("Detected Kernel: Python.") if not SPARK_AVAILABLE else print( 42 | "Detected Kernel: Spark." 43 | ) 44 | 45 | 46 | def is_spark_available(): 47 | return SPARK_AVAILABLE 48 | -------------------------------------------------------------------------------- /maggy/core/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/environment/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | from maggy import util 22 | from maggy.core.rpc import Client 23 | 24 | 25 | class BaseEnv: 26 | """ 27 | Support maggy on a local pyspark installation. 
28 | """ 29 | 30 | def __init__(self): 31 | self.log_dir = os.path.join(os.getcwd(), "experiment_log") 32 | if not os.path.exists(self.log_dir): 33 | os.mkdir(self.log_dir) 34 | 35 | def set_ml_id(self, app_id=0, run_id=0): 36 | os.environ["ML_ID"] = str(app_id) + "_" + str(run_id) 37 | 38 | def create_experiment_dir(self, app_id, run_id): 39 | if not os.path.exists(os.path.join(self.log_dir, app_id)): 40 | os.mkdir(os.path.join(self.log_dir, app_id)) 41 | 42 | experiment_path = self.get_logdir(app_id, run_id) 43 | if os.path.exists(experiment_path): 44 | shutil.rmtree(experiment_path) 45 | 46 | os.mkdir(experiment_path) 47 | 48 | def get_logdir(self, app_id, run_id): 49 | return os.path.join(self.log_dir, str(app_id), str(run_id)) 50 | 51 | def populate_experiment( 52 | self, 53 | model_name, 54 | function, 55 | type, 56 | hp, 57 | description, 58 | app_id, 59 | direction, 60 | optimization_key, 61 | ): 62 | pass 63 | 64 | def attach_experiment_xattr(self, exp_ml_id, experiment_json, command): 65 | pass 66 | 67 | def exists(self, hdfs_path): 68 | return os.path.exists(hdfs_path) 69 | 70 | def mkdir(self, hdfs_path): 71 | return os.mkdir(hdfs_path) 72 | 73 | def isdir(self, dir_path, project=None): 74 | return os.path.isdir(dir_path) 75 | 76 | def ls(self, dir_path): 77 | return os.listdir(dir_path) 78 | 79 | def delete(self, path, recursive=False): 80 | 81 | if self.exists(path): 82 | if os.path.isdir(path): 83 | if recursive: 84 | # remove the directory recursively 85 | shutil.rmtree(path) 86 | elif not os.listdir(path): 87 | os.rmdir(path) 88 | else: 89 | warnings.warn( 90 | "Could not delete the dir {}, not empty.\n" 91 | "Use recursive=True when calling this function".format(path) 92 | ) 93 | elif os.path.isfile(path): 94 | os.remove(path) 95 | else: 96 | warnings.warn( 97 | "Could not delete the file in {}.\n" 98 | "File does not exists.".format(path) 99 | ) 100 | 101 | def dump(self, data, hdfs_path): 102 | head_tail = os.path.split(hdfs_path) 103 | if not os.path.exists(head_tail[0]): 104 | os.makedirs(head_tail[0]) 105 | with self.open_file(hdfs_path, flags="w") as file: 106 | file.write(data) 107 | 108 | def get_ip_address(self): 109 | sc = util.find_spark().sparkContext 110 | return sc._conf.get("spark.driver.host") 111 | 112 | def get_constants(self): 113 | pass 114 | 115 | def open_file(self, hdfs_path, flags="r", buff_size=-1): 116 | return open(hdfs_path, mode=flags, buffering=buff_size) 117 | 118 | def get_training_dataset_path( 119 | self, training_dataset, featurestore=None, training_dataset_version=1 120 | ): 121 | pass 122 | 123 | def get_training_dataset_tf_record_schema( 124 | self, training_dataset, training_dataset_version=1, featurestore=None 125 | ): 126 | pass 127 | 128 | def get_featurestore_metadata(self, featurestore=None, update_cache=False): 129 | pass 130 | 131 | def init_ml_tracking(self, app_id, run_id): 132 | pass 133 | 134 | def log_searchspace(self, app_id, run_id, searchspace): 135 | pass 136 | 137 | def connect_host(self, server_sock, server_host_port, exp_driver): 138 | if not server_host_port: 139 | server_sock.bind(("", 0)) 140 | # hostname may not be resolvable but IP address probably will be 141 | host = self.get_ip_address() 142 | port = server_sock.getsockname()[1] 143 | server_host_port = (host, port) 144 | 145 | else: 146 | server_sock.bind(server_host_port) 147 | 148 | server_sock.listen(10) 149 | 150 | return server_sock, server_host_port 151 | 152 | def _upload_file_output(self, retval, hdfs_exec_logdir): 153 | pass 154 | 155 | def 
project_path(self): 156 | return os.getcwd() 157 | 158 | def get_user(self): 159 | return "" 160 | 161 | def project_name(self): 162 | return "" 163 | 164 | def finalize_experiment( 165 | self, 166 | experiment_json, 167 | metric, 168 | app_id, 169 | run_id, 170 | state, 171 | duration, 172 | logdir, 173 | best_logdir, 174 | optimization_key, 175 | ): 176 | pass 177 | 178 | def str_or_byte(self, str): 179 | return str 180 | 181 | def get_executors(self, sc): 182 | 183 | if sc._conf.get("spark.dynamicAllocation.enabled") == "true": 184 | maxExecutors = int( 185 | sc._conf.get("spark.dynamicAllocation.maxExecutors", defaultValue="-1") 186 | ) 187 | if maxExecutors == -1: 188 | raise KeyError( 189 | 'Failed to find "spark.dynamicAllocation.maxExecutors" property, ' 190 | "but dynamicAllocation is enabled. " 191 | "Define the number of min and max executors when building the spark session." 192 | ) 193 | else: 194 | maxExecutors = int( 195 | sc._conf.get("spark.executor.instances", defaultValue="-1") 196 | ) 197 | if maxExecutors == -1: 198 | raise KeyError( 199 | 'Failed to find "spark.executor.instances" property, ' 200 | 'Define the number of executors using "spark.executor.instances" ' 201 | "when building the spark session." 202 | ) 203 | return maxExecutors 204 | 205 | def build_summary_json(self, logdir): 206 | pass 207 | 208 | def connect_hsfs(self): 209 | pass 210 | 211 | def convert_return_file_to_arr(self, return_file): 212 | pass 213 | 214 | def upload_file_output(self, retval, hdfs_exec_logdir): 215 | pass 216 | 217 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 218 | client_addr = ( 219 | self.get_ip_address(), 220 | sock.getsockname()[1], 221 | ) 222 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 223 | -------------------------------------------------------------------------------- /maggy/core/environment/databricks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | from maggy.core.environment.base import BaseEnv 20 | from maggy.core.rpc import Client 21 | 22 | 23 | class DatabricksEnv(BaseEnv): 24 | """ 25 | This class extends BaseEnv. 26 | Environment implemented for maggy usage on Databricks. 
27 | """ 28 | 29 | def __init__(self): 30 | self.log_dir = "/dbfs/maggy_log/" 31 | if not os.path.exists(self.log_dir): 32 | os.mkdir(self.log_dir) 33 | 34 | def mkdir(self, hdfs_path): 35 | return os.mkdir(hdfs_path) 36 | 37 | def project_path(self, project=None, exclude_nn_addr=False): 38 | return "/dbfs/" 39 | 40 | def get_executors(self, sc): 41 | if ( 42 | sc._conf.get("spark.databricks.clusterUsageTags.clusterScalingType") 43 | == "autoscaling" 44 | ): 45 | maxExecutors = int( 46 | sc._conf.get( 47 | "spark.databricks.clusterUsageTags.clusterMaxWorkers", 48 | defaultValue="-1", 49 | ) 50 | ) 51 | if maxExecutors == -1: 52 | raise KeyError( 53 | 'Failed to find "spark.databricks.clusterUsageTags.clusterMaxWorkers" property, ' 54 | "but clusterScalingType is set to autoscaling." 55 | ) 56 | else: 57 | maxExecutors = int( 58 | sc._conf.get( 59 | "spark.databricks.clusterUsageTags.clusterWorkers", 60 | defaultValue="-1", 61 | ) 62 | ) 63 | if maxExecutors == -1: 64 | raise KeyError( 65 | 'Failed to find "spark.databricks.clusterUsageTags.clusterWorkers" property.' 66 | ) 67 | return maxExecutors 68 | 69 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 70 | server_addr = (server_addr[0], server_addr[1]) 71 | client_addr = ( 72 | server_addr[0], 73 | sock.getsockname()[1], 74 | ) 75 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 76 | 77 | def get_logdir(self, app_id, run_id): 78 | return self.log_dir 79 | -------------------------------------------------------------------------------- /maggy/core/environment/singleton.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | 20 | class EnvSing(object): 21 | 22 | obj = None 23 | 24 | def __new__(cls, *args, **kwargs): 25 | if EnvSing.obj is not None: 26 | raise Exception("A Test Singleton instance already exists") 27 | 28 | # check hopsworks availability 29 | if "REST_ENDPOINT" in os.environ: 30 | print("Detected Environment: Hopsworks.") 31 | 32 | from maggy.core.environment import hopsworks 33 | 34 | EnvSing.obj = hopsworks.HopsworksEnv() 35 | 36 | elif os.environ.get("DATABRICKS_ROOT_CONDA_ENV") == "databricks-ml": 37 | print("Detected Environment: Databricks.") 38 | 39 | from maggy.core.environment import databricks 40 | 41 | EnvSing.obj = databricks.DatabricksEnv() 42 | 43 | else: 44 | print("Detected Environment: base.") 45 | 46 | from maggy.core.environment import base 47 | 48 | EnvSing.obj = base.BaseEnv() 49 | 50 | if EnvSing.obj is None: 51 | raise NotImplementedError( 52 | "environment_instance is None, environment not initialised." 53 | ) 54 | 55 | @staticmethod 56 | def get_instance(): 57 | """ 58 | return an instance of the environment to be used by maggy within a session. 
59 | """ 60 | if EnvSing.obj is None: 61 | EnvSing() 62 | return EnvSing.obj 63 | -------------------------------------------------------------------------------- /maggy/core/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Maggy specific exceptions. 19 | """ 20 | 21 | 22 | class EarlyStopException(Exception): 23 | """Raised by the reporter when a early stop signal is received.""" 24 | 25 | def __init__(self, metric): 26 | super().__init__() 27 | self.metric = metric 28 | 29 | 30 | class NotSupportedError(Exception): 31 | """Raised when we are dealing with a situation that we do not (yet) 32 | support, e.g., a specific dataset type. 33 | """ 34 | 35 | def __init__(self, category, value, suggestion=""): 36 | self.message = "({0}: {1}) is not supported by Maggy.{2}".format( 37 | category, value, suggestion 38 | ) 39 | super().__init__(self.message) 40 | 41 | 42 | class ReturnTypeError(TypeError): 43 | """User defined training function returns value of wrong type.""" 44 | 45 | def __init__(self, optimization_key, return_type): 46 | self.message = ( 47 | "Training function cannot return value of type: {}. " 48 | "Return single numeric value or 'dict' containing optimization key" 49 | " `{}` with numeric value".format( 50 | type(return_type).__name__, optimization_key 51 | ) 52 | ) 53 | super().__init__(self.message) 54 | 55 | 56 | class MetricTypeError(TypeError): 57 | """User defined training function returns metric of wrong type.""" 58 | 59 | def __init__(self, optimization_key, return_type): 60 | self.message = ( 61 | "The optimization metric `{}` returned by the training function is" 62 | " of type: {}. The optimization metric can only be numeric".format( 63 | optimization_key, type(return_type).__name__ 64 | ) 65 | ) 66 | super().__init__(self.message) 67 | 68 | 69 | class BroadcastMetricTypeError(TypeError): 70 | """User defined training function broadcasts metric of wrong type.""" 71 | 72 | def __init__(self, return_type): 73 | self.message = ( 74 | "The optimization metric broadcast by the training function with " 75 | "the reporter is of type: {}. The optimization metric can only " 76 | "be numeric".format(type(return_type).__name__) 77 | ) 78 | super().__init__(self.message) 79 | 80 | 81 | class BroadcastStepTypeError(TypeError): 82 | """User defined training function broadcasts metric with a non-numeric step 83 | type. 84 | """ 85 | 86 | def __init__(self, value, step): 87 | self.message = ( 88 | "The optimization metric `{}` was broadcast by the training " 89 | " function in step {}, which is of type {}. 
The step value can "
90 |             "only be numeric.".format(value, step, type(step).__name__)
91 |         )
92 |         super().__init__(self.message)
93 |
94 |
95 | class BroadcastStepValueError(ValueError):
96 |     """User defined training function broadcasts metric with a
97 |     non-monotonically increasing step attribute.
98 |     """
99 |
100 |     def __init__(self, value, step, prev_step):
101 |         self.message = (
102 |             "The optimization metric `{}` was broadcast by the training "
103 |             " function in step {}, while the previous step was {}. The steps "
104 |             "should be a monotonically increasing attribute.".format(
105 |                 value, step, prev_step
106 |             )
107 |         )
108 |         super().__init__(self.message)
109 |
110 |
111 | class BadArgumentsError(Exception):
112 |     """Raised when a function or method has been called with incompatible arguments.
113 |     This can be used by developers to prevent bad usage of their functions
114 |     or classes by other developers.
115 |     """
116 |
117 |     def __init__(self, callable, suggestion=""):
118 |         self.message = "{0} was called using incompatible arguments. {1}".format(
119 |             callable, suggestion
120 |         )
121 |         super().__init__(self.message)
122 |
--------------------------------------------------------------------------------
/maggy/core/executors/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
--------------------------------------------------------------------------------
/maggy/core/executors/base_executor.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from typing import Callable, Any
18 |
19 |
20 | def base_executor_fn(
21 |     train_fn: Callable,
22 | ) -> Callable:
23 |     """Wraps the user supplied training function in order to be passed to the Spark Executors.
24 |
25 |     :param train_fn: Original training function. It is called without
26 |         arguments on the executors.
27 |
28 |     :returns: Patched function to execute on the Spark executors.
29 |     """
30 |
31 |     def wrapper_function(_: Any) -> Any:
32 |         """Patched function from the base_executor_fn factory.
33 |
34 |         :param _: Necessary catch for the iterator given by Spark to the
35 |             function upon foreach calls. Can safely be disregarded.
36 | """ 37 | 38 | retval = train_fn() 39 | return retval 40 | 41 | return wrapper_function 42 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .optimization_driver import HyperparameterOptDriver 18 | from .ablation_driver import AblationDriver 19 | from .base_driver import BaseDriver 20 | 21 | 22 | __all__ = ["HyperparameterOptDriver", "AblationDriver", "BaseDriver"] 23 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/torch_distributed_training_driver.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from pickle import PicklingError 18 | from typing import Callable, Type, Any 19 | 20 | from maggy import util 21 | from maggy.core.environment.singleton import EnvSing 22 | from maggy.config import TorchDistributedConfig 23 | from maggy.core.rpc import DistributedTrainingServer 24 | from maggy.core.experiment_driver.spark_driver import Driver 25 | from maggy.core.executors.torch_dist_executor import torch_dist_executor_fn 26 | 27 | 28 | class TorchDistributedTrainingDriver(Driver): 29 | """Driver for distributed learning on a Spark cluster. 30 | 31 | Registers the workers on an RPC server, ensures proper configuration and 32 | logging, and accumulates final results. 33 | """ 34 | 35 | def __init__(self, config: TorchDistributedConfig, app_id: int, run_id: int): 36 | """Initializes the server for initial training setup communication and log collection. 37 | 38 | :param config: Experiment config. 39 | :param app_id: Maggy application ID. 40 | :param run_id: Maggy run ID. 41 | """ 42 | super().__init__(config, app_id, run_id) 43 | self.server = DistributedTrainingServer(self.num_executors, config.__class__) 44 | self.results = [] 45 | 46 | def _exp_startup_callback(self) -> None: 47 | """No special startup actions required.""" 48 | 49 | def _exp_final_callback(self, job_end: float, _: Any) -> dict: 50 | """Calculates the average test error from all partitions. 51 | 52 | :param job_end: Time of the job end. 
53 | :param _: Catches additional callback arguments. 54 | 55 | :returns: The result in a dictionary. 56 | """ 57 | result = {"test result": self.average_metric()} 58 | exp_ml_id = str(self.app_id) + "_" + str(self.run_id) 59 | EnvSing.get_instance().attach_experiment_xattr( 60 | exp_ml_id, 61 | {"state": "FINISHED", "duration": int(job_end - self.job_start) * 1000}, 62 | "FULL_UPDATE", 63 | ) 64 | print("Final average test loss: {:.3f}".format(self.average_metric())) 65 | print( 66 | "Finished experiment. Total run time: " 67 | + util.time_diff(self.job_start, job_end) 68 | ) 69 | return result 70 | 71 | def _exp_exception_callback(self, exc: Type[Exception]) -> None: 72 | """Catches pickling errors in case the input arguments (most likely 73 | the dataset) are too large to be pickled, or not compatible. 74 | 75 | :param exc: The exception to handle. 76 | 77 | :raises RuntimeError: Provides the user with additional information 78 | about avoiding pickle problems and includes the pickle error. 79 | """ 80 | if isinstance(exc, PicklingError): 81 | raise RuntimeError( 82 | """Pickling has failed. This is most likely caused by one of 83 | the following reasons: Your module class can't be pickled, or your 84 | dataset is too large. 85 | Consider passing a custom dataloader that reads from files in 86 | case of large datasets, and verify that your module is 87 | pickleable!""" 88 | ) 89 | raise exc 90 | 91 | def _patching_fn( 92 | self, train_fn: Callable, config: TorchDistributedConfig 93 | ) -> Callable: 94 | """Monkey patches the user training function with the distributed 95 | executor modifications for distributed training. 96 | 97 | :param train_fn: User provided training function. 98 | 99 | :returns: The monkey patched training function. 100 | """ 101 | return torch_dist_executor_fn( 102 | train_fn, 103 | config, 104 | self.app_id, 105 | self.run_id, 106 | self.server_addr, 107 | self.hb_interval, 108 | self._secret, 109 | self.log_dir, 110 | ) 111 | 112 | def _register_msg_callbacks(self) -> None: 113 | """Registers a metric message callback for heartbeat responses to spark 114 | magic and a final callback to process experiment results. 115 | """ 116 | self.message_callbacks["METRIC"] = self._log_msg_callback 117 | self.message_callbacks["FINAL"] = self._final_msg_callback 118 | 119 | def _log_msg_callback(self, msg: dict) -> None: 120 | """Callback for heartbeat messages with logs from the executors. 121 | 122 | :param msg: Message from the executors. Contains logs to be written to 123 | jupyter and the DFS. 124 | """ 125 | logs = msg.get("logs", None) 126 | if logs is not None: 127 | with self.log_lock: 128 | self.executor_logs = self.executor_logs + logs 129 | 130 | def _final_msg_callback(self, msg: dict) -> None: 131 | """Appends the test result from the workers to the result list. 132 | 133 | :param msg: Final message from the executors. 134 | """ 135 | self.results.append(msg.get("data", None)) 136 | 137 | def average_metric(self) -> float: 138 | """Calculates the current average over the valid results. 139 | 140 | :returns: The average result value. 
141 | """ 142 | valid_results = [x for x in self.results if x is not None] 143 | if len(valid_results) > 0: 144 | return sum(valid_results) / len(valid_results) 145 | else: 146 | return 0 147 | -------------------------------------------------------------------------------- /maggy/core/patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import torch 18 | 19 | from .dataloader import MaggyDataLoader, MaggyPetastormDataLoader 20 | from .modules import ( 21 | get_maggy_ddp_wrapper, 22 | get_maggy_fairscale_wrapper, 23 | get_maggy_deepspeed_wrapper, 24 | ) 25 | 26 | __all__ = [ 27 | "get_maggy_ddp_wrapper", 28 | "get_maggy_fairscale_wrapper", 29 | "get_maggy_deepspeed_wrapper", 30 | "MaggyDataLoader", 31 | "MaggyPetastormDataLoader", 32 | ] 33 | 34 | # Check torch version, only import ZeroRedundancyOptimizer if >= 1.8 35 | _torch_version = torch.__version__.split(".") 36 | if int(_torch_version[0]) > 1 or int(_torch_version[1]) >= 8: 37 | from .optim import ( 38 | MaggyZeroAdadelta, 39 | MaggyZeroAdagrad, 40 | MaggyZeroAdam, 41 | MaggyZeroAdamW, 42 | MaggyZeroSparseAdam, 43 | MaggyZeroAdamax, 44 | MaggyZeroASGD, 45 | MaggyZeroLBFGS, 46 | MaggyZeroRMSprop, 47 | MaggyZeroRprop, 48 | MaggyZeroSGD, 49 | ) 50 | 51 | __all__ += [ 52 | "MaggyZeroAdadelta", 53 | "MaggyZeroAdagrad", 54 | "MaggyZeroAdam", 55 | "MaggyZeroAdamW", 56 | "MaggyZeroSparseAdam", 57 | "MaggyZeroAdamax", 58 | "MaggyZeroASGD", 59 | "MaggyZeroLBFGS", 60 | "MaggyZeroRMSprop", 61 | "MaggyZeroRprop", 62 | "MaggyZeroSGD", 63 | ] 64 | -------------------------------------------------------------------------------- /maggy/core/patching/dataloader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import os
20 | from typing import Type, Union, Optional, Any, Callable
21 | import collections
22 |
23 | import torch
24 | from torch.utils.data import Dataset, Sampler
25 | from torch.utils.data import DataLoader as TorchDataLoader
26 | from petastorm.reader import make_reader, make_batch_reader
27 | from petastorm.pytorch import DataLoader as PetastormDataLoader
28 | from petastorm.transform import TransformSpec
29 |
30 | from maggy.core.environment.singleton import EnvSing
31 |
32 |
33 | class MaggyDataLoader(TorchDataLoader):
34 |     """Monkey patching class for PyTorch's DataLoader.
35 |
36 |     Patches the DataLoader to include a distributed sampler. Uses environment
37 |     variables for info such as the world size for the DataLoader. These can be
38 |     assumed to be present since Maggy's distributed experiment sets them prior
39 |     to running the training.
40 |     Automatically moves training data to the GPU since distributed training
41 |     requires execution on GPUs.
42 |     """
43 |
44 |     def __init__(
45 |         self,
46 |         dataset: Union[Type[Dataset], str],
47 |         batch_size: int = 1,
48 |         shuffle: Any = False,
49 |         sampler: Optional[Sampler[int]] = None,
50 |         batch_sampler: Optional[Any] = None,
51 |         num_workers: int = 0,
52 |         collate_fn: Optional[Callable] = None,
53 |         pin_memory: bool = False,
54 |         drop_last: bool = False,
55 |         timeout: float = 0,
56 |         worker_init_fn: Optional[Callable] = None,
57 |         **_: Any,
58 |     ):
59 |         """Initializes a torch DataLoader.
60 |
61 |         :param dataset: A PyTorch Dataset.
62 |         :param batch_size: How many samples per batch to load (default: ``1``).
63 |         :param shuffle: Discarded, not compatible with Maggy.
64 |         :param sampler: Discarded, gets replaced by DistributedSampler.
65 |         :param batch_sampler: Discarded, not compatible with Maggy.
66 |         :param num_workers: Discarded, currently crashes Spark if set >0.
67 |         :param collate_fn: Merges a list of samples to a minibatch of tensors.
68 |         :param pin_memory: Automatically transfer tensors to GPU.
69 |         :param drop_last: Drop last incomplete batch.
70 |         :param timeout: Timeout for collecting a batch.
71 |         :param worker_init_fn: Executed on each worker with worker ID.
72 |         :param _: Argument catch to stay compatible with PyTorch.
73 |         """
74 |         sampler = torch.utils.data.distributed.DistributedSampler(dataset=dataset)
75 |         super().__init__(
76 |             dataset,
77 |             batch_size,
78 |             shuffle=False,
79 |             sampler=sampler,
80 |             batch_sampler=None,
81 |             num_workers=0,  # Multiprocessing workers do not work at the moment.
82 |             collate_fn=collate_fn,
83 |             pin_memory=pin_memory,
84 |             drop_last=drop_last,
85 |             timeout=timeout,
86 |             worker_init_fn=worker_init_fn,
87 |         )
88 |         self.iterator = None
89 |
90 |     def __iter__(self) -> MaggyDataLoader:
91 |         # Reload the dataset when new iterator requested.
92 |         self.iterator = TorchDataLoader.__iter__(self)
93 |         return self
94 |
95 |     def __next__(self) -> Union[torch.Tensor, list, dict]:
96 |         data = self.iterator.__next__()
97 |         return _to_cuda(data)
98 |
99 |
100 | class MaggyPetastormDataLoader(PetastormDataLoader):
101 |     """Maggy implementation of a Petastorm parquet DataLoader.
102 |
103 |     Arguments such as world size, reader and rank are automated to make
104 |     PetastormDataLoader as similar to PyTorch's DataLoader as possible.
105 |     """
106 |
107 |     def __init__(
108 |         self, dataset: str, batch_size: int = 1, transform_spec: TransformSpec = None
109 |     ):
110 |         """Initializes a reader depending on the dataset (Petastorm/Parquet).
111 | 112 | :param dataset: Path to the dataset. 113 | :param batch_size: How many samples per batch to load (default: ``1``). 114 | :param transform_spec: Petastorm transform spec for data augmentation. 115 | """ 116 | num_workers = int(os.environ["WORLD_SIZE"]) # Is set at lagom startup. 117 | rank = int(os.environ["RANK"]) 118 | is_peta_ds = EnvSing.get_instance().exists( 119 | dataset.rstrip("/") + "/_common_metadata" 120 | ) 121 | # Make reader only compatible with petastorm dataset. 122 | ds_type = "Petastorm" if is_peta_ds else "Parquet" 123 | print(f"{ds_type} dataset detected in folder {dataset}") 124 | reader_factory = make_reader if is_peta_ds else make_batch_reader 125 | reader = reader_factory( 126 | dataset, 127 | cur_shard=rank, 128 | shard_count=num_workers, 129 | transform_spec=TransformSpec(transform_spec), 130 | ) 131 | super().__init__(reader, batch_size=batch_size) 132 | self.iterator = None 133 | 134 | def __iter__(self) -> MaggyPetastormDataLoader: 135 | # Reload the dataset when new iterator requested. 136 | self.iterator = PetastormDataLoader.__iter__(self) 137 | return self 138 | 139 | def __next__(self) -> Union[torch.Tensor, list, dict]: 140 | data = self.iterator.__next__() 141 | return _to_cuda(data) 142 | 143 | def __len__(self): 144 | raise NotImplementedError("Petastorm dataloader does not support __len__.") 145 | 146 | 147 | def _to_cuda(data: Union[torch.Tensor, list, dict]) -> Union[torch.Tensor, list, dict]: 148 | """Recurses into data, transfers tensors to GPU. 149 | 150 | :param data: The data structure to be transferred. 151 | 152 | :raises TypeError: In case of unsupported data structures. 153 | 154 | :returns: The transfered data structure. 155 | """ 156 | if isinstance(data, collections.abc.Mapping): 157 | return {key: _to_cuda(val) for key, val in data.items()} 158 | if isinstance(data, (list, tuple)): 159 | data_list = [_to_cuda(el) for el in data] 160 | return data_list if isinstance(data, list) else tuple(data_list) 161 | if isinstance(data, torch.Tensor): 162 | return data.cuda() 163 | raise TypeError(f"Type {type(data)} currently not supported!") 164 | -------------------------------------------------------------------------------- /maggy/core/patching/modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
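# --- Illustrative sketch (not part of the maggy source) ---------------------
# Using the patched data loaders above inside a distributed training function.
# The dataset object and the parquet path are assumptions for illustration;
# WORLD_SIZE/RANK are set by maggy before the function runs on the executors,
# and every batch is returned already on the GPU.
#
# from maggy.core.patching import MaggyDataLoader, MaggyPetastormDataLoader
#
# train_loader = MaggyDataLoader(train_dataset, batch_size=64)          # map-style dataset
# peta_loader = MaggyPetastormDataLoader("/path/to/train_set", batch_size=64)
#
# for batch in train_loader:
#     ...  # forward/backward pass; tensors in `batch` are already on the GPU
# -----------------------------------------------------------------------------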
15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from types import SimpleNamespace 20 | from typing import Type, Any 21 | 22 | from torch.nn import Module as TorchModule 23 | from torch.nn.parallel import DistributedDataParallel as TorchDistributedDataParallel 24 | 25 | try: 26 | from deepspeed.pipe import PipelineModule 27 | from deepspeed.runtime.engine import DeepSpeedEngine 28 | from fairscale.nn import ( 29 | FullyShardedDataParallel as FairscaleFullyShardedDataParallel, 30 | ) 31 | except ImportError: 32 | print( 33 | """Warning: deepspeed and/or fairscale import failed. DeepSpeed backend and zero_lvl 3 34 | won't be available""" 35 | ) 36 | 37 | 38 | def get_maggy_ddp_wrapper(module: Type[TorchModule]): 39 | """Factory function for MaggyDDPModuleWrapper. 40 | 41 | :param module: PyTorch module passed by the user. 42 | """ 43 | 44 | class MaggyDDPModuleWrapper(TorchDistributedDataParallel): 45 | """Wrapper around PyTorch's DDP Module. 46 | 47 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 48 | we cannot add the module as an additional parameter during initialization. Instead, it is 49 | configured by its factory function. 50 | """ 51 | 52 | __module = module # Avoid overwriting torch module 53 | 54 | def __init__(self, *args: Any, **kwargs: Any): 55 | """Initializes the previously set module, moves it to the GPU and initializes a DDP 56 | module with it. 57 | 58 | :param args: Arguments passed by the user for module initialization. 59 | :param kwargs: Keyword arguments passed by the user for module initialization. 60 | """ 61 | # Avoid self because bound method adds to args which makes the function call fail 62 | model = MaggyDDPModuleWrapper.__module(*args, **kwargs).cuda() 63 | super().__init__(model) 64 | 65 | return MaggyDDPModuleWrapper 66 | 67 | 68 | def get_maggy_fairscale_wrapper(module: TorchModule, mixed_precision: bool): 69 | """Factory function for MaggyFairScaleModuleWrapper. 70 | 71 | :param module: PyTorch module passed by the user. 72 | :param mixed_precision: Switches on mixed precision for the FairscaleModule. 73 | """ 74 | 75 | class MaggyFairScaleModuleWrapper(FairscaleFullyShardedDataParallel): 76 | """Wrapper around Fairscale's FullyShardedDataParallel Module. 77 | 78 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 79 | we cannot add the module as an additional parameter during initialization. Instead, it is 80 | configured by its factory function. 81 | """ 82 | 83 | __module = module 84 | __mixed_precision = mixed_precision 85 | 86 | def __init__(self, *args: Any, **kwargs: Any): 87 | """Initializes the previously set module, moves it to the GPU and initializes a 88 | Fairscale FullyShardedDataParallel module with it. 89 | 90 | :param args: Arguments passed by the user for module initialization. 91 | :param kwargs: Keyword arguments passed by the user for module initialization. 92 | """ 93 | # Avoid self because bound method adds to args which makes the function call fail 94 | model = MaggyFairScaleModuleWrapper.__module(*args, **kwargs).cuda() 95 | super().__init__(model, mixed_precision=self.__mixed_precision) 96 | 97 | return MaggyFairScaleModuleWrapper 98 | 99 | 100 | def get_maggy_deepspeed_wrapper(module: TorchModule, config_params: dict): 101 | """Factory function for MaggyDeepSpeedModuleWrapper. 102 | 103 | :param module: PyTorch module passed by the user. 104 | :param mixed_precision: DeepSpeed config dict passed by the user. 
105 | """ 106 | assert ( 107 | module != PipelineModule 108 | ), """Maggy currently doesn't support pipeline 109 | modules with DeepSpeed ZeRO.""" 110 | 111 | class MaggyDeepSpeedModuleWrapper(DeepSpeedEngine): 112 | """Wrapper around DeepSpeed's DeepSpeedEngine. 113 | 114 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 115 | we cannot add the module as an additional parameter during initialization. Instead, it is 116 | configured by its factory function. 117 | """ 118 | 119 | __module = module 120 | __config_params = config_params 121 | 122 | def __init__(self, *args, **kwargs): 123 | """Initializes the previously set module and initializes a DeepSpeedEngine with it. 124 | 125 | :param args: Arguments passed by the user for module initialization. 126 | :param kwargs: Keyword arguments passed by the user for module initialization. 127 | """ 128 | # Avoid self because bound method adds to args which makes the function call fail. 129 | # No .cuda() calls for DeepSpeed necessary. 130 | model = MaggyDeepSpeedModuleWrapper.__module(*args, **kwargs) 131 | ds_args = SimpleNamespace(local_rank=0) 132 | super().__init__( 133 | ds_args, 134 | model, 135 | model_parameters=model.parameters(), 136 | config_params=self.__config_params, 137 | ) 138 | 139 | return MaggyDeepSpeedModuleWrapper 140 | -------------------------------------------------------------------------------- /maggy/core/patching/optim.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | import inspect 20 | from typing import Any 21 | from abc import ABC, abstractclassmethod 22 | 23 | 24 | import torch.optim as optim 25 | from torch.distributed.optim import ZeroRedundancyOptimizer 26 | 27 | 28 | class MaggyZeroOptimizer(ZeroRedundancyOptimizer, ABC): 29 | """Abstract base class for Maggy's optimizer patching classes.""" 30 | 31 | def __init__(self, *args: Any, **kwargs: Any): 32 | """Initializes a ZeroRedundancyOptimizer with the defined optim_cls as optimizer class. 33 | 34 | Passes any arguments for initialization of the default optimizer to the Zero optimizer. 35 | :param args: Optimizer args. Get reassigned into kwargs. 36 | :param kwargs: Optimizer kwargs. 
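
        Illustrative sketch of how a concrete subclass is used (``model`` is a
        placeholder ``torch.nn.Module``; a default process group must already
        be initialized, and Maggy normally applies this patching automatically):

        >>> optimizer = MaggyZeroAdam(model.parameters(), lr=1e-3)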
37 | """ 38 | # Move args to kwargs to pass args into kwargs only ZeroRedundancyOptimizer 39 | arg_spec = inspect.getfullargspec(self.optim_cls.__init__) 40 | for idx, arg in enumerate(args): 41 | kwargs[arg_spec.args[idx + 1]] = arg # +1 to skip self in arg_spec 42 | params = kwargs.pop("params", None) 43 | super().__init__( 44 | params, self.optim_cls, group=None, bucket_cap_kb=2**24, **kwargs 45 | ) 46 | 47 | @property 48 | @abstractclassmethod 49 | def optim_cls(cls: optim.Optimizer) -> MaggyZeroOptimizer: 50 | """Optimizer class property needs to be defined by each implementation of the base class.""" 51 | raise NotImplementedError 52 | 53 | 54 | class MaggyZeroAdadelta(MaggyZeroOptimizer): 55 | """Maggy's Zero wrapper around torch's Adadelta optimizer.""" 56 | 57 | optim_cls = optim.Adadelta 58 | 59 | 60 | class MaggyZeroAdagrad(MaggyZeroOptimizer): 61 | """Maggy's Zero wrapper around torch's Adagrad optimizer.""" 62 | 63 | optim_cls = optim.Adagrad 64 | 65 | 66 | class MaggyZeroAdam(MaggyZeroOptimizer): 67 | """Maggy's Zero wrapper around torch's Adam optimizer.""" 68 | 69 | optim_cls = optim.Adam 70 | 71 | 72 | class MaggyZeroAdamW(MaggyZeroOptimizer): 73 | """Maggy's Zero wrapper around torch's AdamW optimizer.""" 74 | 75 | optim_cls = optim.AdamW 76 | 77 | 78 | class MaggyZeroSparseAdam(MaggyZeroOptimizer): 79 | """Maggy's Zero wrapper around torch's SparseAdam optimizer.""" 80 | 81 | optim_cls = optim.SparseAdam 82 | 83 | 84 | class MaggyZeroAdamax(MaggyZeroOptimizer): 85 | """Maggy's Zero wrapper around torch's Adamax optimizer.""" 86 | 87 | optim_cls = optim.Adamax 88 | 89 | 90 | class MaggyZeroASGD(MaggyZeroOptimizer): 91 | """Maggy's Zero wrapper around torch's ASGD optimizer.""" 92 | 93 | optim_cls = optim.ASGD 94 | 95 | 96 | class MaggyZeroLBFGS(MaggyZeroOptimizer): 97 | """Maggy's Zero wrapper around torch's LBFGS optimizer.""" 98 | 99 | optim_cls = optim.LBFGS 100 | 101 | 102 | class MaggyZeroRMSprop(MaggyZeroOptimizer): 103 | """Maggy's Zero wrapper around torch's RMSprop optimizer.""" 104 | 105 | optim_cls = optim.RMSprop 106 | 107 | 108 | class MaggyZeroRprop(MaggyZeroOptimizer): 109 | """Maggy's Zero wrapper around torch's Rprop optimizer.""" 110 | 111 | optim_cls = optim.Rprop 112 | 113 | 114 | class MaggyZeroSGD(MaggyZeroOptimizer): 115 | """Maggy's Zero wrapper around torch's SGD optimizer.""" 116 | 117 | optim_cls = optim.SGD 118 | -------------------------------------------------------------------------------- /maggy/core/reporter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | API Module for the user to include in his training code. 
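
A minimal sketch of how the reporter is typically used inside a Maggy training
function (the ``train`` signature below is illustrative, not a fixed API):

>>> def train(reporter, **hparams):
...     for epoch in range(10):
...         acc = ...  # compute the validation metric for this epoch
...         reporter.broadcast(metric=acc, step=epoch)
...         reporter.log("epoch {} done".format(epoch))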
19 | 20 | """ 21 | import threading 22 | from datetime import datetime 23 | 24 | from maggy import constants 25 | from maggy.core import exceptions 26 | 27 | from maggy.core.environment.singleton import EnvSing 28 | 29 | 30 | class Reporter(object): 31 | """ 32 | Thread-safe store for sending a metric and logs from executor to driver 33 | """ 34 | 35 | def __init__(self, log_file, partition_id, task_attempt, print_executor): 36 | self.metric = None 37 | self.step = -1 38 | self.lock = threading.RLock() 39 | self.stop = False 40 | self.trial_id = None 41 | self.trial_log_file = None 42 | self.logs = "" 43 | self.log_file = log_file 44 | self.partition_id = partition_id 45 | self.task_attempt = task_attempt 46 | self.print_executor = print_executor 47 | 48 | # Open executor log file descriptor 49 | # This log is for all maggy system related log messages 50 | env = EnvSing.get_instance() 51 | if not env.exists(log_file): 52 | env.dump("", log_file) 53 | self.fd = env.open_file(log_file, flags="w") 54 | self.trial_fd = None 55 | 56 | def init_logger(self, trial_log_file): 57 | """Initializes the trial log file""" 58 | self.trial_log_file = trial_log_file 59 | env = EnvSing.get_instance() 60 | # Open trial log file descriptor 61 | if not env.exists(self.trial_log_file): 62 | env.dump("", self.trial_log_file) 63 | self.trial_fd = env.open_file(self.trial_log_file, flags="w") 64 | 65 | def close_logger(self): 66 | """Savely closes the file descriptors of the log files. 67 | 68 | close() can be called multiple times and flushes the buffer contents 69 | before closing 70 | """ 71 | with self.lock: 72 | if self.trial_fd: 73 | self.trial_fd.close() 74 | self.fd.close() 75 | 76 | # report 77 | def broadcast(self, metric, step=None): 78 | """Broadcast a metric to the experiment driver with the heartbeat. 79 | 80 | :param metric: Metric to be broadcasted 81 | :type metric: int, float 82 | :param step: The iteration step which produced the metric, e.g. batch or 83 | epoch number, or any other monotonically increasing progress attribute 84 | :type step: int 85 | :raises exception: EarlyStopException if told by the experiment driver 86 | """ 87 | with self.lock: 88 | # if stop == True -> raise exception to break training function 89 | if step is None: 90 | step = self.step + 1 91 | if not isinstance(metric, constants.USER_FCT.NUMERIC_TYPES): 92 | raise exceptions.BroadcastMetricTypeError(metric) 93 | elif not isinstance(step, constants.USER_FCT.NUMERIC_TYPES): 94 | raise exceptions.BroadcastStepTypeError(metric, step) 95 | elif step < self.step: 96 | raise exceptions.BroadcastStepValueError(metric, step, self.step) 97 | else: 98 | self.step = step 99 | self.metric = metric 100 | if self.stop: 101 | raise exceptions.EarlyStopException(metric) 102 | 103 | def log(self, log_msg, jupyter=False): 104 | """Logs a message to the executor logfile and executor stderr and 105 | optionally prints the message in jupyter. 106 | 107 | :param log_msg: Message to log. 
108 | :type log_msg: str 109 | :param verbose: Print in Jupyter Notebook, defaults to True 110 | :type verbose: bool, optional 111 | """ 112 | with self.lock: 113 | env = EnvSing.get_instance() 114 | try: 115 | msg = (datetime.now().isoformat() + " ({0}/{1}): {2} \n").format( 116 | self.partition_id, self.task_attempt, log_msg 117 | ) 118 | if jupyter: 119 | jupyter_log = str(self.partition_id) + ": " + log_msg 120 | if self.trial_fd: 121 | self.trial_fd.write(env.str_or_byte(msg)) 122 | self.logs = self.logs + jupyter_log + "\n" 123 | else: 124 | self.fd.write(env.str_or_byte(msg)) 125 | if self.trial_fd: 126 | self.trial_fd.write(env.str_or_byte(msg)) 127 | self.print_executor(msg) 128 | # Throws ValueError when operating on closed HDFS file object 129 | # Throws AttributeError when calling file ops on NoneType object 130 | except (IOError, ValueError, AttributeError) as e: 131 | self.fd.write( 132 | env.str_or_byte( 133 | "An error occurred while writing logs: {}".format(e) 134 | ) 135 | ) 136 | 137 | def get_data(self): 138 | """Returns the metric and logs to be sent to the experiment driver.""" 139 | with self.lock: 140 | log_to_send = self.logs 141 | self.logs = "" 142 | return self.metric, self.step, log_to_send 143 | 144 | def reset(self): 145 | """ 146 | Resets the reporter to the initial state in order to start a new 147 | trial. 148 | """ 149 | with self.lock: 150 | self.metric = None 151 | self.step = -1 152 | self.stop = False 153 | self.trial_id = None 154 | self.fd.flush() 155 | self.trial_fd.close() 156 | self.trial_fd = None 157 | self.trial_log_file = None 158 | 159 | def early_stop(self): 160 | with self.lock: 161 | if self.metric is not None: 162 | self.stop = True 163 | 164 | def get_trial_id(self): 165 | with self.lock: 166 | return self.trial_id 167 | 168 | def set_trial_id(self, trial_id): 169 | with self.lock: 170 | self.trial_id = trial_id 171 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/tf_modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | 
18 | def get_wrapped_model(model, strategy, is_chief):
19 |     """Build a wrapper class for the user-defined TensorFlow model.
20 | 
21 |     :param model: The class of the user-defined TensorFlow model.
22 |     :param strategy: The distribution strategy to be used for the training.
23 | 
24 |     :returns: The TensorflowModelWrapper class.
25 |     """
26 | 
27 |     class TensorflowModelWrapper(model):
28 |         """A wrapper around the user's TensorFlow model; __init__() is overridden in order to
29 |         build the model within the distribution strategy's scope for distributed training.
30 |         """
31 | 
32 |         def __init__(self, *args, **kwargs):
33 |             self.__strategy = strategy
34 |             self.is_chief = is_chief
35 |             with self.__strategy.scope():
36 |                 try:
37 |                     super().__init__(*args, **kwargs)
38 |                 except TypeError as e:
39 |                     raise TypeError(
40 |                         "The parameters passed to TensorflowConfig (model_parameters) "
41 |                         "do not correspond to the parameters defined in your model "
42 |                         "constructor."
43 |                     ) from e
44 | 
45 |     return TensorflowModelWrapper
46 | 
--------------------------------------------------------------------------------
/maggy/earlystop/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from maggy.earlystop import abstractearlystop, medianrule, nostop
18 | 
19 | AbstractEarlyStop = abstractearlystop.AbstractEarlyStop
20 | MedianStoppingRule = medianrule.MedianStoppingRule
21 | NoStoppingRule = nostop.NoStoppingRule
22 | 
23 | __all__ = ["AbstractEarlyStop", "MedianStoppingRule", "NoStoppingRule"]
24 | 
--------------------------------------------------------------------------------
/maggy/earlystop/abstractearlystop.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from abc import ABC, abstractmethod
18 | 
19 | 
20 | class AbstractEarlyStop(ABC):
21 |     """An abstract class to implement custom early stopping criteria."""
22 | 
23 |     @staticmethod
24 |     @abstractmethod
25 |     def earlystop_check(to_check, finalized_trials, direction):
26 |         """An abstract static method that needs to be implemented with a custom
27 |         early stopping criterion.
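
        A minimal sketch of a custom rule (illustrative only; like
        ``NoStoppingRule``, it never stops a trial):

        >>> class MyRule(AbstractEarlyStop):
        ...     @staticmethod
        ...     def earlystop_check(to_check, finalized_trials, direction):
        ...         return None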
28 | 29 | The function is called internally in the user specified interval 30 | with three arguments. It is necessary to add these to the function 31 | definition. 32 | 33 | :param to_check: A dictionary of currently running 34 | trials, where the key is the `trial_id` and values are Trial objects. 35 | :type to_check: dictionary 36 | :param finalized_trials: A list of finalized Trial objects. 37 | :type finalized_trials: list 38 | :param direction: A string describing the search objective, i.e. 'min' 39 | or 'max'. 40 | :type direction: str 41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /maggy/earlystop/medianrule.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import statistics 18 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 19 | 20 | 21 | class MedianStoppingRule(AbstractEarlyStop): 22 | """The Median Stopping Rule implements the simple strategy of stopping a 23 | trial if its performance falls below the median of other trials at similar 24 | points in time. 25 | """ 26 | 27 | @staticmethod 28 | def earlystop_check(to_check, finalized_trials, direction): 29 | 30 | results = [] 31 | median = None 32 | 33 | # count step from zero so it can be used as index for array 34 | step = len(to_check.metric_history) 35 | 36 | if step > 0: 37 | 38 | for fin_trial in finalized_trials: 39 | 40 | if len(fin_trial.metric_history) >= step: 41 | avg = sum(fin_trial.metric_history[:step]) / float(step) 42 | results.append(avg) 43 | 44 | try: 45 | median = statistics.median(results) 46 | except statistics.StatisticsError as e: 47 | raise Exception( 48 | "Warning: StatisticsError when calling early stop method\n{}".format( 49 | e 50 | ) 51 | ) 52 | 53 | if median is not None: 54 | if direction == "max": 55 | if max(to_check.metric_history) < median: 56 | return to_check.trial_id 57 | elif direction == "min": 58 | if min(to_check.metric_history) > median: 59 | return to_check.trial_id 60 | return None 61 | -------------------------------------------------------------------------------- /maggy/earlystop/nostop.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 18 | 19 | 20 | class NoStoppingRule(AbstractEarlyStop): 21 | """The no stopping rule never stops any trials early.""" 22 | 23 | @staticmethod 24 | def earlystop_check(to_check, finalized_trials, direction): 25 | return None 26 | -------------------------------------------------------------------------------- /maggy/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/experiment/experiment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from typing import Callable 18 | from maggy.config import LagomConfig, BaseConfig 19 | 20 | 21 | def lagom(train_fn: Callable, config: LagomConfig = None) -> dict: 22 | 23 | """Entry point for Maggy experiment, this function passes the parameters to the lagom function 24 | depending whether the kernel is pyspark or python. 25 | **lagom** is a Swedish word meaning "just the right amount". 26 | 27 | :param train_fn: User defined experiment containing the model training. 28 | :param config: An experiment configuration. For more information, see config. 29 | 30 | :returns: The experiment results as a dict. 31 | """ 32 | from maggy.experiment import experiment_python 33 | from maggy.experiment import experiment_pyspark 34 | from maggy.core import config as maggyconfig 35 | 36 | if config is None: 37 | config = BaseConfig( 38 | name="maggy_experiment", 39 | description="experiment without config object", 40 | hb_interval=1, 41 | ) 42 | if maggyconfig.is_spark_available(): 43 | return experiment_pyspark.lagom(train_fn, config) 44 | else: 45 | return experiment_python.lagom(train_fn, config) 46 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_pyspark.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import time 28 | from functools import singledispatch 29 | from typing import Callable 30 | 31 | from maggy import util 32 | from maggy.core.environment.singleton import EnvSing 33 | from maggy.config import * 34 | from maggy.core.experiment_driver import HyperparameterOptDriver, AblationDriver 35 | 36 | 37 | APP_ID = None 38 | RUNNING = False 39 | RUN_ID = 1 40 | EXPERIMENT_JSON = {} 41 | 42 | 43 | def lagom(train_fn: Callable, config: LagomConfig) -> dict: 44 | """Launches a maggy experiment, which depending on 'config' can either 45 | be a hyperparameter optimization, an ablation study experiment or distributed 46 | training. Given a search space, objective and a model training procedure `train_fn` 47 | (black-box function), an experiment is the whole process of finding the 48 | best hyperparameter combination in the search space, optimizing the 49 | black-box function. Currently maggy supports random search and a median 50 | stopping rule. 51 | **lagom** is a Swedish word meaning "just the right amount". 52 | 53 | :param train_fn: User defined experiment containing the model training. 54 | :param config: An experiment configuration. For more information, see config. 55 | 56 | :returns: The experiment results as a dict. 57 | """ 58 | global APP_ID 59 | global RUNNING 60 | global RUN_ID 61 | job_start = time.time() 62 | try: 63 | if RUNNING: 64 | raise RuntimeError("An experiment is currently running.") 65 | RUNNING = True 66 | spark_context = util.find_spark().sparkContext 67 | APP_ID = str(spark_context.applicationId) 68 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 69 | EnvSing.get_instance().set_app_id(APP_ID) 70 | driver = lagom_driver(config, APP_ID, RUN_ID) 71 | return driver.run_experiment(train_fn, config) 72 | except: # noqa: E722 73 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 74 | raise 75 | finally: 76 | # Clean up spark jobs 77 | RUN_ID += 1 78 | RUNNING = False 79 | util.find_spark().sparkContext.setJobGroup("", "") 80 | 81 | 82 | @singledispatch 83 | def lagom_driver(config, app_id: int, run_id: int) -> None: 84 | """Dispatcher function for the experiment driver. 85 | 86 | Initializes the appropriate driver according to the config. 87 | 88 | :raises TypeError: Only gets called if no fitting config was found and 89 | raises an error. 90 | """ 91 | raise TypeError( 92 | "Invalid config type! 
LagomConfig is expected to be of type {}, {}, {} or {}, \ 93 | but is of type {}".format( 94 | HyperparameterOptConfig, 95 | AblationConfig, 96 | TorchDistributedConfig, 97 | TfDistributedConfig, 98 | type(config), 99 | ) 100 | ) 101 | 102 | 103 | @lagom_driver.register(HyperparameterOptConfig) 104 | def _( 105 | config: HyperparameterOptConfig, app_id: int, run_id: int 106 | ) -> HyperparameterOptDriver: 107 | return HyperparameterOptDriver(config, app_id, run_id) 108 | 109 | 110 | @lagom_driver.register(AblationConfig) 111 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 112 | return AblationDriver(config, app_id, run_id) 113 | 114 | 115 | @lagom_driver.register(TorchDistributedConfig) 116 | # Lazy import of TorchDistributedTrainingDriver to avoid Torch import until necessary 117 | def _( 118 | config: TorchDistributedConfig, app_id: int, run_id: int 119 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 120 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 121 | TorchDistributedTrainingDriver, 122 | ) 123 | 124 | return TorchDistributedTrainingDriver(config, app_id, run_id) 125 | 126 | 127 | @lagom_driver.register(TfDistributedConfig) 128 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 129 | def _( 130 | config: TfDistributedConfig, app_id: int, run_id: int 131 | ) -> "TfDistributedTrainingDriver": # noqa: F821 132 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 133 | TfDistributedTrainingDriver, 134 | ) 135 | 136 | return TfDistributedTrainingDriver(config, app_id, run_id) 137 | 138 | 139 | @lagom_driver.register(LagomConfig) 140 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 141 | def _(config: LagomConfig, app_id: int, run_id: int) -> "BaseDriver": # noqa: F821 142 | from maggy.core.experiment_driver.base_driver import ( 143 | BaseDriver, 144 | ) 145 | 146 | return BaseDriver(config, app_id, run_id) 147 | 148 | 149 | def _exception_handler(duration: int) -> None: 150 | """Handles exceptions during execution of an experiment. 151 | 152 | :param duration: Duration of the experiment until exception in milliseconds 153 | """ 154 | try: 155 | global RUNNING 156 | global EXPERIMENT_JSON 157 | if RUNNING: 158 | EXPERIMENT_JSON["state"] = "FAILED" 159 | EXPERIMENT_JSON["duration"] = duration 160 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 161 | EnvSing.get_instance().attach_experiment_xattr( 162 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 163 | ) 164 | except Exception as err: 165 | util.log(err) 166 | 167 | 168 | def _exit_handler() -> None: 169 | """Handles jobs killed by the user.""" 170 | try: 171 | global RUNNING 172 | global EXPERIMENT_JSON 173 | if RUNNING: 174 | EXPERIMENT_JSON["status"] = "KILLED" 175 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 176 | EnvSing.get_instance().attach_experiment_xattr( 177 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 178 | ) 179 | except Exception as err: 180 | util.log(err) 181 | 182 | 183 | atexit.register(_exit_handler) 184 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_python.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import calendar 28 | import time 29 | from functools import singledispatch 30 | from typing import Callable 31 | 32 | from maggy import util 33 | from maggy.core.environment.singleton import EnvSing 34 | from maggy.config import * 35 | from maggy.core.experiment_driver import ( 36 | HyperparameterOptDriver, 37 | AblationDriver, 38 | BaseDriver, 39 | ) 40 | 41 | 42 | APP_ID = None 43 | RUNNING = False 44 | RUN_ID = 1 45 | EXPERIMENT_JSON = {} 46 | 47 | 48 | def lagom(train_fn: Callable, config) -> dict: 49 | """Launches a maggy experiment, which depending on 'config' can either 50 | be a hyperparameter optimization, an ablation study experiment or distributed 51 | training. Given a search space, objective and a model training procedure `train_fn` 52 | (black-box function), an experiment is the whole process of finding the 53 | best hyperparameter combination in the search space, optimizing the 54 | black-box function. Currently maggy supports random search and a median 55 | stopping rule. 56 | **lagom** is a Swedish word meaning "just the right amount". 57 | 58 | :param train_fn: User defined experiment containing the model training. 59 | :param config: An experiment configuration. For more information, see config. 60 | 61 | :returns: The experiment results as a dict. 62 | """ 63 | global APP_ID 64 | global RUNNING 65 | global RUN_ID 66 | job_start = time.time() 67 | try: 68 | if RUNNING: 69 | raise RuntimeError("An experiment is currently running.") 70 | RUNNING = True 71 | APP_ID = str(calendar.timegm(time.gmtime())) 72 | APP_ID = "application_" + APP_ID + "_0001" 73 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 74 | driver = lagom_driver(config, APP_ID, RUN_ID) 75 | return driver.run_experiment(train_fn, config) 76 | except: # noqa: E722 77 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 78 | raise 79 | finally: 80 | # Clean up spark jobs 81 | RUN_ID += 1 82 | RUNNING = False 83 | 84 | 85 | @singledispatch 86 | def lagom_driver(config, app_id: int, run_id: int) -> None: 87 | """Dispatcher function for the experiment driver. 88 | 89 | Initializes the appropriate driver according to the config. 90 | 91 | :raises TypeError: Only gets called if no fitting config was found and 92 | raises an error. 93 | """ 94 | raise TypeError( 95 | "Invalid config type! 
Config is expected to be of type {}, {}, {}, {} or {}, \ 96 | but is of type {}".format( 97 | HyperparameterOptConfig, 98 | AblationConfig, 99 | TorchDistributedConfig, 100 | TfDistributedConfig, 101 | BaseConfig, 102 | type(config), 103 | ) 104 | ) 105 | 106 | 107 | @lagom_driver.register(HyperparameterOptConfig) 108 | def _( 109 | config: HyperparameterOptConfig, app_id: int, run_id: int 110 | ) -> HyperparameterOptDriver: 111 | return HyperparameterOptDriver(config, app_id, run_id) 112 | 113 | 114 | @lagom_driver.register(AblationConfig) 115 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 116 | return AblationDriver(config, app_id, run_id) 117 | 118 | 119 | @lagom_driver.register(TorchDistributedConfig) 120 | # Lazy import of DistributedDriver to avoid Torch import until necessary 121 | def _( 122 | config: TorchDistributedConfig, app_id: int, run_id: int 123 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 124 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 125 | TorchDistributedTrainingDriver, 126 | ) 127 | 128 | return TorchDistributedTrainingDriver(config, app_id, run_id) 129 | 130 | 131 | @lagom_driver.register(TfDistributedConfig) 132 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 133 | def _( 134 | config: TfDistributedConfig, app_id: int, run_id: int 135 | ) -> "TfDistributedTrainingDriver": # noqa: F821 136 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 137 | TfDistributedTrainingDriver, 138 | ) 139 | 140 | return TfDistributedTrainingDriver(config, app_id, run_id) 141 | 142 | 143 | @lagom_driver.register(BaseConfig) 144 | # Lazy import of BaseConfig 145 | def _(config: BaseConfig, app_id: int, run_id: int) -> BaseDriver: 146 | from maggy.core.experiment_driver.base_driver import ( 147 | BaseDriver, 148 | ) 149 | 150 | return BaseDriver(config, app_id, run_id) 151 | 152 | 153 | @lagom_driver.register(LagomConfig) 154 | # Lazy import of LagomConfig 155 | def _(config: LagomConfig, app_id: int, run_id: int) -> BaseDriver: 156 | from maggy.core.experiment_driver.base_driver import ( 157 | BaseDriver, 158 | ) 159 | 160 | return BaseDriver(config, app_id, run_id) 161 | 162 | 163 | def _exception_handler(duration: int) -> None: 164 | """Handles exceptions during execution of an experiment. 
165 | 166 | :param duration: Duration of the experiment until exception in milliseconds 167 | """ 168 | try: 169 | global RUNNING 170 | global EXPERIMENT_JSON 171 | if RUNNING: 172 | EXPERIMENT_JSON["state"] = "FAILED" 173 | EXPERIMENT_JSON["duration"] = duration 174 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 175 | EnvSing.get_instance().attach_experiment_xattr( 176 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 177 | ) 178 | except Exception as err: 179 | util.log(err) 180 | 181 | 182 | def _exit_handler() -> None: 183 | """Handles jobs killed by the user.""" 184 | try: 185 | global RUNNING 186 | global EXPERIMENT_JSON 187 | if RUNNING: 188 | EXPERIMENT_JSON["status"] = "KILLED" 189 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 190 | EnvSing.get_instance().attach_experiment_xattr( 191 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 192 | ) 193 | except Exception as err: 194 | util.log(err) 195 | 196 | 197 | atexit.register(_exit_handler) 198 | -------------------------------------------------------------------------------- /maggy/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer import abstractoptimizer, randomsearch, asha, singlerun, gridsearch 18 | 19 | AbstractOptimizer = abstractoptimizer.AbstractOptimizer 20 | RandomSearch = randomsearch.RandomSearch 21 | Asha = asha.Asha 22 | SingleRun = singlerun.SingleRun 23 | GridSearch = gridsearch.GridSearch 24 | 25 | __all__ = ["AbstractOptimizer", "RandomSearch", "Asha", "SingleRun", "GridSearch"] 26 | -------------------------------------------------------------------------------- /maggy/optimizer/asha.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import math 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.trial import Trial 21 | 22 | 23 | class Asha(AbstractOptimizer): 24 | """Implements the Asynchronous Successiv Halving Algorithm - ASHA 25 | (https://arxiv.org/abs/1810.05934). ASHA needs three additional parameters: 26 | 'reduction_factor', 'resource_min' and 'resource_max'. To set custom values 27 | for these, initialize the optimizer first and pass it as an argument to 28 | 'experiment.lagom()'. 
29 | 30 | Sample usage: 31 | 32 | >>> # Import Asha optimizer 33 | >>> from maggy.optimizer import Asha 34 | >>> # Instantiate the optimizer with custom arguments 35 | >>> asha = Asha(3, 1, 9) 36 | >>> experiment.lagom(..., optimizer=asha, ...) 37 | """ 38 | 39 | def __init__(self, reduction_factor=2, resource_min=1, resource_max=4): 40 | super().__init__() 41 | 42 | if reduction_factor < 2 or not isinstance(reduction_factor, int): 43 | raise Exception( 44 | "Can't initialize ASHA optimizer. 'reduction_factor'" 45 | + "has to be an integer equal to or larger than 2: {}".format( 46 | reduction_factor 47 | ) 48 | ) 49 | else: 50 | self.reduction_factor = reduction_factor 51 | 52 | if not isinstance(resource_min, int): 53 | raise Exception( 54 | "Can't initialize ASHA optimizer. 'resource_min'" 55 | + "not of type INTEGER." 56 | ) 57 | if not isinstance(resource_max, int): 58 | raise Exception( 59 | "Can't initialize ASHA optimizer. 'resource_max'" 60 | + "not of type INTEGER." 61 | ) 62 | if resource_min >= resource_max: 63 | raise Exception( 64 | "Can't initialize ASHA optimizer. 'resource_min' is larger" 65 | + "than 'resource_max'." 66 | ) 67 | 68 | self.resource_min = resource_min 69 | self.resource_max = resource_max 70 | 71 | def initialize(self): 72 | 73 | # maps rung index k to trials in that rung 74 | self.rungs = {0: []} 75 | # maps rung index k to trial ids of trials that were promoted 76 | self.promoted = {0: []} 77 | 78 | self.max_rung = int( 79 | math.floor( 80 | math.log(self.resource_max / self.resource_min, self.reduction_factor) 81 | ) 82 | ) 83 | 84 | assert self.num_trials >= self.reduction_factor ** (self.max_rung + 1) 85 | 86 | def get_suggestion(self, trial=None): 87 | 88 | if trial is not None: 89 | # stopping criterium: one trial in max rung 90 | if self.max_rung in self.rungs: 91 | # return None to signal end to experiment driver 92 | return None 93 | 94 | # for each rung 95 | for k in range(self.max_rung - 1, -1, -1): 96 | # if rung doesn't exist yet go one lower 97 | if k not in self.rungs: 98 | continue 99 | 100 | # get top_k 101 | rung_finished = len( 102 | [x for x in self.rungs[k] if x.status == Trial.FINALIZED] 103 | ) 104 | 105 | if (rung_finished // self.reduction_factor) - len( 106 | self.promoted.get(k, []) 107 | ) > 0: 108 | candidates = self._top_k( 109 | k, (rung_finished // self.reduction_factor) 110 | ) 111 | else: 112 | candidates = [] 113 | 114 | # if there are no candidates, check one rung below 115 | if not candidates: 116 | continue 117 | 118 | # select all that haven't been promoted yet 119 | promotable = [ 120 | t for t in candidates if t.trial_id not in self.promoted.get(k, []) 121 | ] 122 | 123 | nr_promotable = len(promotable) 124 | if nr_promotable >= 1: 125 | new_rung = k + 1 126 | # sorted in decending order, take highest -> index 0 127 | old_trial = promotable[0] 128 | # make copy of params to be able to change resource 129 | params = old_trial.params.copy() 130 | params["budget"] = self.resource_min * ( 131 | self.reduction_factor**new_rung 132 | ) 133 | promote_trial = Trial(params) 134 | 135 | # open new rung if not exists 136 | if new_rung in self.rungs: 137 | self.rungs[new_rung].append(promote_trial) 138 | else: 139 | self.rungs[new_rung] = [promote_trial] 140 | 141 | # remember promoted trial 142 | if k in self.promoted: 143 | self.promoted[k].append(old_trial.trial_id) 144 | else: 145 | self.promoted[k] = [old_trial.trial_id] 146 | 147 | return promote_trial 148 | 149 | # else return random configuration in base rung 150 | 
params = self.searchspace.get_random_parameter_values(1)[0] 151 | # set resource to minimum 152 | params["budget"] = self.resource_min 153 | to_return = Trial(params) 154 | # add to bottom rung 155 | self.rungs[0].append(to_return) 156 | return to_return 157 | 158 | def finalize_experiment(self, trials): 159 | return 160 | 161 | def _top_k(self, rung_k, number): 162 | """Find top-`number` trials in `rung_k`.""" 163 | if number > 0: 164 | filtered = [x for x in self.rungs[rung_k] if x.status == Trial.FINALIZED] 165 | filtered.sort(key=lambda x: x.final_metric, reverse=True) 166 | # return top k trials if finalized 167 | return filtered[:number] 168 | else: 169 | return [] 170 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer.bayes import base, gp, tpe 18 | 19 | BaseAsyncBO = base.BaseAsyncBO 20 | GP = gp.GP 21 | TPE = tpe.TPE 22 | 23 | __all__ = [ 24 | "TPE", 25 | "BaseAsyncBO", 26 | "GP", 27 | ] 28 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/acquisitions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC 18 | from abc import abstractmethod 19 | 20 | import numpy as np 21 | from skopt.acquisition import _gaussian_acquisition 22 | from skopt.acquisition import gaussian_acquisition_1D 23 | 24 | 25 | class AbstractAcquisitionFunction(ABC): 26 | @staticmethod 27 | @abstractmethod 28 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 29 | """evaluates acquisition function at given points 30 | 31 | :param X: Values where the acquisition function should be computed. shape = (n_locations, n_hparams) 32 | :type X: np.ndarray 33 | :param surrogate_model: the surrogate model of the bayesian optimizer. 34 | :type surrogate_model: GaussianProcessRegressor 35 | :param y_opt: currently best observed value 36 | :type y_opt: float 37 | :param acq_func_kwargs: additional arguments for the acquisition function 38 | :type acq_func_kwargs: dict|None 39 | :return: Acquisition function values computed at X. 
shape = (n_locations,) 40 | :rtype: np.ndarray 41 | """ 42 | pass 43 | 44 | @staticmethod 45 | @abstractmethod 46 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 47 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 48 | This is because lbfgs allows only 1-D input. 49 | 50 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 51 | :type x: np.ndarray 52 | :param surrogate_model: the surrogate model of the bayesian optimizer. 53 | :type surrogate_model: GaussianProcessRegressor 54 | :param y_opt: currently best observed value 55 | :type y_opt: float 56 | :param acq_func_kwargs: additional arguments for the acquisition function 57 | :type acq_func_kwargs: dict|None 58 | :return: tuple containing two arrays. the first holds the evaluated values of the acquisition function at value 59 | x; shape = (1,) . the second holds the gradients; shape = (n_hparams,). 60 | :rtype: tuple 61 | """ 62 | pass 63 | 64 | def name(self): 65 | return str(self.__class__.__name__) 66 | 67 | 68 | class GaussianProcess_EI(AbstractAcquisitionFunction): 69 | """xi in acq_func_kwargs""" 70 | 71 | @staticmethod 72 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 73 | return _gaussian_acquisition( 74 | X=X, 75 | model=surrogate_model, 76 | y_opt=y_opt, 77 | acq_func="EI", 78 | acq_func_kwargs=acq_func_kwargs, 79 | ) 80 | 81 | @staticmethod 82 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 83 | return gaussian_acquisition_1D( 84 | X=x, 85 | model=surrogate_model, 86 | y_opt=y_opt, 87 | acq_func="EI", 88 | acq_func_kwargs=acq_func_kwargs, 89 | ) 90 | 91 | 92 | class GaussianProcess_PI(AbstractAcquisitionFunction): 93 | @staticmethod 94 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 95 | return _gaussian_acquisition( 96 | X=X, 97 | model=surrogate_model, 98 | y_opt=y_opt, 99 | acq_func="PI", 100 | acq_func_kwargs=acq_func_kwargs, 101 | ) 102 | 103 | @staticmethod 104 | def evaluate_1_d(X, surrogate_model, y_opt, acq_func_kwargs=None): 105 | return gaussian_acquisition_1D( 106 | X=X, 107 | model=surrogate_model, 108 | y_opt=y_opt, 109 | acq_func="PI", 110 | acq_func_kwargs=acq_func_kwargs, 111 | ) 112 | 113 | 114 | class GaussianProcess_LCB(AbstractAcquisitionFunction): 115 | """kappa in acq_func_kwargs""" 116 | 117 | @staticmethod 118 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 119 | return _gaussian_acquisition( 120 | X=X, 121 | model=surrogate_model, 122 | y_opt=y_opt, 123 | acq_func="LCB", 124 | acq_func_kwargs=acq_func_kwargs, 125 | ) 126 | 127 | @staticmethod 128 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 129 | return gaussian_acquisition_1D( 130 | X=x, 131 | model=surrogate_model, 132 | y_opt=y_opt, 133 | acq_func="LCB", 134 | acq_func_kwargs=acq_func_kwargs, 135 | ) 136 | 137 | 138 | class GaussianProcess_UCB(AbstractAcquisitionFunction): 139 | @staticmethod 140 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 141 | raise NotImplementedError 142 | 143 | @staticmethod 144 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 145 | raise NotImplementedError 146 | 147 | 148 | class TPE_EI(AbstractAcquisitionFunction): 149 | @staticmethod 150 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 151 | raise NotImplementedError 152 | 153 | @staticmethod 154 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 155 | raise NotImplementedError 156 | 157 | 158 | class 
AsyTS(AbstractAcquisitionFunction): 159 | @staticmethod 160 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 161 | return surrogate_model.sample_y(X).reshape( 162 | X.shape[0], 163 | ) 164 | 165 | @staticmethod 166 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 167 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 168 | This is because lbfgs allows only 1-D input. 169 | 170 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 171 | :type x: np.ndarray 172 | :param surogate_model: the surrogate model of the bayesian optimizer. 173 | :type surogate_model: GaussianProcessRegressor 174 | :param y_opt: currently best observed value 175 | :type y_opt: float 176 | :param acq_func_kwargs: additional arguments for the acquisition function 177 | :type acq_func_kwargs: dict|None 178 | :return: values of the acquisition function at value x. shape = (1,) 179 | :rtype: np.ndarray 180 | """ 181 | return surrogate_model.sample_y(np.expand_dims(x, axis=0)).reshape( 182 | 1, 183 | ) 184 | 185 | 186 | class HLP(AbstractAcquisitionFunction): 187 | @staticmethod 188 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 189 | raise NotImplementedError 190 | 191 | @staticmethod 192 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 193 | raise NotImplementedError 194 | -------------------------------------------------------------------------------- /maggy/optimizer/gridsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import itertools 18 | 19 | from maggy import Searchspace 20 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 21 | 22 | 23 | class GridSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | self._validate_searchspace(self.searchspace) 30 | # create all trials ahead of time 31 | self.config_buffer = self._grid_params(self.searchspace) 32 | 33 | @classmethod 34 | def get_num_trials(cls, searchspace): 35 | """For grid search the number of trials is determined by the size of the 36 | cartisian product, depending on the user-set number of parameters and values 37 | 38 | This method is duplicating part of the code in the `initialize()` mainly to keep 39 | the flow of things the same as for other optimizers, where the user sets only 40 | the number of trials to evaluate. 41 | """ 42 | cls._validate_searchspace(searchspace) 43 | return len(cls._grid_params(searchspace)) 44 | 45 | def get_suggestion(self, trial=None): 46 | # sampling routine for randomsearch + pruner 47 | if self.pruner: 48 | raise NotImplementedError( 49 | "Grid search in combination with trial pruning " 50 | "is currently not supported." 
51 | ) 52 | elif self.config_buffer: 53 | run_budget = 0 54 | next_trial_params = self.config_buffer.pop() 55 | next_trial = self.create_trial( 56 | hparams=next_trial_params, 57 | sample_type="grid", 58 | run_budget=run_budget, 59 | ) 60 | 61 | self._log( 62 | "start trial {}: {}, {} \n".format( 63 | next_trial.trial_id, next_trial.params, next_trial.info_dict 64 | ) 65 | ) 66 | 67 | return next_trial 68 | else: 69 | return None 70 | 71 | def finalize_experiment(self, trials): 72 | return 73 | 74 | @staticmethod 75 | def _grid_params(searchspace): 76 | return_list = [] 77 | for hparams in itertools.product( 78 | *[item["values"] for item in searchspace.items()] 79 | ): 80 | return_list.append(searchspace.list_to_dict(hparams)) 81 | return return_list 82 | 83 | @staticmethod 84 | def _validate_searchspace(searchspace): 85 | if ( 86 | Searchspace.DOUBLE in searchspace.names().values() 87 | or Searchspace.INTEGER in searchspace.names().values() 88 | ): 89 | raise NotImplementedError( 90 | "Searchspace can only contain `discrete` or `categorical` " 91 | "hyperparameters for grid search." 92 | ) 93 | -------------------------------------------------------------------------------- /maggy/optimizer/randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import time 17 | from copy import deepcopy 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.searchspace import Searchspace 21 | 22 | 23 | class RandomSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | 30 | if ( 31 | Searchspace.DOUBLE not in self.searchspace.names().values() 32 | and Searchspace.INTEGER not in self.searchspace.names().values() 33 | ): 34 | raise NotImplementedError( 35 | "Searchspace needs at least one continuous parameter for random search." 
36 | ) 37 | 38 | self.config_buffer = self.searchspace.get_random_parameter_values( 39 | self.num_trials 40 | ) 41 | 42 | def get_suggestion(self, trial=None): 43 | self._log("### start get_suggestion ###") 44 | self.sampling_time_start = time.time() 45 | 46 | # sampling routine for randomsearch + pruner 47 | if self.pruner: 48 | next_trial_info = self.pruner.pruning_routine() 49 | if next_trial_info == "IDLE": 50 | self._log( 51 | "Worker is IDLE and has to wait until a new trial can be scheduled" 52 | ) 53 | return "IDLE" 54 | elif next_trial_info is None: 55 | # experiment is finished 56 | self._log("Experiment has finished") 57 | return None 58 | elif next_trial_info["trial_id"]: 59 | # copy hparams of given promoted trial and start new trial with it 60 | parent_trial_id = next_trial_info["trial_id"] 61 | parent_trial_hparams = deepcopy( 62 | self.get_hparams_dict(trial_ids=parent_trial_id)[parent_trial_id] 63 | ) 64 | # update trial info dict and create new trial object 65 | next_trial = self.create_trial( 66 | hparams=parent_trial_hparams, 67 | sample_type="promoted", 68 | run_budget=next_trial_info["budget"], 69 | ) 70 | self._log("use hparams from promoted trial {}".format(parent_trial_id)) 71 | else: 72 | # start sampling procedure with given budget 73 | parent_trial_id = None 74 | run_budget = next_trial_info["budget"] 75 | hparams = self.searchspace.get_random_parameter_values(1)[0] 76 | next_trial = self.create_trial( 77 | hparams=hparams, sample_type="random", run_budget=run_budget 78 | ) 79 | 80 | # report new trial id to pruner 81 | self.pruner.report_trial( 82 | original_trial_id=parent_trial_id, new_trial_id=next_trial.trial_id 83 | ) 84 | 85 | self._log( 86 | "start trial {}: {}. info_dict: {} \n".format( 87 | next_trial.trial_id, next_trial.params, next_trial.info_dict 88 | ) 89 | ) 90 | return next_trial 91 | 92 | # sampling routine for pure random search 93 | elif self.config_buffer: 94 | run_budget = 0 95 | next_trial_params = self.config_buffer.pop() 96 | next_trial = self.create_trial( 97 | hparams=next_trial_params, 98 | sample_type="random", 99 | run_budget=run_budget, 100 | ) 101 | 102 | self._log( 103 | "start trial {}: {}, {} \n".format( 104 | next_trial.trial_id, next_trial.params, next_trial.info_dict 105 | ) 106 | ) 107 | 108 | return next_trial 109 | else: 110 | return None 111 | 112 | def finalize_experiment(self, trials): 113 | return 114 | -------------------------------------------------------------------------------- /maggy/optimizer/singlerun.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 18 | from maggy.trial import Trial 19 | 20 | 21 | class SingleRun(AbstractOptimizer): 22 | def __init__(self): 23 | super().__init__() 24 | self.trial_buffer = [] 25 | 26 | def initialize(self): 27 | for _ in range(self.num_trials): 28 | self.trial_buffer.append(Trial({})) 29 | 30 | def get_suggestion(self, trial=None): 31 | if self.trial_buffer: 32 | return self.trial_buffer.pop() 33 | else: 34 | return None 35 | 36 | def finalize_experiment(self, trials): 37 | return 38 | -------------------------------------------------------------------------------- /maggy/pruner/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.pruner import hyperband, abstractpruner 18 | 19 | Hyperband = hyperband.Hyperband 20 | AbstractPruner = abstractpruner.AbstractPruner 21 | 22 | __all__ = ["Hyperband", "AbstractPruner"] 23 | -------------------------------------------------------------------------------- /maggy/pruner/abstractpruner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | from datetime import datetime 19 | 20 | from maggy.core.environment.singleton import EnvSing 21 | 22 | 23 | class AbstractPruner(ABC): 24 | def __init__(self, trial_metric_getter): 25 | """ 26 | :param trial_metric_getter: a function that returns a dict with `trial_id` as key and `metric` as value, 27 | with the lowest metric being the "best". 28 | Its only argument is `trial_ids`, which can be either a str for a single trial id or a list of trial ids. 29 | :type trial_metric_getter: function 30 | """ 31 | 32 | self.trial_metric_getter = trial_metric_getter 33 | 34 | # logger variables 35 | self.log_file = None 36 | self.fd = None 37 | 38 | @abstractmethod 39 | def pruning_routine(self): 40 | """ 41 | runs the pruning routine.
42 | interface to the `optimizer` 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def report_trial(self): 48 | """ 49 | hook for reporting trial id of created trial from optimizer to pruner 50 | """ 51 | pass 52 | 53 | @abstractmethod 54 | def finished(self): 55 | """ 56 | checks if experiment is finished 57 | """ 58 | pass 59 | 60 | @abstractmethod 61 | def num_trials(self): 62 | """ 63 | calculates the number of trials in the experiment 64 | 65 | :return: number of trials 66 | :rtype: int 67 | """ 68 | 69 | def name(self): 70 | return str(self.__class__.__name__) 71 | 72 | def initialize_logger(self, exp_dir): 73 | """Initialize logger of the pruner 74 | 75 | :param exp_dir: path of experiment directory 76 | :type exp_dir: str 77 | """ 78 | env = EnvSing.get_instance() 79 | # configure logger 80 | self.log_file = exp_dir + "/pruner.log" 81 | 82 | if not env.exists(self.log_file): 83 | env.dump("", self.log_file) 84 | self.fd = env.open_file(self.log_file, flags="w") 85 | self._log("Initialized Pruner Logger") 86 | 87 | def _log(self, msg): 88 | if self.fd and not self.fd.closed: 89 | msg = datetime.now().isoformat() + ": " + str(msg) 90 | self.fd.write(EnvSing.get_instance().str_or_byte(msg + "\n")) 91 | 92 | def _close_log(self): 93 | if not self.fd.closed: 94 | self.fd.flush() 95 | self.fd.close() 96 | -------------------------------------------------------------------------------- /maggy/tensorboard.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Module to encapsulate functionality related to writing to the tensorboard 19 | log dir and programmatically structure the outputs. 20 | """ 21 | 22 | import tensorflow as tf 23 | from tensorboard.plugins.hparams import api as hp 24 | 25 | _tensorboard_dir = None 26 | 27 | 28 | def _register(trial_dir): 29 | global _tensorboard_dir 30 | _tensorboard_dir = trial_dir 31 | 32 | 33 | def logdir(): 34 | """Returns the path to the tensorboard log directory. 35 | 36 | Instead of hardcoding a log dir path in a training function, users should 37 | make use of this function call, which will programmatically create a folder 38 | structure for tensorboard to visualize the machine learning experiment.
39 | 40 | :return: Path of the log directory in HOPSFS 41 | :rtype: str 42 | """ 43 | global _tensorboard_dir 44 | return _tensorboard_dir 45 | 46 | 47 | def _create_hparams_config(searchspace): 48 | hparams = [] 49 | 50 | for key, val in searchspace.names().items(): 51 | if val == "DOUBLE": 52 | hparams.append( 53 | hp.HParam( 54 | key, 55 | hp.RealInterval( 56 | float(searchspace.get(key)[0]), float(searchspace.get(key)[1]) 57 | ), 58 | ) 59 | ) 60 | elif val == "INTEGER": 61 | hparams.append( 62 | hp.HParam( 63 | key, 64 | hp.IntInterval(searchspace.get(key)[0], searchspace.get(key)[1]), 65 | ) 66 | ) 67 | elif val == "DISCRETE": 68 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 69 | elif val == "CATEGORICAL": 70 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 71 | 72 | return hparams 73 | 74 | 75 | def _write_hparams_config(log_dir, searchspace): 76 | HPARAMS = _create_hparams_config(searchspace) 77 | METRICS = [ 78 | hp.Metric( 79 | "epoch_accuracy", 80 | group="validation", 81 | display_name="accuracy (val.)", 82 | ), 83 | hp.Metric( 84 | "epoch_loss", 85 | group="validation", 86 | display_name="loss (val.)", 87 | ), 88 | hp.Metric( 89 | "epoch_accuracy", 90 | group="train", 91 | display_name="accuracy (train)", 92 | ), 93 | hp.Metric( 94 | "epoch_loss", 95 | group="train", 96 | display_name="loss (train)", 97 | ), 98 | ] 99 | 100 | with tf.summary.create_file_writer(log_dir).as_default(): 101 | hp.hparams_config(hparams=HPARAMS, metrics=METRICS) 102 | 103 | 104 | def _write_hparams(hparams, trial_id): 105 | global _tensorboard_dir 106 | with tf.summary.create_file_writer(_tensorboard_dir).as_default(): 107 | hp.hparams(hparams, trial_id) 108 | -------------------------------------------------------------------------------- /maggy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ pytest fixtures that can be reused across tests.
the filename needs to be conftest.py 18 | """ 19 | 20 | # make sure env variables are set correctly 21 | import findspark # this needs to be the first import 22 | 23 | findspark.init() 24 | 25 | import logging 26 | import pytest 27 | 28 | from pyspark import HiveContext 29 | from pyspark import SparkConf 30 | from pyspark import SparkContext 31 | from pyspark.streaming import StreamingContext 32 | 33 | 34 | def quiet_py4j(): 35 | """turn down spark logging for the test context""" 36 | logger = logging.getLogger("py4j") 37 | logger.setLevel(logging.WARN) 38 | 39 | 40 | def pytest_addoption(parser): 41 | parser.addoption( 42 | "--spark-master", 43 | action="store", 44 | default=None, 45 | help='spark-master: "spark://name.local:7077"', 46 | ) 47 | 48 | 49 | @pytest.fixture(scope="session") 50 | def sc(request): 51 | """fixture for creating a spark context 52 | Args: 53 | request: pytest.FixtureRequest object 54 | """ 55 | 56 | assert ( 57 | request.config.getoption("--spark-master") is not None 58 | ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" ' 59 | 60 | conf = ( 61 | SparkConf() 62 | .setMaster(request.config.getoption("--spark-master")) 63 | .setAppName("pytest-pyspark-local-testing") 64 | .set("spark.dynamicAllocation.maxExecutors", 2) 65 | .set("spark.executor.instances", 2) 66 | ) 67 | scont = SparkContext(conf=conf) 68 | request.addfinalizer(lambda: scont.stop()) 69 | 70 | quiet_py4j() 71 | return scont 72 | 73 | 74 | @pytest.fixture(scope="session") 75 | def hive_context(sc): 76 | """fixture for creating a Hive Context. Creating a fixture enables it to be reused across all 77 | tests in a session 78 | Args: 79 | spark_context: spark_context fixture 80 | Returns: 81 | HiveContext for tests 82 | """ 83 | return HiveContext(sc) 84 | 85 | 86 | @pytest.fixture(scope="session") 87 | def streaming_context(sc): 88 | return StreamingContext(sc, 1) 89 | -------------------------------------------------------------------------------- /maggy/tests/test_maggy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | from maggy.searchspace import Searchspace 19 | from maggy.optimizer import RandomSearch 20 | 21 | # this allows using the fixture in all tests in this module 22 | pytestmark = pytest.mark.usefixtures("sc") 23 | 24 | 25 | def test_nr_executors(sc): 26 | 27 | executor_instances = int(sc._conf.get("spark.executor.instances")) 28 | expected_number = 2 29 | assert executor_instances == expected_number 30 | 31 | 32 | def test_random_search(sc): 33 | 34 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 35 | 36 | rs = RandomSearch() 37 | rs.searchspace = sp 38 | 39 | rs.num_trials = 5 40 | exp_result = {"argument_param": "DOUBLE"} 41 | 42 | assert sp.names() == exp_result 43 | assert rs.num_trials == 5 44 | assert rs.searchspace == sp 45 | -------------------------------------------------------------------------------- /maggy/tests/test_randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | import tensorflow as tf 22 | from tensorflow import keras 23 | import numpy as np 24 | 25 | from maggy.searchspace import Searchspace 26 | from maggy.optimizer import RandomSearch 27 | from maggy import experiment 28 | from maggy.config import HyperparameterOptConfig, TfDistributedConfig 29 | 30 | # this allows using the fixture in all tests in this module 31 | pytestmark = pytest.mark.usefixtures("sc") 32 | 33 | 34 | def test_randomsearch_init(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 37 | 38 | rs = RandomSearch(5, sp, []) 39 | 40 | assert rs.num_trials == 5 41 | assert rs.searchspace == sp 42 | 43 | 44 | def test_randomsearch_initialize(): 45 | 46 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 47 | 48 | rs = RandomSearch(5, sp, []) 49 | 50 | rs.initialize() 51 | 52 | assert len(rs.trial_buffer) == 5 53 | 54 | 55 | def test_rs_initialize2(): 56 | 57 | sp = Searchspace(argument_param=("DISCRETE", [1, 5])) 58 | 59 | rs = RandomSearch() 60 | rs.searchspace = sp 61 | 62 | with pytest.raises(NotImplementedError) as excinfo: 63 | rs.initialize() 64 | assert "Searchspace needs at least one continuous parameter" in str(excinfo.value) 65 | 66 | 67 | def test_randomsearch(sc): 68 | def train(model, train_set, test_set, hparams, reporter): 69 | 70 | if "argument_param" in hparams.keys(): 71 | print( 72 | "Entered train function with param {}".format(hparams["argument_param"]) 73 | ) 74 | 75 | for i in range(5): 76 | acc = i + random.random() 77 | reporter.broadcast(metric=acc) 78 | reporter.log("Metric: {}".format(acc)) 79 | 80 | # do something with HP. 
81 | if "argument_param" in hparams.keys(): 82 | time.sleep(hparams["argument_param"]) 83 | 84 | return acc 85 | 86 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 87 | 88 | config = HyperparameterOptConfig( 89 | searchspace=sp, 90 | optimizer="randomsearch", 91 | direction="max", 92 | num_trials=5, 93 | name="test", 94 | hb_interval=1, 95 | es_interval=10, 96 | ) 97 | 98 | result = experiment.lagom(train_fn=train, config=config) 99 | assert type(result) == type({}) 100 | 101 | test_dt_tensorflow(sc) 102 | 103 | 104 | def test_dt_tensorflow(sc): 105 | 106 | mnist = tf.keras.datasets.mnist 107 | 108 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 109 | 110 | x_train, x_test = x_train / 255.0, x_test / 255.0 111 | x_train = np.reshape(x_train, (60000, 28, 28, 1)) 112 | x_test = np.reshape(x_test, (10000, 28, 28, 1)) 113 | 114 | def training_function(model, train_set, test_set, hparams): 115 | from tensorflow import keras 116 | 117 | # Define training parameters 118 | num_epochs = 10 119 | batch_size = 256 120 | learning_rate = 0.1 121 | 122 | criterion = keras.losses.SparseCategoricalCrossentropy() 123 | optimizer = keras.optimizers.SGD( 124 | learning_rate=learning_rate, momentum=0.9, decay=1e-5 125 | ) 126 | 127 | model = model(nlayers=2) 128 | 129 | model.compile(optimizer=optimizer, loss=criterion, metrics=["accuracy"]) 130 | 131 | model.fit( 132 | x_train, 133 | y_train, 134 | # batch_size=batch_size, 135 | # epochs=num_epochs, 136 | ) 137 | 138 | print("Testing") 139 | 140 | loss = model.evaluate(x_test, y_test) 141 | 142 | return loss 143 | 144 | class NeuralNetwork(tf.keras.Model): 145 | def __init__(self, nlayers): 146 | super().__init__() 147 | self.conv1 = keras.layers.Conv2D(28, 2, activation="relu") 148 | self.flatten = keras.layers.Flatten() 149 | self.d1 = keras.layers.Dense(32, activation="relu") 150 | self.d2 = keras.layers.Dense(10, activation="softmax") 151 | 152 | def call(self, x): 153 | x = self.conv1(x) 154 | x = self.flatten(x) 155 | x = self.d1(x) 156 | return self.d2(x) 157 | 158 | model = NeuralNetwork 159 | 160 | # define the constructor parameters of your model 161 | model_parameters = { 162 | "train_batch_size": 30000, 163 | "test_batch_size": 5000, 164 | "nlayers": 2, 165 | } 166 | 167 | # pass the model parameters in the last 168 | config = TfDistributedConfig( 169 | name="tf_test", 170 | model=model, 171 | train_set=None, 172 | test_set=None, 173 | hparams=model_parameters, 174 | ) 175 | 176 | result = experiment.lagom(train_fn=training_function, config=config) 177 | 178 | assert type(result) == list 179 | -------------------------------------------------------------------------------- /maggy/tests/test_searchspace.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Searchspace 22 | 23 | 24 | def test_searchspace_init(): 25 | 26 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 27 | 28 | exp_get = [1, 5] 29 | 30 | assert sp.get("argument_param") == exp_get 31 | assert sp.argument_param == exp_get # pylint: disable=no-member 32 | 33 | 34 | def test_searchspace_add(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 37 | 38 | with pytest.raises(ValueError) as excinfo: 39 | sp.add("argument_param", ("DOUBLE", [1, 5])) 40 | assert "Hyperparameter name is reserved" in str(excinfo.value) 41 | 42 | with pytest.raises(ValueError) as excinfo: 43 | # add tuple with too many elements 44 | sp.add("param", ("DOUBLE", [1, 5], "too many")) 45 | assert "Hyperparameter tuple has to be of length two" in str(excinfo.value) 46 | 47 | with pytest.raises(ValueError) as excinfo: 48 | # add unknown type 49 | sp.add("param", ("FLOAT", [1, 5])) 50 | assert "Hyperparameter type is not of type " in str(excinfo.value) 51 | 52 | with pytest.raises(ValueError) as excinfo: 53 | # add empty region list 54 | sp.add("param", ("DOUBLE", [])) 55 | assert "Hyperparameter feasible region list" in str(excinfo.value) 56 | 57 | with pytest.raises(AssertionError) as excinfo: 58 | # add incompatible type and feasible region 59 | sp.add("param", ("DOUBLE", [1, 5, 5])) 60 | sp.add("param2", ("INTEGER", [1, 5, 5])) 61 | assert "For DOUBLE or " in str(excinfo.value) 62 | 63 | with pytest.raises(AssertionError) as excinfo: 64 | # lower bound higher than upper bound 65 | sp.add("param", ("DOUBLE", [5, 1])) 66 | sp.add("param2", ("INTEGER", [4, 1])) 67 | assert "Lower bound " in str(excinfo.value) 68 | 69 | with pytest.raises(ValueError) as excinfo: 70 | # Non integer boundaries for integer type parameter 71 | sp.add("param2", ("INTEGER", [1.5, 5])) 72 | assert "type INTEGER need to be integer:" in str(excinfo.value) 73 | 74 | with pytest.raises(ValueError) as excinfo: 75 | # Non numeric interval boundaries 76 | sp.add("param2", ("DOUBLE", ["lower", 5])) 77 | assert "type DOUBLE need to be integer or float:" in str(excinfo.value) 78 | -------------------------------------------------------------------------------- /maggy/tests/test_trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Trial 22 | 23 | 24 | def test_trial_init(): 25 | 26 | trial = Trial({"param1": 5, "param2": "ada"}) 27 | 28 | exp = {"param1": 5, "param2": "ada"} 29 | 30 | assert trial.params == exp 31 | assert trial.status == Trial.PENDING 32 | assert trial.trial_id == "3d1cc9fdb1d4d001" 33 | 34 | 35 | def test_trial_serialization(): 36 | 37 | trial = Trial({"param1": 5, "param2": "ada"}) 38 | 39 | exp = {"param1": 5, "param2": "ada"} 40 | 41 | json_str = trial.to_json() 42 | 43 | new_trial = Trial.from_json(json_str) 44 | 45 | assert isinstance(new_trial, Trial) 46 | assert new_trial.params == exp 47 | assert new_trial.status == Trial.PENDING 48 | assert new_trial.trial_id == "3d1cc9fdb1d4d001" 49 | -------------------------------------------------------------------------------- /maggy/tests/test_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | from operator import add 19 | 20 | # this allows using the fixture in all tests in this module 21 | pytestmark = pytest.mark.usefixtures("sc") 22 | 23 | # Can also use a decorator such as this to use specific fixtures in specific functions 24 | # @pytest.mark.usefixtures("spark_context", "hive_context") 25 | 26 | 27 | def do_word_counts(lines): 28 | """count of words in an rdd of lines""" 29 | 30 | counts = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(add) 31 | results = {word: count for word, count in counts.collect()} 32 | return results 33 | 34 | 35 | # start function with test_ so pytest can discover them 36 | def test_do_word_counts(sc): 37 | """test that word counts are computed correctly for a small input 38 | Args: 39 | sc: test fixture SparkContext 40 | (session-scoped Spark context fixture from conftest.py) 41 | """ 42 | 43 | test_input = [" hello spark ", " hello again spark spark"] 44 | 45 | input_rdd = sc.parallelize(test_input, 1) 46 | results = do_word_counts(input_rdd) 47 | 48 | expected_results = {"hello": 2, "spark": 3, "again": 1} 49 | assert results == expected_results 50 | -------------------------------------------------------------------------------- /maggy/trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import threading 19 | import hashlib 20 | 21 | from maggy import util 22 | 23 | 24 | class Trial(object): 25 | """A Trial object contains all relevant information about the evaluation 26 | of a hyperparameter combination. 27 | 28 | It is used as shared memory between 29 | the worker thread and the rpc server thread. The server thread performs only 30 | lookups on the `early_stop` and `params` attributes. 31 | """ 32 | 33 | PENDING = "PENDING" 34 | SCHEDULED = "SCHEDULED" 35 | RUNNING = "RUNNING" 36 | ERROR = "ERROR" 37 | FINALIZED = "FINALIZED" 38 | 39 | def __init__(self, params, trial_type="optimization", info_dict=None): 40 | """Create a new trial object from a hyperparameter combination 41 | ``params``. 42 | 43 | :param params: A dictionary of Hyperparameters as key value pairs. 44 | :type params: dict 45 | :param info_dict: dict containing additional information about the trial including 46 | - sample_type 47 | - sampling_time 48 | - run_budget 49 | - model_budget (optionally) 50 | see `create_trial()` method of base.py for further reference 51 | :type info_dict: dict 52 | """ 53 | # XXX before merge, we should remove the default value for trial_type 54 | # and make sure everywhere Trial() is called (e.g. in all optimizers) 55 | # trial_type is passed 56 | # @Moritz 57 | 58 | self.trial_type = trial_type 59 | # XXX temp fix, have to come up with abstractions 60 | if self.trial_type == "optimization": 61 | self.trial_id = Trial._generate_id(params) 62 | elif self.trial_type == "ablation": 63 | serializable_params = { 64 | "ablated_feature": params.get("ablated_feature", None), 65 | "ablated_layer": params.get("ablated_layer", None), 66 | } 67 | self.trial_id = Trial._generate_id(serializable_params) 68 | self.params = params 69 | self.status = Trial.PENDING 70 | self.early_stop = False 71 | self.final_metric = None 72 | self.metric_history = [] 73 | self.step_history = [] 74 | self.metric_dict = {} 75 | self.start = None 76 | self.duration = None 77 | self.lock = threading.RLock() 78 | if info_dict is None: 79 | self.info_dict = {} 80 | else: 81 | self.info_dict = info_dict 82 | 83 | def get_early_stop(self): 84 | """Return the early stopping flag of the trial.""" 85 | with self.lock: 86 | return self.early_stop 87 | 88 | def set_early_stop(self): 89 | """Set the early stopping flag of the trial to true.""" 90 | with self.lock: 91 | self.early_stop = True 92 | 93 | def append_metric(self, metric_data): 94 | """Append a metric from the heartbeats to the history.""" 95 | with self.lock: 96 | # from python 3.7 dicts are insertion ordered, 97 | # so two of these data structures can be removed 98 | if ( 99 | metric_data["step"] not in self.metric_dict 100 | and metric_data["value"] is not None 101 | ): 102 | self.metric_dict[metric_data["step"]] = metric_data["value"] 103 | self.metric_history.append(metric_data["value"]) 104 | self.step_history.append(metric_data["step"]) 105 | # return step number to indicate that it was a new unique step 106 | return metric_data["step"] 107 | # return None to indicate that no new step has finished 108 | return None 109 | 110 | @classmethod 111 | def _generate_id(cls, params): 112 | """ 113 | Class method to generate a hash from a hyperparameter dictionary. 114 | 115 | All keys in the dictionary have to be strings. The hash is an md5 hash 116 | truncated to 16 characters and is stable across processes.
117 | 118 | :param params: Hyperparameters 119 | :type params: dictionary 120 | :raises ValueError: All hyperparameter names have to be strings. 121 | :raises ValueError: Hyperparameters need to be a dictionary. 122 | :return: Sixteen character truncated md5 hash 123 | :rtype: str 124 | """ 125 | 126 | # ensure params is a dictionary 127 | if isinstance(params, dict): 128 | # check that all keys are strings 129 | if False in set(isinstance(k, str) for k in params.keys()): 130 | raise ValueError("All hyperparameter names have to be strings.") 131 | 132 | return hashlib.md5( 133 | json.dumps(params, sort_keys=True).encode("utf-8") 134 | ).hexdigest()[:16] 135 | 136 | raise ValueError("Hyperparameters need to be a dictionary.") 137 | 138 | def to_json(self): 139 | return json.dumps(self.to_dict(), default=util.json_default_numpy) 140 | 141 | def to_dict(self): 142 | obj_dict = {"__class__": self.__class__.__name__} 143 | 144 | temp_dict = self.__dict__.copy() 145 | temp_dict.pop("lock") 146 | temp_dict.pop("start") 147 | 148 | obj_dict.update(temp_dict) 149 | 150 | return obj_dict 151 | 152 | @classmethod 153 | def from_json(cls, json_str): 154 | """Creates a Trial instance from a previously json serialized Trial 155 | object instance. 156 | 157 | :param json_str: String containing the object. 158 | :type json_str: str 159 | :raises ValueError: json_str is not a Trial object. 160 | :return: Instantiated object instance of Trial. 161 | :rtype: Trial 162 | """ 163 | 164 | temp_dict = json.loads(json_str) 165 | if temp_dict.get("__class__", None) != "Trial": 166 | raise ValueError("json_str is not a Trial object.") 167 | if temp_dict.get("params", None) is not None: 168 | instance = cls(temp_dict.get("params")) 169 | instance.trial_id = temp_dict["trial_id"] 170 | instance.status = temp_dict["status"] 171 | instance.early_stop = temp_dict.get("early_stop", False) 172 | instance.final_metric = temp_dict["final_metric"] 173 | instance.metric_history = temp_dict["metric_history"] 174 | instance.duration = temp_dict["duration"] 175 | 176 | return instance 177 | -------------------------------------------------------------------------------- /maggy/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | __version__ = "1.1.2" 18 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "MAGGY" 2 | site_description: "Official website and documentation for MAGGY - Distribution transparent Machine Learning experiments on Apache Spark." 
3 | site_author: "Logical Clocks" 4 | site_url: "https://maggy.ai" 5 | 6 | # Repository 7 | repo_name: logicalclocks/maggy 8 | repo_url: https://github.com/logicalclocks/maggy 9 | edit_uri: "" 10 | 11 | nav: 12 | - Home: 13 | - Introduction: README.md 14 | - Blogs: blogs.md 15 | - Publications: publications.md 16 | - Releases: releases.md 17 | - Contributing: CONTRIBUTING.md 18 | - Issues: https://github.com/logicalclocks/maggy/issues 19 | - Hopsworks.ai: https://hopsworks.ai/ 20 | - Getting Started: 21 | - Installation: start/install.md 22 | - Quickstart: start/quickstart.md 23 | - Hyperparameter Optimization: 24 | - Introduction: hpo/intro.md 25 | - Strategies: hpo/strategies.md 26 | - Ablation Studies: 27 | - Introduction: ablation/intro.md 28 | - Distributed Training: 29 | - Introduction: dist_training/intro.md 30 | - TensorFlow: dist_training/tensorflow.md 31 | - PyTorch: dist_training/torch.md 32 | 33 | theme: 34 | name: material 35 | favicon: assets/images/maggyfav.png 36 | logo: assets/images/whitemaggy-eye.svg 37 | icon: 38 | repo: fontawesome/brands/github 39 | font: 40 | text: "Roboto" 41 | palette: 42 | accent: orange 43 | features: 44 | - navigation.tabs 45 | - navigation.tabs.sticky 46 | 47 | extra: 48 | generator: false 49 | social: 50 | - icon: fontawesome/brands/twitter 51 | link: https://twitter.com/logicalclocks 52 | - icon: fontawesome/brands/github 53 | link: https://github.com/logicalclocks 54 | - icon: fontawesome/brands/discourse 55 | link: https://community.hopsworks.ai/ 56 | - icon: fontawesome/brands/linkedin 57 | link: https://www.linkedin.com/company/logicalclocks/ 58 | analytics: 59 | provider: google 60 | property: G-J3F4GSLKE8 61 | 62 | extra_css: 63 | - assets/css/custom.css 64 | - assets/css/version-select.css 65 | 66 | extra_javascript: 67 | - assets/javascript/version-select.js 68 | 69 | plugins: 70 | - search 71 | 72 | markdown_extensions: 73 | - admonition 74 | - codehilite 75 | - footnotes 76 | - pymdownx.tabbed: 77 | alternate_style: true 78 | - pymdownx.arithmatex 79 | - pymdownx.superfences 80 | - pymdownx.details 81 | - pymdownx.caret 82 | - pymdownx.mark 83 | - pymdownx.tilde 84 | - pymdownx.critic 85 | - toc: 86 | permalink: "#" 87 | toc_depth: 3 88 | - pymdownx.tasklist: 89 | custom_checkbox: true 90 | - markdown_include.include: 91 | base_path: docs 92 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = maggy/tests 3 | max-line-length = 80 4 | select = C,E,F,W,B,B950 5 | ignore = E203, E501, W503 6 | per-file-ignores = 7 | maggy/experiment/experiment_python.py:F403, F405 8 | maggy/experiment/experiment_pyspark.py:F403, F405 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from setuptools import setup, find_packages 19 | from importlib.machinery import SourceFileLoader 20 | 21 | 22 | version = ( 23 | SourceFileLoader("maggy.version", os.path.join("maggy", "version.py")).load_module().__version__ 24 | ) 25 | 26 | 27 | def read(fname): 28 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 29 | 30 | 31 | setup( 32 | name='maggy', 33 | version=version, 34 | install_requires=[ 35 | 'numpy>=1.19.2', 'scikit-optimize==0.9.0', 'statsmodels==0.12.2', 'scipy==1.10.0' 36 | ], 37 | extras_require={ 38 | 'pydoop': ['pydoop'], 39 | 'tf': ['tensorflow==2.4.1'], 40 | 'torch': ['torch==1.7.1'], # Should be 1.8.1 if we want to support PyTorch's ZeRO. 41 | 'zero': ['deepspeed==0.3.13', 42 | 'fairscale==0.3.0'], 43 | 'docs': [ 44 | 'mkdocs==1.5.3', 45 | 'mike==2.0.0', 46 | 'mkdocs-material==9.5.10', 47 | 'markdown-include==0.8.1', 48 | ], 49 | 'dev': [ 50 | 'black==20.8b1', 51 | 'flake8==3.9.0', 52 | 'pre-commit==2.11.1', 53 | ], 54 | 'spark': ['pyspark==2.4.3'] 55 | }, 56 | author='Moritz Meister', 57 | author_email='moritz@logicalclocks.com', 58 | description='Distribution transparent Machine Learning experiments on Apache Spark ', 59 | license='Apache License 2.0', 60 | keywords='Hyperparameter, Optimization, Distributed, Training, Keras, PyTorch, TensorFlow, Spark', 61 | url='https://github.com/logicalclocks/maggy', 62 | download_url='', 63 | packages=find_packages(), 64 | long_description=read('README.md'), 65 | long_description_content_type="text/markdown", 66 | python_requires=">=3.7", 67 | classifiers=[ 68 | 'Development Status :: 5 - Production/Stable', 69 | 'Topic :: Utilities', 70 | 'License :: OSI Approved :: Apache Software License', 71 | 'Programming Language :: Python :: 3', 72 | 'Programming Language :: Python :: 3.7', 73 | 'Intended Audience :: Developers', 74 | ] 75 | ) 76 | --------------------------------------------------------------------------------
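A minimal usage sketch of the hyperparameter optimization API contained in this tree (Searchspace, HyperparameterOptConfig and experiment.lagom), mirroring the calls exercised in maggy/tests/test_randomsearch.py. The hyperparameter name, the toy metric and the training loop below are illustrative placeholders, and a running Spark/Hopsworks environment (as set up in maggy/tests/conftest.py) is assumed:

from maggy import experiment
from maggy.searchspace import Searchspace
from maggy.config import HyperparameterOptConfig

# Search space with one continuous hyperparameter; RandomSearch.initialize()
# requires at least one DOUBLE or INTEGER parameter.
sp = Searchspace(learning_rate=("DOUBLE", [0.01, 0.1]))


def train(model, train_set, test_set, hparams, reporter):
    # Illustrative training loop: broadcast a heartbeat metric for every step
    # and return the final metric that the optimizer maximizes.
    metric = 0.0
    for step in range(5):
        metric += hparams["learning_rate"]
        reporter.broadcast(metric=metric)
        reporter.log("Metric: {}".format(metric))
    return metric


config = HyperparameterOptConfig(
    searchspace=sp,
    optimizer="randomsearch",
    direction="max",
    num_trials=5,
    name="usage_sketch",
    hb_interval=1,
    es_interval=10,
)

# Returns a dict of experiment results, as asserted in the tests.
result = experiment.lagom(train_fn=train, config=config)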