├── .github └── workflows │ ├── mkdocs.yml │ └── pre-commit.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── CONTRIBUTING.md ├── README.md ├── ablation │ └── intro.md ├── assets │ ├── css │ │ ├── custom.css │ │ └── version-select.css │ ├── images │ │ ├── databricks_installation.png │ │ ├── firstgraph.png │ │ ├── hopsworks_installation.png │ │ ├── maggy.png │ │ ├── maggy_dt_video.png │ │ ├── maggy_hpo_video.png │ │ ├── maggyfav.png │ │ ├── scdgraph.png │ │ └── whitemaggy-eye.svg │ └── javascript │ │ └── version-select.js ├── blogs.md ├── dist_training │ ├── intro.md │ ├── tensorflow.md │ └── torch.md ├── hpo │ ├── intro.md │ └── strategies.md ├── publications.md ├── releases.md └── start │ ├── install.md │ └── quickstart.md ├── examples ├── Databricks │ ├── maggy-databricks-iris.ipynb │ └── maggy-databricks-mnist-example.ipynb └── README.md ├── maggy ├── __init__.py ├── ablation │ ├── __init__.py │ ├── ablationstudy.py │ └── ablator │ │ ├── __init__.py │ │ ├── abstractablator.py │ │ └── loco.py ├── callbacks.py ├── config │ ├── __init__.py │ ├── ablation.py │ ├── base_config.py │ ├── hyperparameter_optimization.py │ ├── lagom.py │ ├── tf_distributed.py │ └── torch_distributed.py ├── constants.py ├── core │ ├── __init__.py │ ├── config.py │ ├── environment │ │ ├── __init__.py │ │ ├── base.py │ │ ├── databricks.py │ │ ├── hopsworks.py │ │ └── singleton.py │ ├── exceptions.py │ ├── executors │ │ ├── __init__.py │ │ ├── base_executor.py │ │ ├── tf_dist_executor.py │ │ ├── torch_dist_executor.py │ │ └── trial_executor.py │ ├── experiment_driver │ │ ├── __init__.py │ │ ├── ablation_driver.py │ │ ├── base_driver.py │ │ ├── optimization_driver.py │ │ ├── python_driver.py │ │ ├── spark_driver.py │ │ ├── tf_distributed_training_driver.py │ │ └── torch_distributed_training_driver.py │ ├── patching │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── modules.py │ │ └── optim.py │ ├── reporter.py │ ├── rpc.py │ └── tf_patching │ │ ├── __init__.py │ │ └── tf_modules.py ├── earlystop │ ├── __init__.py │ ├── abstractearlystop.py │ ├── medianrule.py │ └── nostop.py ├── experiment │ ├── __init__.py │ ├── experiment.py │ ├── experiment_pyspark.py │ └── experiment_python.py ├── optimizer │ ├── __init__.py │ ├── abstractoptimizer.py │ ├── asha.py │ ├── bayes │ │ ├── __init__.py │ │ ├── acquisitions.py │ │ ├── base.py │ │ ├── gp.py │ │ └── tpe.py │ ├── gridsearch.py │ ├── randomsearch.py │ └── singlerun.py ├── pruner │ ├── __init__.py │ ├── abstractpruner.py │ └── hyperband.py ├── searchspace.py ├── tensorboard.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_maggy.py │ ├── test_randomsearch.py │ ├── test_searchspace.py │ ├── test_trial.py │ └── test_wordcount.py ├── trial.py ├── util.py └── version.py ├── mkdocs.yml ├── setup.cfg └── setup.py /.github/workflows/mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: mkdocs 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | publish-master: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | with: 14 | fetch-depth: 0 15 | - uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.8' 18 | - name: install deps 19 | run: pip install .[dev,docs] 20 | 21 | - name: copy files 22 | run: | 23 | rm docs/CONTRIBUTING.md docs/README.md 24 | cp -f CONTRIBUTING.md docs/ 25 | cp -f README.md docs/ 26 | 27 | - name: setup git 28 | run: | 29 | git config --global user.name Mike 30 | git config --global user.email 
mike@maggy.ai 31 | 32 | - name: mike deploy master 33 | run: mike deploy --push --update-aliases master dev 34 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | stylecheck: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: actions/setup-python@v2 12 | with: 13 | python-version: '3.8' 14 | - name: install deps 15 | run: pip install flake8==3.9.0 black==22.3.0 pre-commit-hooks==2.4.0 16 | 17 | - name: black 18 | run: black --check maggy 19 | 20 | - name: flake8 21 | run: flake8 maggy 22 | 23 | - name: trailing-whitespace-fixer 24 | run: trailing-whitespace-fixer $(find maggy -type f) || exit 1 25 | 26 | - name: end-of-file-fixer 27 | run: end-of-file-fixer $(find maggy -type f) || exit 1 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .vscode 3 | .idea 4 | scripts/ 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | env.bak/ 96 | venv.bak/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | .spyproject 101 | 102 | # Rope project settings 103 | .ropeproject 104 | 105 | # mkdocs documentation 106 | /site 107 | 108 | # mypy 109 | .mypy_cache/ 110 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: (^setup.py|^maggy/tests/|^docs/) 2 | repos: 3 | - repo: https://github.com/psf/black 4 | rev: 22.3.0 5 | hooks: 6 | - id: black 7 | language_version: python3 8 | - repo: https://gitlab.com/pycqa/flake8 9 | rev: 3.9.0 10 | hooks: 11 | - id: flake8 12 | language_version: python3 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v2.4.0 15 | hooks: 16 | - id: trailing-whitespace 17 | - id: end-of-file-fixer 18 | 
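19 | # Note: once installed with `pre-commit install`, the hooks above run on every
20 | # `git commit`; to check the whole tree manually, run: pre-commit run --all-files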
-------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Contributions are welcome! Not familiar with the codebase yet? No problem! 4 | There are many ways to contribute to open source projects: reporting bugs, 5 | helping with the documentation, spreading the word and of course, adding 6 | new features and patches. 7 | 8 | ## Reporting issues 9 | 10 | - Describe what you expected to happen. 11 | - If possible, include a [minimal, complete, and verifiable example](https://stackoverflow.com/help/mcve) to help 12 | us identify the issue. This also helps to check that the issue is not with 13 | your own code. 14 | - Describe what actually happened. Include the full traceback if there was an 15 | exception. 16 | - List your Python, Hopsworks and Maggy versions. If possible, check if this 17 | issue is already fixed in the repository. 18 | 19 | ## Contributing Code 20 | 21 | Code contributions, in the form of patches or features are welcome. In order to 22 | start developing, please follow the instructions below, to enable [pre-commit](https://pre-commit.com/) and 23 | ensure style and codechecks. 24 | 25 | ### Python Setup 26 | 27 | - Fork Maggy to your GitHub account by clicking the `Fork` button. 28 | 29 | - Clone your fork locally: 30 | 31 | ```bash 32 | git clone https://github.com/[username]/maggy.git 33 | cd maggy 34 | ``` 35 | 36 | - Add the upstream repository as a remote to update later:: 37 | 38 | ```bash 39 | git remote add upstream https://github.com/logicalclocks/maggy.git 40 | git fetch upstream 41 | ``` 42 | 43 | - Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda: 44 | 45 | ```bash 46 | python3 -m venv env 47 | . env/bin/activate 48 | # or "env\Scripts\activate" on Windows 49 | ``` 50 | 51 | or with conda: 52 | 53 | ```bash 54 | conda create --name maggy python=3.8 55 | conda activate maggy 56 | ``` 57 | 58 | verify your python version - we are using Python 3.8: 59 | 60 | ```bash 61 | python --version 62 | ``` 63 | 64 | - Install Maggy in editable mode with development dependencies:: 65 | 66 | ```bash 67 | pip install -e ".[dev]" 68 | ``` 69 | 70 | - Install pre-commit_ and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. Maggy uses pre-commit to ensure code-style and code formatting through [black](https://github.com/psf/black) and [flake8](https://gitlab.com/pycqa/flake8): 71 | 72 | ```bash 73 | pip install --user pre-commit 74 | pre-commit install 75 | ``` 76 | 77 | Afterwards, pre-commit will run whenever you commit. 78 | 79 | - To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use black and flake8, or run them via the command line: 80 | 81 | ```bash 82 | flake8 maggy 83 | black maggy 84 | ``` 85 | 86 | ### Start coding 87 | 88 | - Create a branch to identify the issue or feature you would like to work on. 89 | - Using your favorite editor, make your changes, committing as you go. 90 | - Follow [PEP8](https://pep8.org/). 91 | - Push your commits to GitHub and [create a pull request](https://help.github.com/articles/creating-a-pull-request/). 92 | - Celebrate 🎉 93 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 | [Badge links: Hopsworks Community, Maggy Documentation, PyPiStatus, Downloads, CodeStyle, License] 32 | 

33 | 34 | Maggy is a framework for **distribution transparent** machine learning experiments on [Apache Spark](https://spark.apache.org/). 35 | In this post, we introduce a new unified framework for writing core ML training logic as **oblivious training functions**. 36 | Maggy enables you to reuse the same training code whether training small models on your laptop or reusing the same code to scale out hyperparameter tuning or distributed deep learning on a cluster. 37 | Maggy enables the replacement of the current waterfall development process for distributed ML applications, where code is rewritten at every stage to account for the different distribution context. 38 | 39 |

40 |

41 | 42 | Maggy 43 | 44 |
Maggy uses the same distribution transparent training function in all steps of the machine learning development process.
45 |
46 |

47 | 48 | ## Quick Start 49 | 50 | Maggy uses PySpark as an engine to distribute the training processes. To get started, install Maggy in the Python environment used by your Spark Cluster, or install Maggy in your local Python environment with the `'spark'` extra, to run on Spark in local mode: 51 | 52 | ```python 53 | pip install maggy 54 | ``` 55 | 56 | The programming model consists of wrapping the code containing the model training 57 | inside a function. Inside that wrapper function provide all imports and 58 | parts that make up your experiment. 59 | 60 | Single run experiment: 61 | 62 | ```python 63 | def train_fn(): 64 | # This is your training iteration loop 65 | for i in range(number_iterations): 66 | ... 67 | # add the maggy reporter to report the metric to be optimized 68 | reporter.broadcast(metric=accuracy) 69 | ... 70 | # Return metric to be optimized or any metric to be logged 71 | return accuracy 72 | 73 | from maggy import experiment 74 | result = experiment.lagom(train_fn=train_fn, name='MNIST') 75 | ``` 76 | 77 | **lagom** is a Swedish word meaning "just the right amount". This is how MAggy 78 | uses your resources. 79 | 80 | 81 | ## Documentation 82 | 83 | Full documentation is available at [maggy.ai](https://maggy.ai/) 84 | 85 | ## Contributing 86 | 87 | There are various ways to contribute, and any contribution is welcome, please follow the 88 | CONTRIBUTING guide to get started. 89 | 90 | ## Issues 91 | 92 | Issues can be reported on the official [GitHub repo](https://github.com/logicalclocks/maggy/issues) of Maggy. 93 | 94 | ## Citation 95 | 96 | Please see our publications on [maggy.ai](https://maggy.ai/publications) to find out how to cite our work. 97 | 98 | ## Acknowledgements 99 | 100 | The development of Maggy is supported by the EU H2020 Deep Cube Project (Grant agreement ID: 101004188). 101 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Contributions are welcome! Not familiar with the codebase yet? No problem! 4 | There are many ways to contribute to open source projects: reporting bugs, 5 | helping with the documentation, spreading the word and of course, adding 6 | new features and patches. 7 | 8 | ## Reporting issues 9 | 10 | - Describe what you expected to happen. 11 | - If possible, include a [minimal, complete, and verifiable example](https://stackoverflow.com/help/mcve) to help 12 | us identify the issue. This also helps to check that the issue is not with 13 | your own code. 14 | - Describe what actually happened. Include the full traceback if there was an 15 | exception. 16 | - List your Python, Hopsworks and Maggy versions. If possible, check if this 17 | issue is already fixed in the repository. 18 | 19 | ## Contributing Code 20 | 21 | Code contributions, in the form of patches or features are welcome. In order to 22 | start developing, please follow the instructions below, to enable [pre-commit](https://pre-commit.com/) and 23 | ensure style and codechecks. 24 | 25 | ### Python Setup 26 | 27 | - Fork Maggy to your GitHub account by clicking the `Fork` button. 
28 | 29 | - Clone your fork locally: 30 | 31 | ```bash 32 | git clone https://github.com/[username]/maggy.git 33 | cd maggy 34 | ``` 35 | 36 | - Add the upstream repository as a remote to update later:: 37 | 38 | ```bash 39 | git remote add upstream https://github.com/logicalclocks/maggy.git 40 | git fetch upstream 41 | ``` 42 | 43 | - Create a new Python environment with your favourite environment manager, e.g. virtualenv or conda: 44 | 45 | ```bash 46 | python3 -m venv env 47 | . env/bin/activate 48 | # or "env\Scripts\activate" on Windows 49 | ``` 50 | 51 | or with conda: 52 | 53 | ```bash 54 | conda create --name maggy python=3.8 55 | conda activate maggy 56 | ``` 57 | 58 | verify your python version - we are using Python 3.8: 59 | 60 | ```bash 61 | python --version 62 | ``` 63 | 64 | - Install Maggy in editable mode with development dependencies:: 65 | 66 | ```bash 67 | pip install -e ".[dev]" 68 | ``` 69 | 70 | - Install pre-commit_ and then activate its hooks. pre-commit is a framework for managing and maintaining multi-language pre-commit hooks. Maggy uses pre-commit to ensure code-style and code formatting through [black](https://github.com/psf/black) and [flake8](https://gitlab.com/pycqa/flake8): 71 | 72 | ```bash 73 | pip install --user pre-commit 74 | pre-commit install 75 | ``` 76 | 77 | Afterwards, pre-commit will run whenever you commit. 78 | 79 | - To run formatting and code-style separately, you can configure your IDE, such as VSCode, to use black and flake8, or run them via the command line: 80 | 81 | ```bash 82 | flake8 maggy 83 | black maggy 84 | ``` 85 | 86 | ### Start coding 87 | 88 | - Create a branch to identify the issue or feature you would like to work on. 89 | - Using your favorite editor, make your changes, committing as you go. 90 | - Follow [PEP8](https://pep8.org/). 91 | - Push your commits to GitHub and [create a pull request](https://help.github.com/articles/creating-a-pull-request/). 92 | - Celebrate 🎉 93 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 | [Badge links: Hopsworks Community, Maggy Documentation, PyPiStatus, Downloads, CodeStyle, License] 32 | 

33 | 34 | Maggy is a framework for **distribution transparent** machine learning experiments on [Apache Spark](https://spark.apache.org/). 35 | In this post, we introduce a new unified framework for writing core ML training logic as **oblivious training functions**. 36 | Maggy enables you to reuse the same training code whether training small models on your laptop or reusing the same code to scale out hyperparameter tuning or distributed deep learning on a cluster. 37 | Maggy enables the replacement of the current waterfall development process for distributed ML applications, where code is rewritten at every stage to account for the different distribution context. 38 | 39 |

40 |

41 | 42 | Maggy 43 | 44 |
Maggy uses the same distribution transparent training function in all steps of the machine learning development process.
45 |
46 |

47 | 48 | ## Quick Start 49 | 50 | Maggy uses PySpark as an engine to distribute the training processes. To get started, install Maggy in the Python environment used by your Spark Cluster, or install Maggy in your local Python environment with the `'spark'` extra, to run on Spark in local mode: 51 | 52 | ```python 53 | pip install maggy 54 | ``` 55 | 56 | The programming model consists of wrapping the code containing the model training 57 | inside a function. Inside that wrapper function provide all imports and 58 | parts that make up your experiment. 59 | 60 | Single run experiment: 61 | 62 | ```python 63 | def train_fn(): 64 | # This is your training iteration loop 65 | for i in range(number_iterations): 66 | ... 67 | # add the maggy reporter to report the metric to be optimized 68 | reporter.broadcast(metric=accuracy) 69 | ... 70 | # Return metric to be optimized or any metric to be logged 71 | return accuracy 72 | 73 | from maggy import experiment 74 | result = experiment.lagom(train_fn=train_fn, name='MNIST') 75 | ``` 76 | 77 | **lagom** is a Swedish word meaning "just the right amount". This is how MAggy 78 | uses your resources. 79 | 80 | 81 | ## Documentation 82 | 83 | Full documentation is available [here](https://maggy.readthedocs.io/en/latest/). 84 | 85 | ## Contributing 86 | 87 | There are various ways to contribute, and any contribution is welcome, please follow the 88 | CONTRIBUTING guide to get started. 89 | 90 | ## Issues 91 | 92 | Issues can be reported on the official [GitHub repo](https://github.com/logicalclocks/maggy/issues) of Maggy. 93 | 94 | ## Citation 95 | 96 | Please see our publications on [maggy.ai](https://maggy.ai/publications) to find out how to cite our work. 97 | 98 | ## Acknowledgements 99 | 100 | The development of Maggy is supported by the EU H2020 Deep Cube Project (Grant agreement ID: 101004188). 
101 | -------------------------------------------------------------------------------- /docs/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --md-primary-fg-color: #F15A24; 3 | --md-secondary-fg-color: #333333; 4 | } 5 | 6 | .md-header__button.md-logo { 7 | margin: .1rem; 8 | padding: .1rem; 9 | } 10 | 11 | .md-header__button.md-logo img, .md-header__button.md-logo svg { 12 | display: block; 13 | width: 5.2rem; 14 | height: 2rem; 15 | fill: currentColor; 16 | } 17 | 18 | .md-tabs { 19 | width: 100%; 20 | overflow: auto; 21 | color: var(--md-primary-bg-color); 22 | background-color: var(--md-secondary-fg-color); 23 | transition: background-color 250ms; 24 | } 25 | -------------------------------------------------------------------------------- /docs/assets/css/version-select.css: -------------------------------------------------------------------------------- 1 | @media only screen and (max-width:76.1875em) { 2 | } 3 | 4 | #version-selector select.form-control { 5 | appearance: none; 6 | -webkit-appearance: none; 7 | -moz-appearance: none; 8 | 9 | background-color: #F5F5F5; 10 | 11 | background-position: center right; 12 | background-repeat: no-repeat; 13 | border: 0px; 14 | border-radius: 2px; 15 | /* box-shadow: 0px 1px 3px rgb(0 0 0 / 10%); */ 16 | color: inherit; 17 | width: -webkit-fill-available; 18 | width: -moz-available; 19 | max-width: 200px; 20 | font-size: inherit; 21 | /* font-weight: 600; */ 22 | margin: 10px; 23 | overflow: hidden; 24 | padding: 7px 10px; 25 | text-overflow: ellipsis; 26 | white-space: nowrap; 27 | } 28 | 29 | #version-selector::after { 30 | content: '⌄'; 31 | font-family: inherit; 32 | font-size: 22px; 33 | margin: -35px; 34 | vertical-align: 7%; 35 | padding-bottom: 10px; 36 | } 37 | -------------------------------------------------------------------------------- /docs/assets/images/databricks_installation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/databricks_installation.png -------------------------------------------------------------------------------- /docs/assets/images/firstgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/firstgraph.png -------------------------------------------------------------------------------- /docs/assets/images/hopsworks_installation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/hopsworks_installation.png -------------------------------------------------------------------------------- /docs/assets/images/maggy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy.png -------------------------------------------------------------------------------- /docs/assets/images/maggy_dt_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy_dt_video.png 
-------------------------------------------------------------------------------- /docs/assets/images/maggy_hpo_video.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggy_hpo_video.png -------------------------------------------------------------------------------- /docs/assets/images/maggyfav.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/maggyfav.png -------------------------------------------------------------------------------- /docs/assets/images/scdgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/assets/images/scdgraph.png -------------------------------------------------------------------------------- /docs/assets/images/whitemaggy-eye.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 | 17 | 19 | 21 | 22 | 24 | 26 | 27 | 28 | 43 | 44 | 45 | 46 | 47 | 49 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /docs/assets/javascript/version-select.js: -------------------------------------------------------------------------------- 1 | window.addEventListener("DOMContentLoaded", function() { 2 | // This is a bit hacky. Figure out the base URL from a known CSS file the 3 | // template refers to... 4 | var ex = new RegExp("/?assets/css/version-select.css$"); 5 | var sheet = document.querySelector('link[href$="version-select.css"]'); 6 | 7 | var ABS_BASE_URL = sheet.href.replace(ex, ""); 8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); 9 | 10 | function makeSelect(options, selected) { 11 | var select = document.createElement("select"); 12 | select.classList.add("form-control"); 13 | 14 | options.forEach(function(i) { 15 | var option = new Option(i.text, i.value, undefined, 16 | i.value === selected); 17 | select.add(option); 18 | }); 19 | 20 | return select; 21 | } 22 | 23 | var xhr = new XMLHttpRequest(); 24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json"); 25 | xhr.onload = function() { 26 | var versions = JSON.parse(this.responseText); 27 | 28 | var realVersion = versions.find(function(i) { 29 | return i.version === CURRENT_VERSION || 30 | i.aliases.includes(CURRENT_VERSION); 31 | }).version; 32 | 33 | var select = makeSelect(versions.map(function(i) { 34 | if (i.aliases.length > 0) { 35 | var aliasString = " [" + i.aliases.join(", ") + "]"; 36 | } else { 37 | var aliasString = ""; 38 | } 39 | return {text: i.title + aliasString, value: i.version}; 40 | }), realVersion); 41 | select.addEventListener("change", function(event) { 42 | window.location.href = ABS_BASE_URL + "/../" + this.value; 43 | }); 44 | 45 | var container = document.createElement("div"); 46 | container.id = "version-selector"; 47 | // container.className = "md-nav__item"; 48 | container.appendChild(select); 49 | 50 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); 51 | sidebar.parentNode.insertBefore(container, sidebar.nextSibling); 52 | }; 53 | xhr.send(); 54 | }); 55 | 
-------------------------------------------------------------------------------- /docs/blogs.md: -------------------------------------------------------------------------------- 1 | # Blogs 2 | -------------------------------------------------------------------------------- /docs/dist_training/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | Distributed training is useful for big models that can't fit in a single machine and for very big datasets. 4 | There are several techniques available. As an example, the Mirrored Strategies replicate the models over the workers and 5 | train them using splits of the data. 6 | 7 | With Maggy, you can train a Machine Learning model in a distributed fashion without rewriting the code of the training. 8 | Distributed Training with Maggy is available on TensorFlow and PyTorch. 9 | 10 | If you want to know more on how to use Maggy for Distributed Training, you can watch the presentation in the next section. 11 | 12 | When you are ready, you can inspect an example on [TensorFlow](tensorflow.md) or [PyTorch](torch.md). 13 | 14 | ## Maggy Distributed Model Training 15 | [![Maggy Distributed Model Training](../assets/images/maggy_dt_video.png)]( 16 | https://www.youtube.com/watch?v=1SHOwl37I5c) 17 | -------------------------------------------------------------------------------- /docs/dist_training/tensorflow.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Using maggy for Distributed Training works as follows: 4 | 5 | * Optionally, define a model generator object, similarly to what is done for Ablation Studies. 6 | ```py 7 | class MyModel(tf.keras.Model): 8 | 9 | def __init__(self, ...): 10 | super().__init__() 11 | ... 12 | 13 | def call(self, ...): 14 | ... 15 | 16 | ... 17 | ``` 18 | * Optionally, define your train and test datasets, these will be sharded by Maggy. 19 | ```py 20 | # Extract the data 21 | (x_train, y_train),(x_test, y_test) = split_dataset(dataset) 22 | 23 | # Do some preprocessing operations 24 | ... 25 | ``` 26 | * Define a training function containing the training logic. 27 | ```py 28 | def training_function(model, train_set, test_set, hparams): 29 | #training and testing logic 30 | ... 31 | ``` 32 | 33 | * Create the configuration object and run the optimization. 34 | ```py 35 | config = TfDistributedConfig(name="tf_test", 36 | model=model, 37 | train_set=(x_train, y_train), 38 | test_set=(x_test, y_test), 39 | hparams=model_parameters), 40 | ... 41 | ) 42 | 43 | experiment.lagom(train_fn=training_function, config=config) 44 | ``` 45 | There are many parameters for the configuration object: 46 | * model: A tf.keras.Model superclass or list of them. 47 | Note that this has to be the class itself, not an instance. 48 | * train_set: The training set for the training function. If you want to load the set 49 | inside the training function, this can be disregarded. 50 | * test_set: The test set for the training function. If you want to load the set 51 | inside the training function, this can be disregarded. 52 | * process_data: The function for processing the data 53 | * hparams: model parameters that should be used during model initialization. Primarily 54 | used to give an interface for hp optimization. 55 | * name: Experiment name. 56 | * hb_interval: Heartbeat interval with which the server is polling. 57 | * description: A description of the experiment. 
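58 | 
59 | Putting these pieces together, the following is a minimal end-to-end sketch rather than a definitive recipe: the Keras model, the MNIST preprocessing and the `nlayers` hyperparameter are illustrative assumptions, and the sketch assumes the training function instantiates the model class it receives with the supplied `hparams`.
60 | ```py
61 | import tensorflow as tf
62 | 
63 | from maggy import experiment
64 | from maggy.config import TfDistributedConfig
65 | 
66 | 
67 | class MyModel(tf.keras.Model):
68 |     def __init__(self, nlayers=2):
69 |         super().__init__()
70 |         self.hidden = [tf.keras.layers.Dense(64, activation="relu") for _ in range(nlayers)]
71 |         self.out = tf.keras.layers.Dense(10)
72 | 
73 |     def call(self, inputs):
74 |         x = inputs
75 |         for layer in self.hidden:
76 |             x = layer(x)
77 |         return self.out(x)
78 | 
79 | 
80 | def training_function(model, train_set, test_set, hparams):
81 |     # Assumption: maggy hands the model class through unchanged, so build an
82 |     # instance with the hyperparameters from the config.
83 |     keras_model = model(**hparams)
84 |     keras_model.compile(
85 |         optimizer="adam",
86 |         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
87 |         metrics=["accuracy"],
88 |     )
89 |     x_train, y_train = train_set
90 |     x_test, y_test = test_set
91 |     keras_model.fit(x_train, y_train, epochs=2, batch_size=64)
92 |     _, accuracy = keras_model.evaluate(x_test, y_test)
93 |     # Return the metric to be logged for this run
94 |     return accuracy
95 | 
96 | 
97 | # Illustrative preprocessing: flatten and normalize MNIST images
98 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
99 | x_train = x_train.reshape(-1, 784).astype("float32") / 255.0
100 | x_test = x_test.reshape(-1, 784).astype("float32") / 255.0
101 | 
102 | config = TfDistributedConfig(
103 |     name="tf_test",
104 |     model=MyModel,
105 |     train_set=(x_train, y_train),
106 |     test_set=(x_test, y_test),
107 |     hparams={"nlayers": 2},
108 | )
109 | 
110 | result = experiment.lagom(train_fn=training_function, config=config)
111 | ```
112 | 
113 | Maggy shards the supplied train and test sets across the workers, so the same training function can run unchanged whether it is executed locally or on a cluster.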
-------------------------------------------------------------------------------- /docs/dist_training/torch.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Maggy enables you to train with Microsoft’s DeepSpeed ZeRO optimizer. Since DeepSpeed does not follow the common 4 | PyTorch programming model, Maggy is unable to provide full distribution transparency to the user. 5 | This means that if you want to use DeepSpeed for your training, you will have to make small changes 6 | to your code. In this notebook, we will show you what exactly you have to change in order to make 7 | DeepSpeed run with Maggy. 8 | 9 | * First off, we have to define our model as we did for TensorFlow and Ablation studies. 10 | ```py 11 | class MyModel(torch.nn.Module): 12 | 13 | def __init__(self, ...): 14 | super().__init__(...) 15 | ... 16 | 17 | def forward(self, ...): 18 | ... 19 | ``` 20 | 21 | * There are a few minor changes that have to be done in order to train with DeepSpeed: - There is no need for an 22 | optimizer anymore. You can configure your optimizer later in the DeepSpeed config. - DeepSpeed’s ZeRO requires you to 23 | use FP16 training. Therefore, convert your data to half precision! - The backward call is not executed on the loss, 24 | but on the model (```model.backward(loss)``` instead of ```loss.backward()```). - 25 | The step call is not executed on the optimizer, 26 | but also on the model (```model.step()``` instead of ```optimizer.step()```). - 27 | As we have no optimizer anymore, there is also 28 | no need to call ```optimizer.zero_grad()```. 29 | You do not have to worry about the implementation of these calls, 30 | Maggy configures your model at runtime to act as a DeepSpeed engine. 31 | ```py 32 | def train_fn(...): 33 | ... 34 | ``` 35 | 36 | * In order to use DeepSpeed’s ZeRO, the deepspeed backend has to be chosen. This 37 | backend also requires its own config. You can read a full specification of the possible settings 38 | [here](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training). 39 | ```py 40 | ds_config = {"train_micro_batch_size_per_gpu": 1, 41 | "gradient_accumulation_steps": 1, 42 | "optimizer": {"type": "Adam", "params": {"lr": 0.1}}, 43 | "fp16": {"enabled": True}, 44 | "zero_optimization": {"stage": 2}, 45 | } 46 | 47 | config = TorchDistributedConfig(module=MyModel, backend="deepspeed", deepspeed_config=ds_config, ...) 48 | ``` 49 | 50 | * Start the training with ```lagom()``` 51 | ```py 52 | result = experiment.lagom(train_fn, config) 53 | ``` -------------------------------------------------------------------------------- /docs/hpo/intro.md: -------------------------------------------------------------------------------- 1 | 2 | # Introduction 3 | 4 | Maggy is a framework for asynchronous trials and early-stopping with global knowledge, guided by an Optimizer. 5 | Developers can use an existing Optimizer, such as asynchronous successive halving (ASHA), or provide their own one. 6 | The basic approach we followed was to add support for the Driver and Executors to communicate via RPCs. 7 | The Optimizer that guides hyperparameter search is located on the Driver, and it assigns trials to Executors. 8 | Executors periodically send back to the Driver the current performance of their trial, 9 | and the Optimizer can decide to early-stop its ongoing trial, followed by sending the Executor with a new trial. 
10 | Because of the impedance mismatch between trials and the stage-/task-based execution model of Spark, 11 | we are blocking Executors with long-running tasks to run multiple trials per task. 12 | In this way, Executors are always kept busy running trials, and global information needed for efficient 13 | early-stopping is aggregated in the Optimizer. 14 | If you want to know more about Maggy for Hyperparameter Optimization (HPO), you can watch the presentation in the video posted below. 15 | Otherwise, if you feel ready to explore more details, you can jump to the [strategies](strategies.md) section. 16 | 17 | 18 | ## Spark/AI summit presentation of Maggy for HPO 19 | [![Maggy Parallel Hyperparameter Optimization](../assets/images/maggy_hpo_video.png)](https://www.youtube.com/watch?v=0Hd1iYEL03w) 20 | -------------------------------------------------------------------------------- /docs/hpo/strategies.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Using maggy for Hyperparameter Optimization (HPO) works as follows: 4 | 5 | * Define a training function containing the training logic. 6 | ```py 7 | def training_function(model, train_set, test_set, hparams): 8 | #training and testing logic 9 | ... 10 | ``` 11 | 12 | * Define a search space, containing the hparams we want to optimize, their type and range. 13 | ```py 14 | #define the hyperparemeters to optimize, together with their possible values 15 | sp = Searchspace(kernel=('DISCRETE', [2, 8]), pool=('DISCRETE', [2, 8]), dropout=('DISCRETE', [0.01, 0.99])) 16 | ``` 17 | 18 | * Create the configuration object and run the optimization. 19 | ```py 20 | config = OptimizationConfig(num_trials=4, 21 | optimizer="randomsearch", 22 | searchspace=sp, 23 | direction="max", 24 | es_interval=1, 25 | es_min=5, 26 | name="hp_tuning_test") 27 | 28 | experiment.lagom(train_fn=training_function, config=config) 29 | ``` 30 | There are many parameters for the configuration object: 31 | * num_trials: Controls how many seperate runs are conducted during the hp search. 32 | * optimizer: Optimizer type for searching the hp searchspace. 33 | * searchspace: A Searchspace object configuring the names, types and ranges of hps. 34 | * optimization_key: Name of the metric to use for hp search evaluation. 35 | * direction: Direction of optimization. 36 | * es_interval: Early stopping polling frequency during an experiment run. 37 | * es_min: Minimum number of experiments to conduct before starting the early stopping 38 | mechanism. Useful to establish a baseline for performance estimates. 39 | * es_policy: Early stopping policy which formulates a rule for triggering aborts. 40 | * name: Experiment name. 41 | * description: A description of the experiment. 42 | * hb_interval: Heartbeat interval with which the server is polling. 43 | * model: The class of the model to be used in the training function. 44 | * train_set: The train_set to be used in the training function. 45 | * test_set: The test_set to be used in the training function. 46 | 47 | # Strategies 48 | 49 | ### Random Search 50 | 51 | With Random Search, the hparams are selected randomly within the search space defined. The search space is defined 52 | depending on how many trials (_num_trials_) you choose. 53 | 54 | In the following example, _num_trials_ is set to 4, therefore, Maggy will choose randomly 4 combinations of kernel, 55 | pool and dropout values. 56 | ```py 57 | def training_function(hparams): 58 | #training and testing logic 59 | ... 
60 | #define the hyperparemeters to optimize, together with their possible values 61 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 62 | 63 | config = OptimizationConfig(num_trials=4, 64 | optimizer="randomsearch", 65 | searchspace=sp, 66 | direction="max", 67 | es_interval=1, 68 | es_min=5, 69 | name="hp_tuning_test") 70 | 71 | #run optimization 72 | result = experiment.lagom(train_fn=training_function, config=config) 73 | ``` 74 | ### Grid Search 75 | 76 | ```py 77 | def training_function(): 78 | #training and testing logic 79 | ... 80 | #define the hyperparemeters to optimize, together with their possible values 81 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 82 | 83 | config = OptimizationConfig(num_trials=4, 84 | optimizer="gridsearch", 85 | searchspace=sp, 86 | direction="max", 87 | es_interval=1, 88 | es_min=5, 89 | name="hp_tuning_test") 90 | 91 | #run optimization 92 | result = experiment.lagom(train_fn=training_function, config=config) 93 | ``` 94 | 95 | ### Asynchronous Successive Halving Algorithm (ASHA) 96 | 97 | This strategy is a combination of random search and early stopping. 98 | ASHA tackles large-scale hyperparameter optimization problems, and it is especially useful for challenges that need a 99 | high number of parallelism (i.e. there are a lot of hparams and a lot of workers are available). 100 | 101 | ```py 102 | def training_function(): 103 | #training and testing logic 104 | ... 105 | #define the hyperparemeters to optimize, together with their possible values 106 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 107 | 108 | config = OptimizationConfig(num_trials=4, 109 | optimizer='asha', 110 | searchspace=sp, 111 | direction="max", 112 | es_interval=1, 113 | es_min=5, 114 | name="hp_tuning_test") 115 | 116 | experiment.lagom(train_fn=training_function, config=config) 117 | ``` 118 | 119 | you can define custom ASHA optimizers by setting 3 parameters: _reduction_factor, resource_min_ and _resource_max_. 120 | The standard values are 2, 1, and 4, respectively. 121 | To use custom values, import the class _Asha_ from _maggy.optimizer_ and create the object with custom 122 | parameters. 123 | 124 | ```py 125 | from maggy.optimizer import Asha 126 | 127 | asha = Asha(3,1,10) 128 | config = OptimizationConfig(..., 129 | optimizer=asha, 130 | ...) 131 | ``` 132 | 133 | ### Bayesian Optimization 134 | 135 | WIth Bayesian Optimization (BO), the hparams are chosen based on the space of the hparams. 136 | In order to do that, the algorithm infer a function of the hparams in order to optimize the cost function of a given model. 137 | 138 | There are two different BO methods available in Maggy, namely Gaussian Process (GP) and Tree Parzen Estimators (TPE). 139 | The GP is a tool used to infer the value of a function in which predictions follow a normal distribution. 140 | We use that set of predictions and pick new points where we should evaluate next. From that new point, we add it to 141 | the samples and re-build the Gaussian Process with that new information… 142 | We keep doing this until we reach the maximal number of iterations or the limit time for example. 143 | TPE is an iterative process that uses history of evaluated hyperparameters to create probabilistic model, 144 | which is used to suggest next set of hyperparameters to evaluate. 
145 | 146 | 147 | ```py 148 | def training_function(): 149 | #training and testing logic 150 | ... 151 | #define the hyperparemeters to optimize, together with their possible values 152 | sp = Searchspace(kernel=('INTEGER', [2, 8]), pool=('INTEGER', [2, 8]), dropout=('DOUBLE', [0.01, 0.99])) 153 | 154 | config = OptimizationConfig(num_trials=4, 155 | optimizer='gp', #or 'tpe' 156 | searchspace=sp, 157 | direction="max", 158 | es_interval=1, 159 | es_min=5, 160 | name="hp_tuning_test") 161 | 162 | experiment.lagom(train_fn=training_function, config=config) 163 | ``` -------------------------------------------------------------------------------- /docs/publications.md: -------------------------------------------------------------------------------- 1 | # Publications 2 | 3 | If you use Maggy for research, or write about Maggy please cite the following papers: 4 | 5 | ## Maggy Hyperparameter Optimization 6 | 7 | ### Maggy: Scalable Asynchronous Parallel Hyperparameter Search 8 | 9 | #### Authors 10 | 11 | Moritz Meister, Sina Sheikholeslami, Amir H. Payberah, Vladimir Vlassov, Jim Dowling 12 | 13 | #### Abstract 14 | 15 | Running extensive experiments is essential for building Machine Learning (ML) models. Such experiments usually require iterative execution of many trials with varying run times. In recent years, Apache Spark has become the de-facto standard for parallel data processing in the industry, in which iterative processes are im- plemented within the bulk-synchronous parallel (BSP) execution model. The BSP approach is also being used to parallelize ML trials in Spark. However, the BSP task synchronization barriers prevent asynchronous execution of trials, which leads to a reduced number of trials that can be run on a given computational budget. In this paper, we introduce Maggy, an open-source framework based on Spark, to execute ML trials asynchronously in parallel, with the ability to early stop poorly performing trials. In the experiments, we compare Maggy with the BSP execution of parallel trials in Spark and show that on random hyperparameter search on a con- volutional neural network for the Fashion-MNIST dataset Maggy reduces the required time to execute a fixed number of trials by 33% to 58%, without any loss in the final model accuracy. 16 | 17 | [Download Paper](https://content.logicalclocks.com/hubfs/Maggy%20Scalable%20Asynchronous%20Parallel%20Hyperparameter%20Search.pdf) 18 | 19 | #### Cite 20 | 21 | ``` 22 | @inproceedings{10.1145/3426745.3431338, 23 | author = {Meister, Moritz and Sheikholeslami, Sina and Payberah, Amir H. and Vlassov, Vladimir and Dowling, Jim}, 24 | title = {Maggy: Scalable Asynchronous Parallel Hyperparameter Search}, 25 | year = {2020}, 26 | isbn = {9781450381826}, 27 | publisher = {Association for Computing Machinery}, 28 | address = {New York, NY, USA}, 29 | url = {https://doi.org/10.1145/3426745.3431338}, 30 | doi = {10.1145/3426745.3431338}, 31 | booktitle = {Proceedings of the 1st Workshop on Distributed Machine Learning}, 32 | pages = {28–33}, 33 | numpages = {6}, 34 | keywords = {Scalable Hyperparameter Search, Machine Learning, Asynchronous Hyperparameter Optimization}, 35 | location = {Barcelona, Spain}, 36 | series = {DistributedML'20} 37 | } 38 | ``` 39 | 40 | ## Oblivious Training Functions 41 | 42 | ### Towards Distribution Transparency for Supervised ML With Oblivious Training Functions 43 | 44 | #### Authors 45 | 46 | Moritz Meister, Sina Sheikholeslami, Robin Andersson, Alexandru A. 
Ormenisan, Jim Dowling 47 | 48 | #### Abstract 49 | 50 | Building and productionizing Machine Learning (ML) models is a process of interdependent steps of iterative code updates, including exploratory model design, hyperparameter tuning, ablation experiments, and model training. Industrial-strength ML involves doing this at scale, using many compute resources, and this requires rewriting the training code to account for distribution. The result is that moving from a single host program to a cluster hinders iterative development of the software, as iterative development would require multiple versions of the software to be maintained and kept consistent. In this paper, we introduce the distribution oblivious training function as an abstraction for ML development in Python, whereby developers can reuse the same training function when running a notebook on a laptop or performing scale-out hyperparameter search and distributed training on clusters. Programs written in our framework look like industry-standard ML programs as we factor out dependencies using best-practice programming idioms (such as functions to generate models and data batches). We believe that our approach takes a step towards unifying single-host and distributed ML development. 51 | 52 | [Download Paper](https://content.logicalclocks.com/hubfs/research/oblivious-training_mlsys20.pdf) 53 | 54 | #### Cite 55 | 56 | ``` 57 | @inproceedings{oblivious-mlops, 58 | author = {Meister, Moritz and Sheikholeslami, Sina and Andersson, Robin and Ormenisan, Alexandru A. and Dowling, Jim}, 59 | title = {Towards Distribution Transparency for Supervised ML With Oblivious Training Functions}, 60 | year = {2020}, 61 | booktitle = {MLSys ’20: Workshop on MLOps Systems, March 02–04}, 62 | location = {Austin, Texas, USA} 63 | } 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/releases.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/logicalclocks/maggy/8f501586d1fea2f41a8fee369586c292e06d4e6a/docs/releases.md -------------------------------------------------------------------------------- /docs/start/install.md: -------------------------------------------------------------------------------- 1 | # Installing Maggy in your laptop 2 | 3 | Maggy is available via pip. 4 | 5 | Simply run the following commnad in your terminal or conda environment. 6 | 7 | ``` 8 | pip install maggy 9 | ``` 10 | 11 | If you want to use another version of Maggy, you can run the following command. 12 | 13 | ``` 14 | pip install maggy==x.y.z 15 | ``` 16 | 17 | The available versions are listed in PyPi https://pypi.org/project/maggy/ . 18 | 19 | # Installing Maggy in Hopsworks 20 | 21 | If you are using Hopsworks, Maggy should be already installed and ready to be used. 22 | 23 | However, it is possible to check the installation from the platform by entering a project, 24 | then navigate to the "Python" section from the sidebar and click on "Manage Environment" 25 | on the top bar. Finally, search for "Maggy". 26 | 27 | If you want to change the version of Maggy, click on "Install" in the top bar and type "Maggy" 28 | on the search input. Finally, select the version you want to install and click. 29 | The progress of the installation is displayed in the "Ongoing Operations" section. 
30 | 31 | ![](../assets/images/hopsworks_installation.png) 32 | 33 | 34 | 35 | # Installing Maggy in Databricks 36 | 37 | 38 | It is very simple to install Maggy in your Databricks cluster. 39 | From your project, click on Libraries in the navigation bar and Install New, at this point it is possible to install 40 | the latest release of Maggy in the PyPi section. In order to do that, just write "maggy" in the Package input section. 41 | 42 | You can install other version of Maggy by uploading the wheel on the Upload section. 43 | 44 | ![It is easy to install Maggy in Databricks, just click on Libraries in the navigation bar and then click 45 | on Install New. Finally, write "maggy" on the Package input in the PyPi section. 46 | ](../assets/images/databricks_installation.png "Maggy Installation in Databricks") 47 | 48 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Maggy 4 | 5 |

6 | 7 |

8 |

9 | In this folder you will find example notebooks for Maggy on Databricks environments. 10 |

11 |

12 | If you are interested in using Maggy on Hopsworks or in local environments, please check the example notebooks at the following link: 13 | 

14 |

15 | 16 | Maggy Examples 17 | 18 |

19 |

20 | -------------------------------------------------------------------------------- /maggy/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy import searchspace 18 | 19 | Searchspace = searchspace.Searchspace 20 | 21 | __all__ = ["Searchspace"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation import ablationstudy 18 | 19 | AblationStudy = ablationstudy.AblationStudy 20 | 21 | __all__ = ["AblationStudy"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.ablation.ablator import abstractablator 18 | 19 | AbstractAblator = abstractablator.AbstractAblator 20 | 21 | __all__ = ["AbstractAblator"] 22 | -------------------------------------------------------------------------------- /maggy/ablation/ablator/abstractablator.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | 19 | 20 | class AbstractAblator(ABC): 21 | def __init__(self, ablation_study, final_store): 22 | self.ablation_study = ablation_study 23 | self.final_store = final_store 24 | self.trial_buffer = [] 25 | 26 | @abstractmethod 27 | def get_number_of_trials(self): 28 | """ 29 | If applicable, calculate and return the total number of trials of the ablation experiment. 30 | Make sure to also include the base (reference) trial in the count. 31 | 32 | :return: total number of trials of the ablation study experiment 33 | :rtype: int 34 | """ 35 | pass 36 | 37 | @abstractmethod 38 | def get_dataset_generator(self, ablated_feature, dataset_type="tfrecord"): 39 | """ 40 | Create and return a dataset generator function based on the ablation policy to be used in a trial. 41 | The returned function will be executed on the executor per each trial. 42 | 43 | :param ablated_feature: the name of the feature to be excluded from the training dataset. 44 | Must match a feature name in the corresponding feature group in the feature store. 45 | :type ablated_feature: str 46 | :param dataset_type: type of the dataset. For now, we only support 'tfrecord'. 47 | :return: A function that generates a TFRecordDataset 48 | :rtype: function 49 | """ 50 | pass 51 | 52 | @abstractmethod 53 | def get_model_generator(self, ablated_layer): 54 | pass 55 | 56 | @abstractmethod 57 | def initialize(self): 58 | """ 59 | Initialize the ablation study experiment by generating a number of trials. Depending on the ablation policy, 60 | this method might generate all the trials (e.g. as in LOCO), or generate a number of trials to warm-start the 61 | experiment. The trials should be added to `trial_buffer` in form of `Trial` objects. 62 | """ 63 | pass 64 | 65 | @abstractmethod 66 | def get_trial(self, ablation_trial=None): 67 | """ 68 | Return a `Trial` to be assigned to an executor, or `None` if there are no trials remaining in the experiment. 69 | The trial should contain a dataset generator and a model generator. 70 | Depending on the ablator policy, the trials could come from a list (buffer) of pre-made trials, 71 | or generated on the fly. 72 | 73 | :rtype: `Trial` or `None` 74 | """ 75 | pass 76 | 77 | @abstractmethod 78 | def finalize_experiment(self, trials): 79 | """ 80 | This method will be called before finishing the experiment. Developers can implement this method 81 | e.g. for cleanup or extra logging. 82 | """ 83 | pass 84 | 85 | def name(self): 86 | return str(self.__class__.__name__) 87 | -------------------------------------------------------------------------------- /maggy/callbacks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import tensorflow as tf 18 | 19 | 20 | class KerasBatchEnd(tf.keras.callbacks.Callback): 21 | """A Keras callback reporting a specified `metric` at the end of the batch 22 | to the maggy experiment driver. 23 | 24 | `loss` is always available as a metric, and optionally `acc` (if accuracy 25 | monitoring is enabled, that is, accuracy is added to keras model metrics). 26 | Validation metrics are not available for the BatchEnd callback. Validation 27 | after every batch would be too expensive. 28 | Default is training loss (`loss`). 29 | 30 | Example usage: 31 | 32 | >>> from maggy.callbacks import KerasBatchEnd 33 | >>> callbacks = [KerasBatchEnd(reporter, metric='acc')] 34 | """ 35 | 36 | def __init__(self, reporter, metric="loss"): 37 | super().__init__() 38 | self.metric_name = metric 39 | self.reporter = reporter 40 | 41 | def on_batch_end(self, batch, logs={}): 42 | self.reporter.broadcast(logs.get(self.metric_name, 0)) 43 | 44 | 45 | class KerasEpochEnd(tf.keras.callbacks.Callback): 46 | """A Keras callback reporting a specified `metric` at the end of an epoch 47 | to the maggy experiment driver. 48 | 49 | `val_loss` is always available as a metric, and optionally `val_acc` (if 50 | accuracy monitoring is enabled, that is, accuracy is added to keras model 51 | metrics). Training metrics are available under the names `loss` and `acc`. 52 | Default is validation loss (`val_loss`). 53 | 54 | Example usage: 55 | 56 | >>> from maggy.callbacks import KerasBatchEnd 57 | >>> callbacks = [KerasBatchEnd(reporter, metric='val_acc')] 58 | """ 59 | 60 | def __init__(self, reporter, metric="val_loss"): 61 | super().__init__() 62 | self.metric_name = metric 63 | self.reporter = reporter 64 | 65 | def on_epoch_end(self, epoch, logs={}): 66 | self.reporter.broadcast(logs.get(self.metric_name, 0), epoch) 67 | -------------------------------------------------------------------------------- /maggy/config/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.config.lagom import LagomConfig 18 | from maggy.config.base_config import BaseConfig 19 | from maggy.config.ablation import AblationConfig 20 | from maggy.config.hyperparameter_optimization import HyperparameterOptConfig 21 | from maggy.config.torch_distributed import TorchDistributedConfig 22 | from maggy.config.tf_distributed import TfDistributedConfig 23 | 24 | __all__ = [ 25 | "LagomConfig", 26 | "BaseConfig", 27 | "AblationConfig", 28 | "HyperparameterOptConfig", 29 | "TfDistributedConfig", 30 | "TorchDistributedConfig", 31 | ] 32 | -------------------------------------------------------------------------------- /maggy/config/ablation.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from typing import Union, List 20 | 21 | from maggy.ablation.ablationstudy import AblationStudy 22 | from maggy.ablation.ablator import AbstractAblator 23 | from maggy.config import LagomConfig 24 | import tensorflow as tf 25 | from maggy.core import config as mc 26 | 27 | 28 | class AblationConfig(LagomConfig): 29 | """Config class for ablation study experiments.""" 30 | 31 | def __init__( 32 | self, 33 | ablation_study: AblationStudy, 34 | ablator: Union[str, AbstractAblator] = "loco", 35 | direction: str = "max", 36 | name: str = "ablationStudy", 37 | description: str = "", 38 | hb_interval: int = 1, 39 | model: tf.keras.Model = None, 40 | dataset: List[Union[str, tf.data.Dataset]] = None, 41 | ): 42 | """Initializes ablation study experiment parameters. 43 | 44 | :param ablation_study: Ablation study object that defines the entry point into the 45 | experiment. 46 | :param ablator: An instance of `AbstractAblator` or a supported ablator name that controls 47 | the manner in which parts of the model are ablated. 48 | :param direction: Optimization direction to evaluate the experiments. 49 | :param name: Experiment name. 50 | :param description: A description of the experiment. 51 | :param hb_interval: Heartbeat interval with which the server is polling. 52 | :param model: The class of the model to be used in the training function. 53 | :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset. 54 | These datasets represent the ones you are going to use in the training function. 55 | For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and 56 | extract them in the training function. If you want to load the set inside the training function, this can be 57 | disregarded. 
58 | """ 59 | super().__init__(name, description, hb_interval) 60 | mc.initialize() 61 | if not mc.is_spark_available(): 62 | raise NotImplementedError("Ablation Study can run only on a Spark kernel.") 63 | self.ablator = ablator 64 | self.ablation_study = ablation_study 65 | self.direction = direction 66 | self.model = model 67 | self.dataset = dataset 68 | -------------------------------------------------------------------------------- /maggy/config/base_config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from maggy.config import LagomConfig 20 | from maggy.core import config as mc 21 | 22 | 23 | class BaseConfig(LagomConfig): 24 | def __init__( 25 | self, 26 | name: str = "base", 27 | hb_interval: int = 1, 28 | description: str = "", 29 | ): 30 | 31 | """Initializes Base configuration. 32 | 33 | :param name: Experiment name. 34 | :param hb_interval: Heartbeat interval with which the server is polling. 35 | :param description: A description of the experiment. 36 | """ 37 | super().__init__(name, description, hb_interval) 38 | mc.initialize() 39 | -------------------------------------------------------------------------------- /maggy/config/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | import tensorflow as tf
22 |
23 | if typing.TYPE_CHECKING:
24 |     import torch
25 |
26 | from maggy import Searchspace
27 | from maggy.earlystop import AbstractEarlyStop
28 | from maggy.optimizer import AbstractOptimizer
29 | from maggy.config import LagomConfig
30 | from maggy.core import config as mc
31 |
32 |
33 | class HyperparameterOptConfig(LagomConfig):
34 |     """Config class for hyperparameter optimization experiments."""
35 |
36 |     def __init__(
37 |         self,
38 |         num_trials: int,
39 |         optimizer: Union[str, AbstractOptimizer],
40 |         searchspace: Searchspace,
41 |         optimization_key: str = "Metric",
42 |         direction: str = "max",
43 |         es_interval: int = 1,
44 |         es_min: int = 10,
45 |         es_policy: Union[str, AbstractEarlyStop] = "median",
46 |         name: str = "HPOptimization",
47 |         description: str = "",
48 |         hb_interval: int = 1,
49 |         model: Union[
50 |             tf.keras.Model, Type[torch.nn.Module], List[Type[torch.nn.Module]]
51 |         ] = None,
52 |         dataset: List[
53 |             Optional[Union[str, tf.data.Dataset, torch.utils.data.Dataset]]
54 |         ] = None,
55 |     ):
56 |         """Initializes HP optimization experiment parameters.
57 |
58 |         :param num_trials: Controls how many separate runs are conducted during the hp search.
59 |         :param optimizer: Optimizer type for searching the hp searchspace.
60 |         :param searchspace: A Searchspace object configuring the names, types and ranges of hps.
61 |         :param optimization_key: Name of the metric to use for hp search evaluation.
62 |         :param direction: Direction of optimization.
63 |         :param es_interval: Early stopping polling frequency during an experiment run.
64 |         :param es_min: Minimum number of experiments to conduct before starting the early stopping
65 |             mechanism. Useful to establish a baseline for performance estimates.
66 |         :param es_policy: Early stopping policy which formulates a rule for triggering aborts.
67 |         :param name: Experiment name.
68 |         :param description: A description of the experiment.
69 |         :param hb_interval: Heartbeat interval with which the server is polling.
70 |         :param model: The class of the model to be used in the training function.
71 |         :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset or
72 |             torch.utils.data.Dataset. These datasets represent the ones you are going to use in the training function.
73 |             For example, if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and
74 |             extract them in the training function. If you want to load the set inside the training function, this can be
75 |             disregarded.
76 |         """
77 |         super().__init__(name, description, hb_interval)
78 |         if not mc.is_spark_available():
79 |             raise NotImplementedError(
80 |                 "Hyperparameter Optimization can run only on a Spark kernel."
81 |             )
82 |         if not num_trials > 0:
83 |             raise ValueError("Number of trials should be greater than zero!")
84 |         self.num_trials = num_trials
85 |         self.optimizer = optimizer
86 |         self.optimization_key = optimization_key
87 |         self.searchspace = searchspace
88 |         self.direction = direction
89 |         self.es_policy = es_policy
90 |         self.es_interval = es_interval
91 |         self.es_min = es_min
92 |         self.model = model
93 |         self.dataset = dataset
94 |
--------------------------------------------------------------------------------
/maggy/config/lagom.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | from abc import ABC
20 |
21 |
22 | class LagomConfig(ABC):
23 |     """Base class for lagom configuration classes."""
24 |
25 |     def __init__(self, name: str, description: str, hb_interval: int):
26 |         """Initializes basic experiment info.
27 |
28 |         :param name: Experiment name.
29 |         :param description: A description of the experiment.
30 |         :param hb_interval: Heartbeat interval with which the server is polling.
31 |         """
32 |         self.name = name
33 |         self.description = description
34 |         self.hb_interval = hb_interval
35 |
--------------------------------------------------------------------------------
/maggy/config/tf_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | from typing import Union, Callable, List, Optional
20 |
21 | from maggy.config import LagomConfig
22 |
23 | import tensorflow as tf
24 |
25 |
26 | class TfDistributedConfig(LagomConfig):
27 |     def __init__(
28 |         self,
29 |         model: tf.keras.Model = None,
30 |         dataset: List[Optional[Union[str, tf.data.Dataset]]] = None,
31 |         process_data: Callable = None,
32 |         mixed_precision: bool = False,
33 |         name: str = "tfDist",
34 |         hb_interval: int = 1,
35 |         description: str = "",
36 |         hparams: dict = None,
37 |     ):
38 |
39 |         """Initializes TensorFlow distributed training parameters.
40 |
41 |         :param model: A tf.keras.Model subclass or list of them.
42 |             Note that this has to be the class itself, not an instance.
43 |         :param dataset: A List of strings containing the dataset path or list of tf.data.Dataset.
44 |             These datasets represent the ones you are going to use in the training function. For example,
45 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
46 |             the training function. If you want to load the set inside the training function, this can be disregarded.
47 |         :param process_data: The function for processing the data.
48 |         :param hparams: model parameters that should be used during model initialization. Primarily
49 |             used to give an interface for hp optimization.
50 |         :param name: Experiment name.
51 |         :param hb_interval: Heartbeat interval with which the server is polling.
52 |         :param description: A description of the experiment.
53 |         """
54 |         super().__init__(name, description, hb_interval)
55 |         self.model = model
56 |         self.dataset = dataset
57 |         self.process_data = process_data
58 |         self.mixed_precision = mixed_precision
59 |         self.hparams = hparams if hparams else {}
60 |
--------------------------------------------------------------------------------
/maggy/config/torch_distributed.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import typing
20 | from typing import Union, Type, Optional, List
21 | from maggy.config import LagomConfig
22 | from maggy.core import config as mc
23 |
24 | if typing.TYPE_CHECKING:
25 |     import torch
26 |
27 |
28 | class TorchDistributedConfig(LagomConfig):
29 |     """LagomConfig class for running distributed PyTorch training."""
30 |
31 |     BACKENDS = ["torch", "deepspeed"]
32 |
33 |     def __init__(
34 |         self,
35 |         module: Union[Type[torch.nn.Module], List[Type[torch.nn.Module]]],
36 |         dataset: List[Optional[Union[str, torch.utils.data.Dataset]]] = None,
37 |         hparams: dict = None,
38 |         backend: str = "torch",
39 |         mixed_precision: bool = False,
40 |         zero_lvl: int = 0,
41 |         deepspeed_config: dict = None,
42 |         name: str = "torchDist",
43 |         hb_interval: int = 1,
44 |         description: str = "",
45 |     ):
46 |         """Initializes PyTorch distributed training parameters.
47 |
48 |         :param module: A PyTorch module class or list of PyTorch module classes.
49 |             Note that this has to be the class itself, not an instance.
50 |         :param dataset: A List of strings containing the dataset path or list of torch.utils.data.Dataset.
51 |             These datasets represent the ones you are going to use in the training function. For example,
52 |             if you have 2 datasets for training and testing, pass an array with [train_set, test_set] and extract them in
53 |             the training function. If you want to load the set inside the training function, this can be disregarded.
54 |         :param hparams: Hyperparameters that should be used during model initialization. Primarily
55 |             used to give an interface for hp optimization.
56 |         :param backend: The backend framework used for training.
Note that `deepspeed` needs syntax 57 | changes to a normal PyTorch script! 58 | :param mixed_precision: Used to control the use of mixed precision training in `torch` 59 | backend mode with model sharding (`zero_lvl` 3). 60 | :param zero_lvl: Sets the ZeRO optimization stages for `torch`. Note: When using `deepspeed` 61 | backend, overwrites `deepspeed_config` zero level! 62 | :param deepspeed_config: A dictionary that represents a valid deepspeed ZeRO optimizer 63 | config. For information on the config, see https://www.deepspeed.ai/docs/config-json/. 64 | :param name: Experiment name. 65 | :param hb_interval: Heartbeat interval with which the server is polling. 66 | :param description: A description of the experiment. 67 | """ 68 | super().__init__(name, description, hb_interval) 69 | mc.initialize() 70 | if not mc.is_spark_available(): 71 | raise NotImplementedError( 72 | "Torch Distributed Training can run only on a Spark kernel." 73 | ) 74 | self.module = module 75 | self.dataset = dataset 76 | if backend not in self.BACKENDS: 77 | raise ValueError( 78 | """Backend {} not supported by Maggy. 79 | Supported types are: {}""".format( 80 | backend, self.BACKENDS 81 | ) 82 | ) 83 | self.backend = backend 84 | self.mixed_precision = mixed_precision 85 | self.hparams = hparams if hparams else {} 86 | self.zero_lvl = zero_lvl 87 | self.ds_config = deepspeed_config 88 | -------------------------------------------------------------------------------- /maggy/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Constants used in Maggy: Allowed datatypes etc. 19 | """ 20 | import numpy as np 21 | 22 | 23 | class USER_FCT: 24 | """User training function specifics.""" 25 | 26 | RETURN_TYPES = (float, int, np.number, dict) 27 | NUMERIC_TYPES = (float, int, np.number) 28 | -------------------------------------------------------------------------------- /maggy/core/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
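# --- Illustrative sketch (not part of the maggy source) ---------------------
# Configuring distributed PyTorch training with the TorchDistributedConfig
# defined above. The module, the dataset paths and the hyperparameter values
# are assumptions for illustration; note that the class itself is passed, not
# an instance.
#
# import torch
# from maggy.config import TorchDistributedConfig
#
# class MyModule(torch.nn.Module):
#     def __init__(self, hidden=128):
#         super().__init__()
#         self.net = torch.nn.Sequential(
#             torch.nn.Linear(784, hidden),
#             torch.nn.ReLU(),
#             torch.nn.Linear(hidden, 10),
#         )
#
#     def forward(self, x):
#         return self.net(x)
#
# config = TorchDistributedConfig(
#     module=MyModule,                                   # the class, not MyModule()
#     dataset=["/path/train.parquet", "/path/test.parquet"],
#     hparams={"hidden": 256},
#     backend="torch",
#     zero_lvl=0,
#     name="torch-dist-example",
# )
# -----------------------------------------------------------------------------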
15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/config.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import tensorflow as tf 18 | 19 | SPARK_AVAILABLE = None 20 | try: 21 | from pyspark.sql import SparkSession # noqa: F401 22 | 23 | SPARK_AVAILABLE = True 24 | except ModuleNotFoundError: 25 | SPARK_AVAILABLE = False 26 | 27 | MODE = None 28 | TF_VERSION = None 29 | 30 | 31 | def initialize(): 32 | tf_full = tf.__version__.split(".")[0] 33 | # for building the docs since mock object doesn't mock int() 34 | global TF_VERSION 35 | global MODE 36 | if not isinstance(tf_full, str): 37 | TF_VERSION = 2 38 | else: 39 | TF_VERSION = int(tf_full) 40 | 41 | print("Detected Kernel: Python.") if not SPARK_AVAILABLE else print( 42 | "Detected Kernel: Spark." 43 | ) 44 | 45 | 46 | def is_spark_available(): 47 | return SPARK_AVAILABLE 48 | -------------------------------------------------------------------------------- /maggy/core/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/environment/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | import shutil 19 | import warnings 20 | 21 | from maggy import util 22 | from maggy.core.rpc import Client 23 | 24 | 25 | class BaseEnv: 26 | """ 27 | Support maggy on a local pyspark installation. 
28 | """ 29 | 30 | def __init__(self): 31 | self.log_dir = os.path.join(os.getcwd(), "experiment_log") 32 | if not os.path.exists(self.log_dir): 33 | os.mkdir(self.log_dir) 34 | 35 | def set_ml_id(self, app_id=0, run_id=0): 36 | os.environ["ML_ID"] = str(app_id) + "_" + str(run_id) 37 | 38 | def create_experiment_dir(self, app_id, run_id): 39 | if not os.path.exists(os.path.join(self.log_dir, app_id)): 40 | os.mkdir(os.path.join(self.log_dir, app_id)) 41 | 42 | experiment_path = self.get_logdir(app_id, run_id) 43 | if os.path.exists(experiment_path): 44 | shutil.rmtree(experiment_path) 45 | 46 | os.mkdir(experiment_path) 47 | 48 | def get_logdir(self, app_id, run_id): 49 | return os.path.join(self.log_dir, str(app_id), str(run_id)) 50 | 51 | def populate_experiment( 52 | self, 53 | model_name, 54 | function, 55 | type, 56 | hp, 57 | description, 58 | app_id, 59 | direction, 60 | optimization_key, 61 | ): 62 | pass 63 | 64 | def attach_experiment_xattr(self, exp_ml_id, experiment_json, command): 65 | pass 66 | 67 | def exists(self, hdfs_path): 68 | return os.path.exists(hdfs_path) 69 | 70 | def mkdir(self, hdfs_path): 71 | return os.mkdir(hdfs_path) 72 | 73 | def isdir(self, dir_path, project=None): 74 | return os.path.isdir(dir_path) 75 | 76 | def ls(self, dir_path): 77 | return os.listdir(dir_path) 78 | 79 | def delete(self, path, recursive=False): 80 | 81 | if self.exists(path): 82 | if os.path.isdir(path): 83 | if recursive: 84 | # remove the directory recursively 85 | shutil.rmtree(path) 86 | elif not os.listdir(path): 87 | os.rmdir(path) 88 | else: 89 | warnings.warn( 90 | "Could not delete the dir {}, not empty.\n" 91 | "Use recursive=True when calling this function".format(path) 92 | ) 93 | elif os.path.isfile(path): 94 | os.remove(path) 95 | else: 96 | warnings.warn( 97 | "Could not delete the file in {}.\n" 98 | "File does not exists.".format(path) 99 | ) 100 | 101 | def dump(self, data, hdfs_path): 102 | head_tail = os.path.split(hdfs_path) 103 | if not os.path.exists(head_tail[0]): 104 | os.makedirs(head_tail[0]) 105 | with self.open_file(hdfs_path, flags="w") as file: 106 | file.write(data) 107 | 108 | def get_ip_address(self): 109 | sc = util.find_spark().sparkContext 110 | return sc._conf.get("spark.driver.host") 111 | 112 | def get_constants(self): 113 | pass 114 | 115 | def open_file(self, hdfs_path, flags="r", buff_size=-1): 116 | return open(hdfs_path, mode=flags, buffering=buff_size) 117 | 118 | def get_training_dataset_path( 119 | self, training_dataset, featurestore=None, training_dataset_version=1 120 | ): 121 | pass 122 | 123 | def get_training_dataset_tf_record_schema( 124 | self, training_dataset, training_dataset_version=1, featurestore=None 125 | ): 126 | pass 127 | 128 | def get_featurestore_metadata(self, featurestore=None, update_cache=False): 129 | pass 130 | 131 | def init_ml_tracking(self, app_id, run_id): 132 | pass 133 | 134 | def log_searchspace(self, app_id, run_id, searchspace): 135 | pass 136 | 137 | def connect_host(self, server_sock, server_host_port, exp_driver): 138 | if not server_host_port: 139 | server_sock.bind(("", 0)) 140 | # hostname may not be resolvable but IP address probably will be 141 | host = self.get_ip_address() 142 | port = server_sock.getsockname()[1] 143 | server_host_port = (host, port) 144 | 145 | else: 146 | server_sock.bind(server_host_port) 147 | 148 | server_sock.listen(10) 149 | 150 | return server_sock, server_host_port 151 | 152 | def _upload_file_output(self, retval, hdfs_exec_logdir): 153 | pass 154 | 155 | def 
project_path(self): 156 | return os.getcwd() 157 | 158 | def get_user(self): 159 | return "" 160 | 161 | def project_name(self): 162 | return "" 163 | 164 | def finalize_experiment( 165 | self, 166 | experiment_json, 167 | metric, 168 | app_id, 169 | run_id, 170 | state, 171 | duration, 172 | logdir, 173 | best_logdir, 174 | optimization_key, 175 | ): 176 | pass 177 | 178 | def str_or_byte(self, str): 179 | return str 180 | 181 | def get_executors(self, sc): 182 | 183 | if sc._conf.get("spark.dynamicAllocation.enabled") == "true": 184 | maxExecutors = int( 185 | sc._conf.get("spark.dynamicAllocation.maxExecutors", defaultValue="-1") 186 | ) 187 | if maxExecutors == -1: 188 | raise KeyError( 189 | 'Failed to find "spark.dynamicAllocation.maxExecutors" property, ' 190 | "but dynamicAllocation is enabled. " 191 | "Define the number of min and max executors when building the spark session." 192 | ) 193 | else: 194 | maxExecutors = int( 195 | sc._conf.get("spark.executor.instances", defaultValue="-1") 196 | ) 197 | if maxExecutors == -1: 198 | raise KeyError( 199 | 'Failed to find "spark.executor.instances" property, ' 200 | 'Define the number of executors using "spark.executor.instances" ' 201 | "when building the spark session." 202 | ) 203 | return maxExecutors 204 | 205 | def build_summary_json(self, logdir): 206 | pass 207 | 208 | def connect_hsfs(self): 209 | pass 210 | 211 | def convert_return_file_to_arr(self, return_file): 212 | pass 213 | 214 | def upload_file_output(self, retval, hdfs_exec_logdir): 215 | pass 216 | 217 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 218 | client_addr = ( 219 | self.get_ip_address(), 220 | sock.getsockname()[1], 221 | ) 222 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 223 | -------------------------------------------------------------------------------- /maggy/core/environment/databricks.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | from maggy.core.environment.base import BaseEnv 20 | from maggy.core.rpc import Client 21 | 22 | 23 | class DatabricksEnv(BaseEnv): 24 | """ 25 | This class extends BaseEnv. 26 | Environment implemented for maggy usage on Databricks. 
27 | """ 28 | 29 | def __init__(self): 30 | self.log_dir = "/dbfs/maggy_log/" 31 | if not os.path.exists(self.log_dir): 32 | os.mkdir(self.log_dir) 33 | 34 | def mkdir(self, hdfs_path): 35 | return os.mkdir(hdfs_path) 36 | 37 | def project_path(self, project=None, exclude_nn_addr=False): 38 | return "/dbfs/" 39 | 40 | def get_executors(self, sc): 41 | if ( 42 | sc._conf.get("spark.databricks.clusterUsageTags.clusterScalingType") 43 | == "autoscaling" 44 | ): 45 | maxExecutors = int( 46 | sc._conf.get( 47 | "spark.databricks.clusterUsageTags.clusterMaxWorkers", 48 | defaultValue="-1", 49 | ) 50 | ) 51 | if maxExecutors == -1: 52 | raise KeyError( 53 | 'Failed to find "spark.databricks.clusterUsageTags.clusterMaxWorkers" property, ' 54 | "but clusterScalingType is set to autoscaling." 55 | ) 56 | else: 57 | maxExecutors = int( 58 | sc._conf.get( 59 | "spark.databricks.clusterUsageTags.clusterWorkers", 60 | defaultValue="-1", 61 | ) 62 | ) 63 | if maxExecutors == -1: 64 | raise KeyError( 65 | 'Failed to find "spark.databricks.clusterUsageTags.clusterWorkers" property.' 66 | ) 67 | return maxExecutors 68 | 69 | def get_client(self, server_addr, partition_id, hb_interval, secret, sock): 70 | server_addr = (server_addr[0], server_addr[1]) 71 | client_addr = ( 72 | server_addr[0], 73 | sock.getsockname()[1], 74 | ) 75 | return Client(server_addr, client_addr, partition_id, 0, hb_interval, secret) 76 | 77 | def get_logdir(self, app_id, run_id): 78 | return self.log_dir 79 | -------------------------------------------------------------------------------- /maggy/core/environment/singleton.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | 19 | 20 | class EnvSing(object): 21 | 22 | obj = None 23 | 24 | def __new__(cls, *args, **kwargs): 25 | if EnvSing.obj is not None: 26 | raise Exception("A Test Singleton instance already exists") 27 | 28 | # check hopsworks availability 29 | if "REST_ENDPOINT" in os.environ: 30 | print("Detected Environment: Hopsworks.") 31 | 32 | from maggy.core.environment import hopsworks 33 | 34 | EnvSing.obj = hopsworks.HopsworksEnv() 35 | 36 | elif os.environ.get("DATABRICKS_ROOT_CONDA_ENV") == "databricks-ml": 37 | print("Detected Environment: Databricks.") 38 | 39 | from maggy.core.environment import databricks 40 | 41 | EnvSing.obj = databricks.DatabricksEnv() 42 | 43 | else: 44 | print("Detected Environment: base.") 45 | 46 | from maggy.core.environment import base 47 | 48 | EnvSing.obj = base.BaseEnv() 49 | 50 | if EnvSing.obj is None: 51 | raise NotImplementedError( 52 | "environment_instance is None, environment not initialised." 53 | ) 54 | 55 | @staticmethod 56 | def get_instance(): 57 | """ 58 | return an instance of the environment to be used by maggy within a session. 
59 | """ 60 | if EnvSing.obj is None: 61 | EnvSing() 62 | return EnvSing.obj 63 | -------------------------------------------------------------------------------- /maggy/core/exceptions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Maggy specific exceptions. 19 | """ 20 | 21 | 22 | class EarlyStopException(Exception): 23 | """Raised by the reporter when a early stop signal is received.""" 24 | 25 | def __init__(self, metric): 26 | super().__init__() 27 | self.metric = metric 28 | 29 | 30 | class NotSupportedError(Exception): 31 | """Raised when we are dealing with a situation that we do not (yet) 32 | support, e.g., a specific dataset type. 33 | """ 34 | 35 | def __init__(self, category, value, suggestion=""): 36 | self.message = "({0}: {1}) is not supported by Maggy.{2}".format( 37 | category, value, suggestion 38 | ) 39 | super().__init__(self.message) 40 | 41 | 42 | class ReturnTypeError(TypeError): 43 | """User defined training function returns value of wrong type.""" 44 | 45 | def __init__(self, optimization_key, return_type): 46 | self.message = ( 47 | "Training function cannot return value of type: {}. " 48 | "Return single numeric value or 'dict' containing optimization key" 49 | " `{}` with numeric value".format( 50 | type(return_type).__name__, optimization_key 51 | ) 52 | ) 53 | super().__init__(self.message) 54 | 55 | 56 | class MetricTypeError(TypeError): 57 | """User defined training function returns metric of wrong type.""" 58 | 59 | def __init__(self, optimization_key, return_type): 60 | self.message = ( 61 | "The optimization metric `{}` returned by the training function is" 62 | " of type: {}. The optimization metric can only be numeric".format( 63 | optimization_key, type(return_type).__name__ 64 | ) 65 | ) 66 | super().__init__(self.message) 67 | 68 | 69 | class BroadcastMetricTypeError(TypeError): 70 | """User defined training function broadcasts metric of wrong type.""" 71 | 72 | def __init__(self, return_type): 73 | self.message = ( 74 | "The optimization metric broadcast by the training function with " 75 | "the reporter is of type: {}. The optimization metric can only " 76 | "be numeric".format(type(return_type).__name__) 77 | ) 78 | super().__init__(self.message) 79 | 80 | 81 | class BroadcastStepTypeError(TypeError): 82 | """User defined training function broadcasts metric with a non-numeric step 83 | type. 84 | """ 85 | 86 | def __init__(self, value, step): 87 | self.message = ( 88 | "The optimization metric `{}` was broadcast by the training " 89 | " function in step {}, which is of type {}. 
The step value can "
90 |             "only be numeric.".format(value, step, type(step).__name__)
91 |         )
92 |         super().__init__(self.message)
93 |
94 |
95 | class BroadcastStepValueError(ValueError):
96 |     """User defined training function broadcasts metric with a
97 |     non-monotonically increasing step attribute.
98 |     """
99 |
100 |     def __init__(self, value, step, prev_step):
101 |         self.message = (
102 |             "The optimization metric `{}` was broadcast by the training "
103 |             " function in step {}, while the previous step was {}. The steps "
104 |             "should be a monotonically increasing attribute.".format(
105 |                 value, step, prev_step
106 |             )
107 |         )
108 |         super().__init__(self.message)
109 |
110 |
111 | class BadArgumentsError(Exception):
112 |     """Raised when a function or method has been called with incompatible arguments.
113 |     This can be used by developers to prevent bad usage of their functions
114 |     or classes by other developers.
115 |     """
116 |
117 |     def __init__(self, callable, suggestion=""):
118 |         self.message = "{0} was called using incompatible arguments. {1}".format(
119 |             callable, suggestion
120 |         )
121 |         super().__init__(self.message)
122 |
--------------------------------------------------------------------------------
/maggy/core/executors/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
--------------------------------------------------------------------------------
/maggy/core/executors/base_executor.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2021 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 |
17 | from typing import Callable, Any
18 |
19 |
20 | def base_executor_fn(
21 |     train_fn: Callable,
22 | ) -> Callable:
23 |     """Wraps the user supplied training function in order to be passed to the Spark Executors.
24 |
25 |     :param train_fn: Original training function. It is called without
26 |         arguments on the executors.
27 |
28 |     :returns: Patched function to execute on the Spark executors.
29 |     """
30 |
31 |     def wrapper_function(_: Any) -> Any:
32 |         """Patched function from the base_executor_fn factory.
33 |
34 |         :param _: Necessary catch for the iterator given by Spark to the
35 |             function upon foreach calls. Can safely be disregarded.
36 | """ 37 | 38 | retval = train_fn() 39 | return retval 40 | 41 | return wrapper_function 42 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from .optimization_driver import HyperparameterOptDriver 18 | from .ablation_driver import AblationDriver 19 | from .base_driver import BaseDriver 20 | 21 | 22 | __all__ = ["HyperparameterOptDriver", "AblationDriver", "BaseDriver"] 23 | -------------------------------------------------------------------------------- /maggy/core/experiment_driver/torch_distributed_training_driver.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from pickle import PicklingError 18 | from typing import Callable, Type, Any 19 | 20 | from maggy import util 21 | from maggy.core.environment.singleton import EnvSing 22 | from maggy.config import TorchDistributedConfig 23 | from maggy.core.rpc import DistributedTrainingServer 24 | from maggy.core.experiment_driver.spark_driver import Driver 25 | from maggy.core.executors.torch_dist_executor import torch_dist_executor_fn 26 | 27 | 28 | class TorchDistributedTrainingDriver(Driver): 29 | """Driver for distributed learning on a Spark cluster. 30 | 31 | Registers the workers on an RPC server, ensures proper configuration and 32 | logging, and accumulates final results. 33 | """ 34 | 35 | def __init__(self, config: TorchDistributedConfig, app_id: int, run_id: int): 36 | """Initializes the server for initial training setup communication and log collection. 37 | 38 | :param config: Experiment config. 39 | :param app_id: Maggy application ID. 40 | :param run_id: Maggy run ID. 41 | """ 42 | super().__init__(config, app_id, run_id) 43 | self.server = DistributedTrainingServer(self.num_executors, config.__class__) 44 | self.results = [] 45 | 46 | def _exp_startup_callback(self) -> None: 47 | """No special startup actions required.""" 48 | 49 | def _exp_final_callback(self, job_end: float, _: Any) -> dict: 50 | """Calculates the average test error from all partitions. 51 | 52 | :param job_end: Time of the job end. 
53 | :param _: Catches additional callback arguments. 54 | 55 | :returns: The result in a dictionary. 56 | """ 57 | result = {"test result": self.average_metric()} 58 | exp_ml_id = str(self.app_id) + "_" + str(self.run_id) 59 | EnvSing.get_instance().attach_experiment_xattr( 60 | exp_ml_id, 61 | {"state": "FINISHED", "duration": int(job_end - self.job_start) * 1000}, 62 | "FULL_UPDATE", 63 | ) 64 | print("Final average test loss: {:.3f}".format(self.average_metric())) 65 | print( 66 | "Finished experiment. Total run time: " 67 | + util.time_diff(self.job_start, job_end) 68 | ) 69 | return result 70 | 71 | def _exp_exception_callback(self, exc: Type[Exception]) -> None: 72 | """Catches pickling errors in case the input arguments (most likely 73 | the dataset) are too large to be pickled, or not compatible. 74 | 75 | :param exc: The exception to handle. 76 | 77 | :raises RuntimeError: Provides the user with additional information 78 | about avoiding pickle problems and includes the pickle error. 79 | """ 80 | if isinstance(exc, PicklingError): 81 | raise RuntimeError( 82 | """Pickling has failed. This is most likely caused by one of 83 | the following reasons: Your module class can't be pickled, or your 84 | dataset is too large. 85 | Consider passing a custom dataloader that reads from files in 86 | case of large datasets, and verify that your module is 87 | pickleable!""" 88 | ) 89 | raise exc 90 | 91 | def _patching_fn( 92 | self, train_fn: Callable, config: TorchDistributedConfig 93 | ) -> Callable: 94 | """Monkey patches the user training function with the distributed 95 | executor modifications for distributed training. 96 | 97 | :param train_fn: User provided training function. 98 | 99 | :returns: The monkey patched training function. 100 | """ 101 | return torch_dist_executor_fn( 102 | train_fn, 103 | config, 104 | self.app_id, 105 | self.run_id, 106 | self.server_addr, 107 | self.hb_interval, 108 | self._secret, 109 | self.log_dir, 110 | ) 111 | 112 | def _register_msg_callbacks(self) -> None: 113 | """Registers a metric message callback for heartbeat responses to spark 114 | magic and a final callback to process experiment results. 115 | """ 116 | self.message_callbacks["METRIC"] = self._log_msg_callback 117 | self.message_callbacks["FINAL"] = self._final_msg_callback 118 | 119 | def _log_msg_callback(self, msg: dict) -> None: 120 | """Callback for heartbeat messages with logs from the executors. 121 | 122 | :param msg: Message from the executors. Contains logs to be written to 123 | jupyter and the DFS. 124 | """ 125 | logs = msg.get("logs", None) 126 | if logs is not None: 127 | with self.log_lock: 128 | self.executor_logs = self.executor_logs + logs 129 | 130 | def _final_msg_callback(self, msg: dict) -> None: 131 | """Appends the test result from the workers to the result list. 132 | 133 | :param msg: Final message from the executors. 134 | """ 135 | self.results.append(msg.get("data", None)) 136 | 137 | def average_metric(self) -> float: 138 | """Calculates the current average over the valid results. 139 | 140 | :returns: The average result value. 
141 | """ 142 | valid_results = [x for x in self.results if x is not None] 143 | if len(valid_results) > 0: 144 | return sum(valid_results) / len(valid_results) 145 | else: 146 | return 0 147 | -------------------------------------------------------------------------------- /maggy/core/patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import torch 18 | 19 | from .dataloader import MaggyDataLoader, MaggyPetastormDataLoader 20 | from .modules import ( 21 | get_maggy_ddp_wrapper, 22 | get_maggy_fairscale_wrapper, 23 | get_maggy_deepspeed_wrapper, 24 | ) 25 | 26 | __all__ = [ 27 | "get_maggy_ddp_wrapper", 28 | "get_maggy_fairscale_wrapper", 29 | "get_maggy_deepspeed_wrapper", 30 | "MaggyDataLoader", 31 | "MaggyPetastormDataLoader", 32 | ] 33 | 34 | # Check torch version, only import ZeroRedundancyOptimizer if >= 1.8 35 | _torch_version = torch.__version__.split(".") 36 | if int(_torch_version[0]) > 1 or int(_torch_version[1]) >= 8: 37 | from .optim import ( 38 | MaggyZeroAdadelta, 39 | MaggyZeroAdagrad, 40 | MaggyZeroAdam, 41 | MaggyZeroAdamW, 42 | MaggyZeroSparseAdam, 43 | MaggyZeroAdamax, 44 | MaggyZeroASGD, 45 | MaggyZeroLBFGS, 46 | MaggyZeroRMSprop, 47 | MaggyZeroRprop, 48 | MaggyZeroSGD, 49 | ) 50 | 51 | __all__ += [ 52 | "MaggyZeroAdadelta", 53 | "MaggyZeroAdagrad", 54 | "MaggyZeroAdam", 55 | "MaggyZeroAdamW", 56 | "MaggyZeroSparseAdam", 57 | "MaggyZeroAdamax", 58 | "MaggyZeroASGD", 59 | "MaggyZeroLBFGS", 60 | "MaggyZeroRMSprop", 61 | "MaggyZeroRprop", 62 | "MaggyZeroSGD", 63 | ] 64 | -------------------------------------------------------------------------------- /maggy/core/patching/dataloader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | #
16 |
17 | from __future__ import annotations
18 |
19 | import os
20 | from typing import Type, Union, Optional, Any, Callable
21 | import collections
22 |
23 | import torch
24 | from torch.utils.data import Dataset, Sampler
25 | from torch.utils.data import DataLoader as TorchDataLoader
26 | from petastorm.reader import make_reader, make_batch_reader
27 | from petastorm.pytorch import DataLoader as PetastormDataLoader
28 | from petastorm.transform import TransformSpec
29 |
30 | from maggy.core.environment.singleton import EnvSing
31 |
32 |
33 | class MaggyDataLoader(TorchDataLoader):
34 |     """Monkey patching class for PyTorch's DataLoader.
35 |
36 |     Patches the DataLoader to include a distributed sampler. Uses environment
37 |     variables for info such as the world size for the DataLoader. These can be
38 |     assumed to be present since Maggy's distributed experiment sets them prior
39 |     to running the training.
40 |     Automatically moves training data to the GPU since distributed training
41 |     requires execution on GPUs.
42 |     """
43 |
44 |     def __init__(
45 |         self,
46 |         dataset: Union[Type[Dataset], str],
47 |         batch_size: int = 1,
48 |         shuffle: Any = False,
49 |         sampler: Optional[Sampler[int]] = None,
50 |         batch_sampler: Optional[Any] = None,
51 |         num_workers: int = 0,
52 |         collate_fn: Optional[Callable] = None,
53 |         pin_memory: bool = False,
54 |         drop_last: bool = False,
55 |         timeout: float = 0,
56 |         worker_init_fn: Optional[Callable] = None,
57 |         **_: Any,
58 |     ):
59 |         """Initializes a torch DataLoader.
60 |
61 |         :param dataset: A PyTorch Dataset.
62 |         :param batch_size: How many samples per batch to load (default: ``1``).
63 |         :param shuffle: Discarded, not compatible with Maggy.
64 |         :param sampler: Discarded, gets replaced by DistributedSampler.
65 |         :param batch_sampler: Discarded, not compatible with Maggy.
66 |         :param num_workers: Discarded, currently crashes Spark if set >0.
67 |         :param collate_fn: Merges a list of samples to a minibatch of tensors.
68 |         :param pin_memory: Automatically transfer tensors to GPU.
69 |         :param drop_last: Drop last incomplete batch.
70 |         :param timeout: Timeout for collecting a batch.
71 |         :param worker_init_fn: Executed on each worker with worker ID.
72 |         :param _: Argument catch to stay compatible with PyTorch.
73 |         """
74 |         sampler = torch.utils.data.distributed.DistributedSampler(dataset=dataset)
75 |         super().__init__(
76 |             dataset,
77 |             batch_size,
78 |             shuffle=False,
79 |             sampler=sampler,
80 |             batch_sampler=None,
81 |             num_workers=0,  # Multiprocessing workers do not work at the moment.
82 |             collate_fn=collate_fn,
83 |             pin_memory=pin_memory,
84 |             drop_last=drop_last,
85 |             timeout=timeout,
86 |             worker_init_fn=worker_init_fn,
87 |         )
88 |         self.iterator = None
89 |
90 |     def __iter__(self) -> MaggyDataLoader:
91 |         # Reload the dataset when new iterator requested.
92 |         self.iterator = TorchDataLoader.__iter__(self)
93 |         return self
94 |
95 |     def __next__(self) -> Union[torch.Tensor, list, dict]:
96 |         data = self.iterator.__next__()
97 |         return _to_cuda(data)
98 |
99 |
100 | class MaggyPetastormDataLoader(PetastormDataLoader):
101 |     """Maggy implementation of a Petastorm parquet DataLoader.
102 |
103 |     Arguments such as world size, reader and rank are automated to make
104 |     PetastormDataLoader as similar to PyTorch's DataLoader as possible.
105 |     """
106 |
107 |     def __init__(
108 |         self, dataset: str, batch_size: int = 1, transform_spec: TransformSpec = None
109 |     ):
110 |         """Initializes a reader depending on the dataset (Petastorm/Parquet).
111 | 112 | :param dataset: Path to the dataset. 113 | :param batch_size: How many samples per batch to load (default: ``1``). 114 | :param transform_spec: Petastorm transform spec for data augmentation. 115 | """ 116 | num_workers = int(os.environ["WORLD_SIZE"]) # Is set at lagom startup. 117 | rank = int(os.environ["RANK"]) 118 | is_peta_ds = EnvSing.get_instance().exists( 119 | dataset.rstrip("/") + "/_common_metadata" 120 | ) 121 | # Make reader only compatible with petastorm dataset. 122 | ds_type = "Petastorm" if is_peta_ds else "Parquet" 123 | print(f"{ds_type} dataset detected in folder {dataset}") 124 | reader_factory = make_reader if is_peta_ds else make_batch_reader 125 | reader = reader_factory( 126 | dataset, 127 | cur_shard=rank, 128 | shard_count=num_workers, 129 | transform_spec=TransformSpec(transform_spec), 130 | ) 131 | super().__init__(reader, batch_size=batch_size) 132 | self.iterator = None 133 | 134 | def __iter__(self) -> MaggyPetastormDataLoader: 135 | # Reload the dataset when new iterator requested. 136 | self.iterator = PetastormDataLoader.__iter__(self) 137 | return self 138 | 139 | def __next__(self) -> Union[torch.Tensor, list, dict]: 140 | data = self.iterator.__next__() 141 | return _to_cuda(data) 142 | 143 | def __len__(self): 144 | raise NotImplementedError("Petastorm dataloader does not support __len__.") 145 | 146 | 147 | def _to_cuda(data: Union[torch.Tensor, list, dict]) -> Union[torch.Tensor, list, dict]: 148 | """Recurses into data, transfers tensors to GPU. 149 | 150 | :param data: The data structure to be transferred. 151 | 152 | :raises TypeError: In case of unsupported data structures. 153 | 154 | :returns: The transfered data structure. 155 | """ 156 | if isinstance(data, collections.abc.Mapping): 157 | return {key: _to_cuda(val) for key, val in data.items()} 158 | if isinstance(data, (list, tuple)): 159 | data_list = [_to_cuda(el) for el in data] 160 | return data_list if isinstance(data, list) else tuple(data_list) 161 | if isinstance(data, torch.Tensor): 162 | return data.cuda() 163 | raise TypeError(f"Type {type(data)} currently not supported!") 164 | -------------------------------------------------------------------------------- /maggy/core/patching/modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
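# --- Illustrative sketch (not part of the maggy source) ---------------------
# Using the patched data loaders above inside a distributed training function.
# The dataset object and the parquet path are assumptions for illustration;
# WORLD_SIZE/RANK are set by maggy before the function runs on the executors,
# and every batch is returned already on the GPU.
#
# from maggy.core.patching import MaggyDataLoader, MaggyPetastormDataLoader
#
# train_loader = MaggyDataLoader(train_dataset, batch_size=64)          # map-style dataset
# peta_loader = MaggyPetastormDataLoader("/path/to/train_set", batch_size=64)
#
# for batch in train_loader:
#     ...  # forward/backward pass; tensors in `batch` are already on the GPU
# -----------------------------------------------------------------------------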
15 | # 16 | 17 | from __future__ import annotations 18 | 19 | from types import SimpleNamespace 20 | from typing import Type, Any 21 | 22 | from torch.nn import Module as TorchModule 23 | from torch.nn.parallel import DistributedDataParallel as TorchDistributedDataParallel 24 | 25 | try: 26 | from deepspeed.pipe import PipelineModule 27 | from deepspeed.runtime.engine import DeepSpeedEngine 28 | from fairscale.nn import ( 29 | FullyShardedDataParallel as FairscaleFullyShardedDataParallel, 30 | ) 31 | except ImportError: 32 | print( 33 | """Warning: deepspeed and/or fairscale import failed. DeepSpeed backend and zero_lvl 3 34 | won't be available""" 35 | ) 36 | 37 | 38 | def get_maggy_ddp_wrapper(module: Type[TorchModule]): 39 | """Factory function for MaggyDDPModuleWrapper. 40 | 41 | :param module: PyTorch module passed by the user. 42 | """ 43 | 44 | class MaggyDDPModuleWrapper(TorchDistributedDataParallel): 45 | """Wrapper around PyTorch's DDP Module. 46 | 47 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 48 | we cannot add the module as an additional parameter during initialization. Instead, it is 49 | configured by its factory function. 50 | """ 51 | 52 | __module = module # Avoid overwriting torch module 53 | 54 | def __init__(self, *args: Any, **kwargs: Any): 55 | """Initializes the previously set module, moves it to the GPU and initializes a DDP 56 | module with it. 57 | 58 | :param args: Arguments passed by the user for module initialization. 59 | :param kwargs: Keyword arguments passed by the user for module initialization. 60 | """ 61 | # Avoid self because bound method adds to args which makes the function call fail 62 | model = MaggyDDPModuleWrapper.__module(*args, **kwargs).cuda() 63 | super().__init__(model) 64 | 65 | return MaggyDDPModuleWrapper 66 | 67 | 68 | def get_maggy_fairscale_wrapper(module: TorchModule, mixed_precision: bool): 69 | """Factory function for MaggyFairScaleModuleWrapper. 70 | 71 | :param module: PyTorch module passed by the user. 72 | :param mixed_precision: Switches on mixed precision for the FairscaleModule. 73 | """ 74 | 75 | class MaggyFairScaleModuleWrapper(FairscaleFullyShardedDataParallel): 76 | """Wrapper around Fairscale's FullyShardedDataParallel Module. 77 | 78 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 79 | we cannot add the module as an additional parameter during initialization. Instead, it is 80 | configured by its factory function. 81 | """ 82 | 83 | __module = module 84 | __mixed_precision = mixed_precision 85 | 86 | def __init__(self, *args: Any, **kwargs: Any): 87 | """Initializes the previously set module, moves it to the GPU and initializes a 88 | Fairscale FullyShardedDataParallel module with it. 89 | 90 | :param args: Arguments passed by the user for module initialization. 91 | :param kwargs: Keyword arguments passed by the user for module initialization. 92 | """ 93 | # Avoid self because bound method adds to args which makes the function call fail 94 | model = MaggyFairScaleModuleWrapper.__module(*args, **kwargs).cuda() 95 | super().__init__(model, mixed_precision=self.__mixed_precision) 96 | 97 | return MaggyFairScaleModuleWrapper 98 | 99 | 100 | def get_maggy_deepspeed_wrapper(module: TorchModule, config_params: dict): 101 | """Factory function for MaggyDeepSpeedModuleWrapper. 102 | 103 | :param module: PyTorch module passed by the user. 104 | :param mixed_precision: DeepSpeed config dict passed by the user. 
105 | """ 106 | assert ( 107 | module != PipelineModule 108 | ), """Maggy currently doesn't support pipeline 109 | modules with DeepSpeed ZeRO.""" 110 | 111 | class MaggyDeepSpeedModuleWrapper(DeepSpeedEngine): 112 | """Wrapper around DeepSpeed's DeepSpeedEngine. 113 | 114 | The wrapper replaces the user's module. Since the module's signature needs to be preserved, 115 | we cannot add the module as an additional parameter during initialization. Instead, it is 116 | configured by its factory function. 117 | """ 118 | 119 | __module = module 120 | __config_params = config_params 121 | 122 | def __init__(self, *args, **kwargs): 123 | """Initializes the previously set module and initializes a DeepSpeedEngine with it. 124 | 125 | :param args: Arguments passed by the user for module initialization. 126 | :param kwargs: Keyword arguments passed by the user for module initialization. 127 | """ 128 | # Avoid self because bound method adds to args which makes the function call fail. 129 | # No .cuda() calls for DeepSpeed necessary. 130 | model = MaggyDeepSpeedModuleWrapper.__module(*args, **kwargs) 131 | ds_args = SimpleNamespace(local_rank=0) 132 | super().__init__( 133 | ds_args, 134 | model, 135 | model_parameters=model.parameters(), 136 | config_params=self.__config_params, 137 | ) 138 | 139 | return MaggyDeepSpeedModuleWrapper 140 | -------------------------------------------------------------------------------- /maggy/core/patching/optim.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from __future__ import annotations 18 | 19 | import inspect 20 | from typing import Any 21 | from abc import ABC, abstractclassmethod 22 | 23 | 24 | import torch.optim as optim 25 | from torch.distributed.optim import ZeroRedundancyOptimizer 26 | 27 | 28 | class MaggyZeroOptimizer(ZeroRedundancyOptimizer, ABC): 29 | """Abstract base class for Maggy's optimizer patching classes.""" 30 | 31 | def __init__(self, *args: Any, **kwargs: Any): 32 | """Initializes a ZeroRedundancyOptimizer with the defined optim_cls as optimizer class. 33 | 34 | Passes any arguments for initialization of the default optimizer to the Zero optimizer. 35 | :param args: Optimizer args. Get reassigned into kwargs. 36 | :param kwargs: Optimizer kwargs. 
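
        Illustrative sketch of how a concrete subclass is used (``model`` is a
        placeholder ``torch.nn.Module``; a default process group must already
        be initialized, and Maggy normally applies this patching automatically):

        >>> optimizer = MaggyZeroAdam(model.parameters(), lr=1e-3)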
37 | """ 38 | # Move args to kwargs to pass args into kwargs only ZeroRedundancyOptimizer 39 | arg_spec = inspect.getfullargspec(self.optim_cls.__init__) 40 | for idx, arg in enumerate(args): 41 | kwargs[arg_spec.args[idx + 1]] = arg # +1 to skip self in arg_spec 42 | params = kwargs.pop("params", None) 43 | super().__init__( 44 | params, self.optim_cls, group=None, bucket_cap_kb=2**24, **kwargs 45 | ) 46 | 47 | @property 48 | @abstractclassmethod 49 | def optim_cls(cls: optim.Optimizer) -> MaggyZeroOptimizer: 50 | """Optimizer class property needs to be defined by each implementation of the base class.""" 51 | raise NotImplementedError 52 | 53 | 54 | class MaggyZeroAdadelta(MaggyZeroOptimizer): 55 | """Maggy's Zero wrapper around torch's Adadelta optimizer.""" 56 | 57 | optim_cls = optim.Adadelta 58 | 59 | 60 | class MaggyZeroAdagrad(MaggyZeroOptimizer): 61 | """Maggy's Zero wrapper around torch's Adagrad optimizer.""" 62 | 63 | optim_cls = optim.Adagrad 64 | 65 | 66 | class MaggyZeroAdam(MaggyZeroOptimizer): 67 | """Maggy's Zero wrapper around torch's Adam optimizer.""" 68 | 69 | optim_cls = optim.Adam 70 | 71 | 72 | class MaggyZeroAdamW(MaggyZeroOptimizer): 73 | """Maggy's Zero wrapper around torch's AdamW optimizer.""" 74 | 75 | optim_cls = optim.AdamW 76 | 77 | 78 | class MaggyZeroSparseAdam(MaggyZeroOptimizer): 79 | """Maggy's Zero wrapper around torch's SparseAdam optimizer.""" 80 | 81 | optim_cls = optim.SparseAdam 82 | 83 | 84 | class MaggyZeroAdamax(MaggyZeroOptimizer): 85 | """Maggy's Zero wrapper around torch's Adamax optimizer.""" 86 | 87 | optim_cls = optim.Adamax 88 | 89 | 90 | class MaggyZeroASGD(MaggyZeroOptimizer): 91 | """Maggy's Zero wrapper around torch's ASGD optimizer.""" 92 | 93 | optim_cls = optim.ASGD 94 | 95 | 96 | class MaggyZeroLBFGS(MaggyZeroOptimizer): 97 | """Maggy's Zero wrapper around torch's LBFGS optimizer.""" 98 | 99 | optim_cls = optim.LBFGS 100 | 101 | 102 | class MaggyZeroRMSprop(MaggyZeroOptimizer): 103 | """Maggy's Zero wrapper around torch's RMSprop optimizer.""" 104 | 105 | optim_cls = optim.RMSprop 106 | 107 | 108 | class MaggyZeroRprop(MaggyZeroOptimizer): 109 | """Maggy's Zero wrapper around torch's Rprop optimizer.""" 110 | 111 | optim_cls = optim.Rprop 112 | 113 | 114 | class MaggyZeroSGD(MaggyZeroOptimizer): 115 | """Maggy's Zero wrapper around torch's SGD optimizer.""" 116 | 117 | optim_cls = optim.SGD 118 | -------------------------------------------------------------------------------- /maggy/core/reporter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | API Module for the user to include in his training code. 
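
A minimal sketch of how the reporter is typically used inside a Maggy training
function (the ``train`` signature below is illustrative, not a fixed API):

>>> def train(reporter, **hparams):
...     for epoch in range(10):
...         acc = ...  # compute the validation metric for this epoch
...         reporter.broadcast(metric=acc, step=epoch)
...         reporter.log("epoch {} done".format(epoch))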
19 | 20 | """ 21 | import threading 22 | from datetime import datetime 23 | 24 | from maggy import constants 25 | from maggy.core import exceptions 26 | 27 | from maggy.core.environment.singleton import EnvSing 28 | 29 | 30 | class Reporter(object): 31 | """ 32 | Thread-safe store for sending a metric and logs from executor to driver 33 | """ 34 | 35 | def __init__(self, log_file, partition_id, task_attempt, print_executor): 36 | self.metric = None 37 | self.step = -1 38 | self.lock = threading.RLock() 39 | self.stop = False 40 | self.trial_id = None 41 | self.trial_log_file = None 42 | self.logs = "" 43 | self.log_file = log_file 44 | self.partition_id = partition_id 45 | self.task_attempt = task_attempt 46 | self.print_executor = print_executor 47 | 48 | # Open executor log file descriptor 49 | # This log is for all maggy system related log messages 50 | env = EnvSing.get_instance() 51 | if not env.exists(log_file): 52 | env.dump("", log_file) 53 | self.fd = env.open_file(log_file, flags="w") 54 | self.trial_fd = None 55 | 56 | def init_logger(self, trial_log_file): 57 | """Initializes the trial log file""" 58 | self.trial_log_file = trial_log_file 59 | env = EnvSing.get_instance() 60 | # Open trial log file descriptor 61 | if not env.exists(self.trial_log_file): 62 | env.dump("", self.trial_log_file) 63 | self.trial_fd = env.open_file(self.trial_log_file, flags="w") 64 | 65 | def close_logger(self): 66 | """Savely closes the file descriptors of the log files. 67 | 68 | close() can be called multiple times and flushes the buffer contents 69 | before closing 70 | """ 71 | with self.lock: 72 | if self.trial_fd: 73 | self.trial_fd.close() 74 | self.fd.close() 75 | 76 | # report 77 | def broadcast(self, metric, step=None): 78 | """Broadcast a metric to the experiment driver with the heartbeat. 79 | 80 | :param metric: Metric to be broadcasted 81 | :type metric: int, float 82 | :param step: The iteration step which produced the metric, e.g. batch or 83 | epoch number, or any other monotonically increasing progress attribute 84 | :type step: int 85 | :raises exception: EarlyStopException if told by the experiment driver 86 | """ 87 | with self.lock: 88 | # if stop == True -> raise exception to break training function 89 | if step is None: 90 | step = self.step + 1 91 | if not isinstance(metric, constants.USER_FCT.NUMERIC_TYPES): 92 | raise exceptions.BroadcastMetricTypeError(metric) 93 | elif not isinstance(step, constants.USER_FCT.NUMERIC_TYPES): 94 | raise exceptions.BroadcastStepTypeError(metric, step) 95 | elif step < self.step: 96 | raise exceptions.BroadcastStepValueError(metric, step, self.step) 97 | else: 98 | self.step = step 99 | self.metric = metric 100 | if self.stop: 101 | raise exceptions.EarlyStopException(metric) 102 | 103 | def log(self, log_msg, jupyter=False): 104 | """Logs a message to the executor logfile and executor stderr and 105 | optionally prints the message in jupyter. 106 | 107 | :param log_msg: Message to log. 
108 | :type log_msg: str 109 | :param verbose: Print in Jupyter Notebook, defaults to True 110 | :type verbose: bool, optional 111 | """ 112 | with self.lock: 113 | env = EnvSing.get_instance() 114 | try: 115 | msg = (datetime.now().isoformat() + " ({0}/{1}): {2} \n").format( 116 | self.partition_id, self.task_attempt, log_msg 117 | ) 118 | if jupyter: 119 | jupyter_log = str(self.partition_id) + ": " + log_msg 120 | if self.trial_fd: 121 | self.trial_fd.write(env.str_or_byte(msg)) 122 | self.logs = self.logs + jupyter_log + "\n" 123 | else: 124 | self.fd.write(env.str_or_byte(msg)) 125 | if self.trial_fd: 126 | self.trial_fd.write(env.str_or_byte(msg)) 127 | self.print_executor(msg) 128 | # Throws ValueError when operating on closed HDFS file object 129 | # Throws AttributeError when calling file ops on NoneType object 130 | except (IOError, ValueError, AttributeError) as e: 131 | self.fd.write( 132 | env.str_or_byte( 133 | "An error occurred while writing logs: {}".format(e) 134 | ) 135 | ) 136 | 137 | def get_data(self): 138 | """Returns the metric and logs to be sent to the experiment driver.""" 139 | with self.lock: 140 | log_to_send = self.logs 141 | self.logs = "" 142 | return self.metric, self.step, log_to_send 143 | 144 | def reset(self): 145 | """ 146 | Resets the reporter to the initial state in order to start a new 147 | trial. 148 | """ 149 | with self.lock: 150 | self.metric = None 151 | self.step = -1 152 | self.stop = False 153 | self.trial_id = None 154 | self.fd.flush() 155 | self.trial_fd.close() 156 | self.trial_fd = None 157 | self.trial_log_file = None 158 | 159 | def early_stop(self): 160 | with self.lock: 161 | if self.metric is not None: 162 | self.stop = True 163 | 164 | def get_trial_id(self): 165 | with self.lock: 166 | return self.trial_id 167 | 168 | def set_trial_id(self, trial_id): 169 | with self.lock: 170 | self.trial_id = trial_id 171 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/core/tf_patching/tf_modules.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | 
18 | def get_wrapped_model(model, strategy, is_chief):
19 |     """Build a wrapper class for the user-defined TensorFlow model.
20 | 
21 |     :param model: The class of the user-defined TensorFlow model.
22 |     :param strategy: The distribution strategy to be used for the training.
23 | 
24 |     :returns: The TensorflowModelWrapper class.
25 |     """
26 | 
27 |     class TensorflowModelWrapper(model):
28 |         """A wrapper around the user's TensorFlow model; __init__() is overridden in order to
29 |         build the model within the distribution strategy's scope for distributed training.
30 |         """
31 | 
32 |         def __init__(self, *args, **kwargs):
33 |             self.__strategy = strategy
34 |             self.is_chief = is_chief
35 |             with self.__strategy.scope():
36 |                 try:
37 |                     super().__init__(*args, **kwargs)
38 |                 except TypeError as e:
39 |                     raise TypeError(
40 |                         "The parameters passed to TensorflowConfig (model_parameters) "
41 |                         "do not correspond to the parameters defined in your model "
42 |                         "constructor."
43 |                     ) from e
44 | 
45 |     return TensorflowModelWrapper
46 | 
--------------------------------------------------------------------------------
/maggy/earlystop/__init__.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from maggy.earlystop import abstractearlystop, medianrule, nostop
18 | 
19 | AbstractEarlyStop = abstractearlystop.AbstractEarlyStop
20 | MedianStoppingRule = medianrule.MedianStoppingRule
21 | NoStoppingRule = nostop.NoStoppingRule
22 | 
23 | __all__ = ["AbstractEarlyStop", "MedianStoppingRule", "NoStoppingRule"]
24 | 
--------------------------------------------------------------------------------
/maggy/earlystop/abstractearlystop.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2020 Logical Clocks AB
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | from abc import ABC, abstractmethod
18 | 
19 | 
20 | class AbstractEarlyStop(ABC):
21 |     """An abstract class to implement custom early stopping criteria."""
22 | 
23 |     @staticmethod
24 |     @abstractmethod
25 |     def earlystop_check(to_check, finalized_trials, direction):
26 |         """An abstract static method that needs to be implemented with a custom
27 |         early stopping criterion.
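
        A minimal sketch of a custom rule (illustrative only; like
        ``NoStoppingRule``, it never stops a trial):

        >>> class MyRule(AbstractEarlyStop):
        ...     @staticmethod
        ...     def earlystop_check(to_check, finalized_trials, direction):
        ...         return None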
28 | 29 | The function is called internally in the user specified interval 30 | with three arguments. It is necessary to add these to the function 31 | definition. 32 | 33 | :param to_check: A dictionary of currently running 34 | trials, where the key is the `trial_id` and values are Trial objects. 35 | :type to_check: dictionary 36 | :param finalized_trials: A list of finalized Trial objects. 37 | :type finalized_trials: list 38 | :param direction: A string describing the search objective, i.e. 'min' 39 | or 'max'. 40 | :type direction: str 41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /maggy/earlystop/medianrule.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import statistics 18 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 19 | 20 | 21 | class MedianStoppingRule(AbstractEarlyStop): 22 | """The Median Stopping Rule implements the simple strategy of stopping a 23 | trial if its performance falls below the median of other trials at similar 24 | points in time. 25 | """ 26 | 27 | @staticmethod 28 | def earlystop_check(to_check, finalized_trials, direction): 29 | 30 | results = [] 31 | median = None 32 | 33 | # count step from zero so it can be used as index for array 34 | step = len(to_check.metric_history) 35 | 36 | if step > 0: 37 | 38 | for fin_trial in finalized_trials: 39 | 40 | if len(fin_trial.metric_history) >= step: 41 | avg = sum(fin_trial.metric_history[:step]) / float(step) 42 | results.append(avg) 43 | 44 | try: 45 | median = statistics.median(results) 46 | except statistics.StatisticsError as e: 47 | raise Exception( 48 | "Warning: StatisticsError when calling early stop method\n{}".format( 49 | e 50 | ) 51 | ) 52 | 53 | if median is not None: 54 | if direction == "max": 55 | if max(to_check.metric_history) < median: 56 | return to_check.trial_id 57 | elif direction == "min": 58 | if min(to_check.metric_history) > median: 59 | return to_check.trial_id 60 | return None 61 | -------------------------------------------------------------------------------- /maggy/earlystop/nostop.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.earlystop.abstractearlystop import AbstractEarlyStop 18 | 19 | 20 | class NoStoppingRule(AbstractEarlyStop): 21 | """The no stopping rule never stops any trials early.""" 22 | 23 | @staticmethod 24 | def earlystop_check(to_check, finalized_trials, direction): 25 | return None 26 | -------------------------------------------------------------------------------- /maggy/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/experiment/experiment.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from typing import Callable 18 | from maggy.config import LagomConfig, BaseConfig 19 | 20 | 21 | def lagom(train_fn: Callable, config: LagomConfig = None) -> dict: 22 | 23 | """Entry point for Maggy experiment, this function passes the parameters to the lagom function 24 | depending whether the kernel is pyspark or python. 25 | **lagom** is a Swedish word meaning "just the right amount". 26 | 27 | :param train_fn: User defined experiment containing the model training. 28 | :param config: An experiment configuration. For more information, see config. 29 | 30 | :returns: The experiment results as a dict. 31 | """ 32 | from maggy.experiment import experiment_python 33 | from maggy.experiment import experiment_pyspark 34 | from maggy.core import config as maggyconfig 35 | 36 | if config is None: 37 | config = BaseConfig( 38 | name="maggy_experiment", 39 | description="experiment without config object", 40 | hb_interval=1, 41 | ) 42 | if maggyconfig.is_spark_available(): 43 | return experiment_pyspark.lagom(train_fn, config) 44 | else: 45 | return experiment_python.lagom(train_fn, config) 46 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_pyspark.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import time 28 | from functools import singledispatch 29 | from typing import Callable 30 | 31 | from maggy import util 32 | from maggy.core.environment.singleton import EnvSing 33 | from maggy.config import * 34 | from maggy.core.experiment_driver import HyperparameterOptDriver, AblationDriver 35 | 36 | 37 | APP_ID = None 38 | RUNNING = False 39 | RUN_ID = 1 40 | EXPERIMENT_JSON = {} 41 | 42 | 43 | def lagom(train_fn: Callable, config: LagomConfig) -> dict: 44 | """Launches a maggy experiment, which depending on 'config' can either 45 | be a hyperparameter optimization, an ablation study experiment or distributed 46 | training. Given a search space, objective and a model training procedure `train_fn` 47 | (black-box function), an experiment is the whole process of finding the 48 | best hyperparameter combination in the search space, optimizing the 49 | black-box function. Currently maggy supports random search and a median 50 | stopping rule. 51 | **lagom** is a Swedish word meaning "just the right amount". 52 | 53 | :param train_fn: User defined experiment containing the model training. 54 | :param config: An experiment configuration. For more information, see config. 55 | 56 | :returns: The experiment results as a dict. 57 | """ 58 | global APP_ID 59 | global RUNNING 60 | global RUN_ID 61 | job_start = time.time() 62 | try: 63 | if RUNNING: 64 | raise RuntimeError("An experiment is currently running.") 65 | RUNNING = True 66 | spark_context = util.find_spark().sparkContext 67 | APP_ID = str(spark_context.applicationId) 68 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 69 | EnvSing.get_instance().set_app_id(APP_ID) 70 | driver = lagom_driver(config, APP_ID, RUN_ID) 71 | return driver.run_experiment(train_fn, config) 72 | except: # noqa: E722 73 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 74 | raise 75 | finally: 76 | # Clean up spark jobs 77 | RUN_ID += 1 78 | RUNNING = False 79 | util.find_spark().sparkContext.setJobGroup("", "") 80 | 81 | 82 | @singledispatch 83 | def lagom_driver(config, app_id: int, run_id: int) -> None: 84 | """Dispatcher function for the experiment driver. 85 | 86 | Initializes the appropriate driver according to the config. 87 | 88 | :raises TypeError: Only gets called if no fitting config was found and 89 | raises an error. 90 | """ 91 | raise TypeError( 92 | "Invalid config type! 
LagomConfig is expected to be of type {}, {}, {} or {}, \ 93 | but is of type {}".format( 94 | HyperparameterOptConfig, 95 | AblationConfig, 96 | TorchDistributedConfig, 97 | TfDistributedConfig, 98 | type(config), 99 | ) 100 | ) 101 | 102 | 103 | @lagom_driver.register(HyperparameterOptConfig) 104 | def _( 105 | config: HyperparameterOptConfig, app_id: int, run_id: int 106 | ) -> HyperparameterOptDriver: 107 | return HyperparameterOptDriver(config, app_id, run_id) 108 | 109 | 110 | @lagom_driver.register(AblationConfig) 111 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 112 | return AblationDriver(config, app_id, run_id) 113 | 114 | 115 | @lagom_driver.register(TorchDistributedConfig) 116 | # Lazy import of TorchDistributedTrainingDriver to avoid Torch import until necessary 117 | def _( 118 | config: TorchDistributedConfig, app_id: int, run_id: int 119 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 120 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 121 | TorchDistributedTrainingDriver, 122 | ) 123 | 124 | return TorchDistributedTrainingDriver(config, app_id, run_id) 125 | 126 | 127 | @lagom_driver.register(TfDistributedConfig) 128 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 129 | def _( 130 | config: TfDistributedConfig, app_id: int, run_id: int 131 | ) -> "TfDistributedTrainingDriver": # noqa: F821 132 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 133 | TfDistributedTrainingDriver, 134 | ) 135 | 136 | return TfDistributedTrainingDriver(config, app_id, run_id) 137 | 138 | 139 | @lagom_driver.register(LagomConfig) 140 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 141 | def _(config: LagomConfig, app_id: int, run_id: int) -> "BaseDriver": # noqa: F821 142 | from maggy.core.experiment_driver.base_driver import ( 143 | BaseDriver, 144 | ) 145 | 146 | return BaseDriver(config, app_id, run_id) 147 | 148 | 149 | def _exception_handler(duration: int) -> None: 150 | """Handles exceptions during execution of an experiment. 151 | 152 | :param duration: Duration of the experiment until exception in milliseconds 153 | """ 154 | try: 155 | global RUNNING 156 | global EXPERIMENT_JSON 157 | if RUNNING: 158 | EXPERIMENT_JSON["state"] = "FAILED" 159 | EXPERIMENT_JSON["duration"] = duration 160 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 161 | EnvSing.get_instance().attach_experiment_xattr( 162 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 163 | ) 164 | except Exception as err: 165 | util.log(err) 166 | 167 | 168 | def _exit_handler() -> None: 169 | """Handles jobs killed by the user.""" 170 | try: 171 | global RUNNING 172 | global EXPERIMENT_JSON 173 | if RUNNING: 174 | EXPERIMENT_JSON["status"] = "KILLED" 175 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 176 | EnvSing.get_instance().attach_experiment_xattr( 177 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 178 | ) 179 | except Exception as err: 180 | util.log(err) 181 | 182 | 183 | atexit.register(_exit_handler) 184 | -------------------------------------------------------------------------------- /maggy/experiment/experiment_python.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Experiment module used for running asynchronous optimization tasks. 19 | The programming model is that you wrap the code containing the model 20 | training inside a wrapper function. 21 | Inside that wrapper function provide all imports and parts that make up your 22 | experiment, see examples below. Whenever a function to run an experiment is 23 | invoked it is also registered in the Experiments service along with the 24 | provided information. 25 | """ 26 | import atexit 27 | import calendar 28 | import time 29 | from functools import singledispatch 30 | from typing import Callable 31 | 32 | from maggy import util 33 | from maggy.core.environment.singleton import EnvSing 34 | from maggy.config import * 35 | from maggy.core.experiment_driver import ( 36 | HyperparameterOptDriver, 37 | AblationDriver, 38 | BaseDriver, 39 | ) 40 | 41 | 42 | APP_ID = None 43 | RUNNING = False 44 | RUN_ID = 1 45 | EXPERIMENT_JSON = {} 46 | 47 | 48 | def lagom(train_fn: Callable, config) -> dict: 49 | """Launches a maggy experiment, which depending on 'config' can either 50 | be a hyperparameter optimization, an ablation study experiment or distributed 51 | training. Given a search space, objective and a model training procedure `train_fn` 52 | (black-box function), an experiment is the whole process of finding the 53 | best hyperparameter combination in the search space, optimizing the 54 | black-box function. Currently maggy supports random search and a median 55 | stopping rule. 56 | **lagom** is a Swedish word meaning "just the right amount". 57 | 58 | :param train_fn: User defined experiment containing the model training. 59 | :param config: An experiment configuration. For more information, see config. 60 | 61 | :returns: The experiment results as a dict. 62 | """ 63 | global APP_ID 64 | global RUNNING 65 | global RUN_ID 66 | job_start = time.time() 67 | try: 68 | if RUNNING: 69 | raise RuntimeError("An experiment is currently running.") 70 | RUNNING = True 71 | APP_ID = str(calendar.timegm(time.gmtime())) 72 | APP_ID = "application_" + APP_ID + "_0001" 73 | APP_ID, RUN_ID = util.register_environment(APP_ID, RUN_ID) 74 | driver = lagom_driver(config, APP_ID, RUN_ID) 75 | return driver.run_experiment(train_fn, config) 76 | except: # noqa: E722 77 | _exception_handler(util.seconds_to_milliseconds(time.time() - job_start)) 78 | raise 79 | finally: 80 | # Clean up spark jobs 81 | RUN_ID += 1 82 | RUNNING = False 83 | 84 | 85 | @singledispatch 86 | def lagom_driver(config, app_id: int, run_id: int) -> None: 87 | """Dispatcher function for the experiment driver. 88 | 89 | Initializes the appropriate driver according to the config. 90 | 91 | :raises TypeError: Only gets called if no fitting config was found and 92 | raises an error. 93 | """ 94 | raise TypeError( 95 | "Invalid config type! 
Config is expected to be of type {}, {}, {}, {} or {}, \ 96 | but is of type {}".format( 97 | HyperparameterOptConfig, 98 | AblationConfig, 99 | TorchDistributedConfig, 100 | TfDistributedConfig, 101 | BaseConfig, 102 | type(config), 103 | ) 104 | ) 105 | 106 | 107 | @lagom_driver.register(HyperparameterOptConfig) 108 | def _( 109 | config: HyperparameterOptConfig, app_id: int, run_id: int 110 | ) -> HyperparameterOptDriver: 111 | return HyperparameterOptDriver(config, app_id, run_id) 112 | 113 | 114 | @lagom_driver.register(AblationConfig) 115 | def _(config: AblationConfig, app_id: int, run_id: int) -> AblationDriver: 116 | return AblationDriver(config, app_id, run_id) 117 | 118 | 119 | @lagom_driver.register(TorchDistributedConfig) 120 | # Lazy import of DistributedDriver to avoid Torch import until necessary 121 | def _( 122 | config: TorchDistributedConfig, app_id: int, run_id: int 123 | ) -> "TorchDistributedTrainingDriver": # noqa: F821 124 | from maggy.core.experiment_driver.torch_distributed_training_driver import ( 125 | TorchDistributedTrainingDriver, 126 | ) 127 | 128 | return TorchDistributedTrainingDriver(config, app_id, run_id) 129 | 130 | 131 | @lagom_driver.register(TfDistributedConfig) 132 | # Lazy import of TfDistributedTrainingDriver to avoid Tensorflow import until necessary 133 | def _( 134 | config: TfDistributedConfig, app_id: int, run_id: int 135 | ) -> "TfDistributedTrainingDriver": # noqa: F821 136 | from maggy.core.experiment_driver.tf_distributed_training_driver import ( 137 | TfDistributedTrainingDriver, 138 | ) 139 | 140 | return TfDistributedTrainingDriver(config, app_id, run_id) 141 | 142 | 143 | @lagom_driver.register(BaseConfig) 144 | # Lazy import of BaseConfig 145 | def _(config: BaseConfig, app_id: int, run_id: int) -> BaseDriver: 146 | from maggy.core.experiment_driver.base_driver import ( 147 | BaseDriver, 148 | ) 149 | 150 | return BaseDriver(config, app_id, run_id) 151 | 152 | 153 | @lagom_driver.register(LagomConfig) 154 | # Lazy import of LagomConfig 155 | def _(config: LagomConfig, app_id: int, run_id: int) -> BaseDriver: 156 | from maggy.core.experiment_driver.base_driver import ( 157 | BaseDriver, 158 | ) 159 | 160 | return BaseDriver(config, app_id, run_id) 161 | 162 | 163 | def _exception_handler(duration: int) -> None: 164 | """Handles exceptions during execution of an experiment. 
165 | 166 | :param duration: Duration of the experiment until exception in milliseconds 167 | """ 168 | try: 169 | global RUNNING 170 | global EXPERIMENT_JSON 171 | if RUNNING: 172 | EXPERIMENT_JSON["state"] = "FAILED" 173 | EXPERIMENT_JSON["duration"] = duration 174 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 175 | EnvSing.get_instance().attach_experiment_xattr( 176 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 177 | ) 178 | except Exception as err: 179 | util.log(err) 180 | 181 | 182 | def _exit_handler() -> None: 183 | """Handles jobs killed by the user.""" 184 | try: 185 | global RUNNING 186 | global EXPERIMENT_JSON 187 | if RUNNING: 188 | EXPERIMENT_JSON["status"] = "KILLED" 189 | exp_ml_id = APP_ID + "_" + str(RUN_ID) 190 | EnvSing.get_instance().attach_experiment_xattr( 191 | exp_ml_id, EXPERIMENT_JSON, "FULL_UPDATE" 192 | ) 193 | except Exception as err: 194 | util.log(err) 195 | 196 | 197 | atexit.register(_exit_handler) 198 | -------------------------------------------------------------------------------- /maggy/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer import abstractoptimizer, randomsearch, asha, singlerun, gridsearch 18 | 19 | AbstractOptimizer = abstractoptimizer.AbstractOptimizer 20 | RandomSearch = randomsearch.RandomSearch 21 | Asha = asha.Asha 22 | SingleRun = singlerun.SingleRun 23 | GridSearch = gridsearch.GridSearch 24 | 25 | __all__ = ["AbstractOptimizer", "RandomSearch", "Asha", "SingleRun", "GridSearch"] 26 | -------------------------------------------------------------------------------- /maggy/optimizer/asha.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import math 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.trial import Trial 21 | 22 | 23 | class Asha(AbstractOptimizer): 24 | """Implements the Asynchronous Successiv Halving Algorithm - ASHA 25 | (https://arxiv.org/abs/1810.05934). ASHA needs three additional parameters: 26 | 'reduction_factor', 'resource_min' and 'resource_max'. To set custom values 27 | for these, initialize the optimizer first and pass it as an argument to 28 | 'experiment.lagom()'. 
29 | 30 | Sample usage: 31 | 32 | >>> # Import Asha optimizer 33 | >>> from maggy.optimizer import Asha 34 | >>> # Instantiate the optimizer with custom arguments 35 | >>> asha = Asha(3, 1, 9) 36 | >>> experiment.lagom(..., optimizer=asha, ...) 37 | """ 38 | 39 | def __init__(self, reduction_factor=2, resource_min=1, resource_max=4): 40 | super().__init__() 41 | 42 | if reduction_factor < 2 or not isinstance(reduction_factor, int): 43 | raise Exception( 44 | "Can't initialize ASHA optimizer. 'reduction_factor'" 45 | + "has to be an integer equal to or larger than 2: {}".format( 46 | reduction_factor 47 | ) 48 | ) 49 | else: 50 | self.reduction_factor = reduction_factor 51 | 52 | if not isinstance(resource_min, int): 53 | raise Exception( 54 | "Can't initialize ASHA optimizer. 'resource_min'" 55 | + "not of type INTEGER." 56 | ) 57 | if not isinstance(resource_max, int): 58 | raise Exception( 59 | "Can't initialize ASHA optimizer. 'resource_max'" 60 | + "not of type INTEGER." 61 | ) 62 | if resource_min >= resource_max: 63 | raise Exception( 64 | "Can't initialize ASHA optimizer. 'resource_min' is larger" 65 | + "than 'resource_max'." 66 | ) 67 | 68 | self.resource_min = resource_min 69 | self.resource_max = resource_max 70 | 71 | def initialize(self): 72 | 73 | # maps rung index k to trials in that rung 74 | self.rungs = {0: []} 75 | # maps rung index k to trial ids of trials that were promoted 76 | self.promoted = {0: []} 77 | 78 | self.max_rung = int( 79 | math.floor( 80 | math.log(self.resource_max / self.resource_min, self.reduction_factor) 81 | ) 82 | ) 83 | 84 | assert self.num_trials >= self.reduction_factor ** (self.max_rung + 1) 85 | 86 | def get_suggestion(self, trial=None): 87 | 88 | if trial is not None: 89 | # stopping criterium: one trial in max rung 90 | if self.max_rung in self.rungs: 91 | # return None to signal end to experiment driver 92 | return None 93 | 94 | # for each rung 95 | for k in range(self.max_rung - 1, -1, -1): 96 | # if rung doesn't exist yet go one lower 97 | if k not in self.rungs: 98 | continue 99 | 100 | # get top_k 101 | rung_finished = len( 102 | [x for x in self.rungs[k] if x.status == Trial.FINALIZED] 103 | ) 104 | 105 | if (rung_finished // self.reduction_factor) - len( 106 | self.promoted.get(k, []) 107 | ) > 0: 108 | candidates = self._top_k( 109 | k, (rung_finished // self.reduction_factor) 110 | ) 111 | else: 112 | candidates = [] 113 | 114 | # if there are no candidates, check one rung below 115 | if not candidates: 116 | continue 117 | 118 | # select all that haven't been promoted yet 119 | promotable = [ 120 | t for t in candidates if t.trial_id not in self.promoted.get(k, []) 121 | ] 122 | 123 | nr_promotable = len(promotable) 124 | if nr_promotable >= 1: 125 | new_rung = k + 1 126 | # sorted in decending order, take highest -> index 0 127 | old_trial = promotable[0] 128 | # make copy of params to be able to change resource 129 | params = old_trial.params.copy() 130 | params["budget"] = self.resource_min * ( 131 | self.reduction_factor**new_rung 132 | ) 133 | promote_trial = Trial(params) 134 | 135 | # open new rung if not exists 136 | if new_rung in self.rungs: 137 | self.rungs[new_rung].append(promote_trial) 138 | else: 139 | self.rungs[new_rung] = [promote_trial] 140 | 141 | # remember promoted trial 142 | if k in self.promoted: 143 | self.promoted[k].append(old_trial.trial_id) 144 | else: 145 | self.promoted[k] = [old_trial.trial_id] 146 | 147 | return promote_trial 148 | 149 | # else return random configuration in base rung 150 | 
params = self.searchspace.get_random_parameter_values(1)[0] 151 | # set resource to minimum 152 | params["budget"] = self.resource_min 153 | to_return = Trial(params) 154 | # add to bottom rung 155 | self.rungs[0].append(to_return) 156 | return to_return 157 | 158 | def finalize_experiment(self, trials): 159 | return 160 | 161 | def _top_k(self, rung_k, number): 162 | """Find top-`number` trials in `rung_k`.""" 163 | if number > 0: 164 | filtered = [x for x in self.rungs[rung_k] if x.status == Trial.FINALIZED] 165 | filtered.sort(key=lambda x: x.final_metric, reverse=True) 166 | # return top k trials if finalized 167 | return filtered[:number] 168 | else: 169 | return [] 170 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.optimizer.bayes import base, gp, tpe 18 | 19 | BaseAsyncBO = base.BaseAsyncBO 20 | GP = gp.GP 21 | TPE = tpe.TPE 22 | 23 | __all__ = [ 24 | "TPE", 25 | "BaseAsyncBO", 26 | "GP", 27 | ] 28 | -------------------------------------------------------------------------------- /maggy/optimizer/bayes/acquisitions.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC 18 | from abc import abstractmethod 19 | 20 | import numpy as np 21 | from skopt.acquisition import _gaussian_acquisition 22 | from skopt.acquisition import gaussian_acquisition_1D 23 | 24 | 25 | class AbstractAcquisitionFunction(ABC): 26 | @staticmethod 27 | @abstractmethod 28 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 29 | """evaluates acquisition function at given points 30 | 31 | :param X: Values where the acquisition function should be computed. shape = (n_locations, n_hparams) 32 | :type X: np.ndarray 33 | :param surrogate_model: the surrogate model of the bayesian optimizer. 34 | :type surrogate_model: GaussianProcessRegressor 35 | :param y_opt: currently best observed value 36 | :type y_opt: float 37 | :param acq_func_kwargs: additional arguments for the acquisition function 38 | :type acq_func_kwargs: dict|None 39 | :return: Acquisition function values computed at X. 
shape = (n_locations,) 40 | :rtype: np.ndarray 41 | """ 42 | pass 43 | 44 | @staticmethod 45 | @abstractmethod 46 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 47 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 48 | This is because lbfgs allows only 1-D input. 49 | 50 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 51 | :type x: np.ndarray 52 | :param surrogate_model: the surrogate model of the bayesian optimizer. 53 | :type surrogate_model: GaussianProcessRegressor 54 | :param y_opt: currently best observed value 55 | :type y_opt: float 56 | :param acq_func_kwargs: additional arguments for the acquisition function 57 | :type acq_func_kwargs: dict|None 58 | :return: tuple containing two arrays. the first holds the evaluated values of the acquisition function at value 59 | x; shape = (1,) . the second holds the gradients; shape = (n_hparams,). 60 | :rtype: tuple 61 | """ 62 | pass 63 | 64 | def name(self): 65 | return str(self.__class__.__name__) 66 | 67 | 68 | class GaussianProcess_EI(AbstractAcquisitionFunction): 69 | """xi in acq_func_kwargs""" 70 | 71 | @staticmethod 72 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 73 | return _gaussian_acquisition( 74 | X=X, 75 | model=surrogate_model, 76 | y_opt=y_opt, 77 | acq_func="EI", 78 | acq_func_kwargs=acq_func_kwargs, 79 | ) 80 | 81 | @staticmethod 82 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 83 | return gaussian_acquisition_1D( 84 | X=x, 85 | model=surrogate_model, 86 | y_opt=y_opt, 87 | acq_func="EI", 88 | acq_func_kwargs=acq_func_kwargs, 89 | ) 90 | 91 | 92 | class GaussianProcess_PI(AbstractAcquisitionFunction): 93 | @staticmethod 94 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 95 | return _gaussian_acquisition( 96 | X=X, 97 | model=surrogate_model, 98 | y_opt=y_opt, 99 | acq_func="PI", 100 | acq_func_kwargs=acq_func_kwargs, 101 | ) 102 | 103 | @staticmethod 104 | def evaluate_1_d(X, surrogate_model, y_opt, acq_func_kwargs=None): 105 | return gaussian_acquisition_1D( 106 | X=X, 107 | model=surrogate_model, 108 | y_opt=y_opt, 109 | acq_func="PI", 110 | acq_func_kwargs=acq_func_kwargs, 111 | ) 112 | 113 | 114 | class GaussianProcess_LCB(AbstractAcquisitionFunction): 115 | """kappa in acq_func_kwargs""" 116 | 117 | @staticmethod 118 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 119 | return _gaussian_acquisition( 120 | X=X, 121 | model=surrogate_model, 122 | y_opt=y_opt, 123 | acq_func="LCB", 124 | acq_func_kwargs=acq_func_kwargs, 125 | ) 126 | 127 | @staticmethod 128 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 129 | return gaussian_acquisition_1D( 130 | X=x, 131 | model=surrogate_model, 132 | y_opt=y_opt, 133 | acq_func="LCB", 134 | acq_func_kwargs=acq_func_kwargs, 135 | ) 136 | 137 | 138 | class GaussianProcess_UCB(AbstractAcquisitionFunction): 139 | @staticmethod 140 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 141 | raise NotImplementedError 142 | 143 | @staticmethod 144 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 145 | raise NotImplementedError 146 | 147 | 148 | class TPE_EI(AbstractAcquisitionFunction): 149 | @staticmethod 150 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 151 | raise NotImplementedError 152 | 153 | @staticmethod 154 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 155 | raise NotImplementedError 156 | 157 | 158 | class 
AsyTS(AbstractAcquisitionFunction): 159 | @staticmethod 160 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 161 | return surrogate_model.sample_y(X).reshape( 162 | X.shape[0], 163 | ) 164 | 165 | @staticmethod 166 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 167 | """A wrapper around the acquisition function that is called by fmin_l_bfgs_b. 168 | This is because lbfgs allows only 1-D input. 169 | 170 | :param x: value where acquisition function should be evaluated. shape=(n_hparams, ) 171 | :type x: np.ndarray 172 | :param surogate_model: the surrogate model of the bayesian optimizer. 173 | :type surogate_model: GaussianProcessRegressor 174 | :param y_opt: currently best observed value 175 | :type y_opt: float 176 | :param acq_func_kwargs: additional arguments for the acquisition function 177 | :type acq_func_kwargs: dict|None 178 | :return: values of the acquisition function at value x. shape = (1,) 179 | :rtype: np.ndarray 180 | """ 181 | return surrogate_model.sample_y(np.expand_dims(x, axis=0)).reshape( 182 | 1, 183 | ) 184 | 185 | 186 | class HLP(AbstractAcquisitionFunction): 187 | @staticmethod 188 | def evaluate(X, surrogate_model, y_opt, acq_func_kwargs=None): 189 | raise NotImplementedError 190 | 191 | @staticmethod 192 | def evaluate_1_d(x, surrogate_model, y_opt, acq_func_kwargs=None): 193 | raise NotImplementedError 194 | -------------------------------------------------------------------------------- /maggy/optimizer/gridsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import itertools 18 | 19 | from maggy import Searchspace 20 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 21 | 22 | 23 | class GridSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | self._validate_searchspace(self.searchspace) 30 | # create all trials ahead of time 31 | self.config_buffer = self._grid_params(self.searchspace) 32 | 33 | @classmethod 34 | def get_num_trials(cls, searchspace): 35 | """For grid search the number of trials is determined by the size of the 36 | cartisian product, depending on the user-set number of parameters and values 37 | 38 | This method is duplicating part of the code in the `initialize()` mainly to keep 39 | the flow of things the same as for other optimizers, where the user sets only 40 | the number of trials to evaluate. 41 | """ 42 | cls._validate_searchspace(searchspace) 43 | return len(cls._grid_params(searchspace)) 44 | 45 | def get_suggestion(self, trial=None): 46 | # sampling routine for randomsearch + pruner 47 | if self.pruner: 48 | raise NotImplementedError( 49 | "Grid search in combination with trial pruning " 50 | "is currently not supported." 
51 | ) 52 | elif self.config_buffer: 53 | run_budget = 0 54 | next_trial_params = self.config_buffer.pop() 55 | next_trial = self.create_trial( 56 | hparams=next_trial_params, 57 | sample_type="grid", 58 | run_budget=run_budget, 59 | ) 60 | 61 | self._log( 62 | "start trial {}: {}, {} \n".format( 63 | next_trial.trial_id, next_trial.params, next_trial.info_dict 64 | ) 65 | ) 66 | 67 | return next_trial 68 | else: 69 | return None 70 | 71 | def finalize_experiment(self, trials): 72 | return 73 | 74 | @staticmethod 75 | def _grid_params(searchspace): 76 | return_list = [] 77 | for hparams in itertools.product( 78 | *[item["values"] for item in searchspace.items()] 79 | ): 80 | return_list.append(searchspace.list_to_dict(hparams)) 81 | return return_list 82 | 83 | @staticmethod 84 | def _validate_searchspace(searchspace): 85 | if ( 86 | Searchspace.DOUBLE in searchspace.names().values() 87 | or Searchspace.INTEGER in searchspace.names().values() 88 | ): 89 | raise NotImplementedError( 90 | "Searchspace can only contain `discrete` or `categorical` " 91 | "hyperparameters for grid search." 92 | ) 93 | -------------------------------------------------------------------------------- /maggy/optimizer/randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | import time 17 | from copy import deepcopy 18 | 19 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 20 | from maggy.searchspace import Searchspace 21 | 22 | 23 | class RandomSearch(AbstractOptimizer): 24 | def __init__(self, **kwargs): 25 | super().__init__(**kwargs) 26 | self.config_buffer = [] 27 | 28 | def initialize(self): 29 | 30 | if ( 31 | Searchspace.DOUBLE not in self.searchspace.names().values() 32 | and Searchspace.INTEGER not in self.searchspace.names().values() 33 | ): 34 | raise NotImplementedError( 35 | "Searchspace needs at least one continuous parameter for random search." 
36 | ) 37 | 38 | self.config_buffer = self.searchspace.get_random_parameter_values( 39 | self.num_trials 40 | ) 41 | 42 | def get_suggestion(self, trial=None): 43 | self._log("### start get_suggestion ###") 44 | self.sampling_time_start = time.time() 45 | 46 | # sampling routine for randomsearch + pruner 47 | if self.pruner: 48 | next_trial_info = self.pruner.pruning_routine() 49 | if next_trial_info == "IDLE": 50 | self._log( 51 | "Worker is IDLE and has to wait until a new trial can be scheduled" 52 | ) 53 | return "IDLE" 54 | elif next_trial_info is None: 55 | # experiment is finished 56 | self._log("Experiment has finished") 57 | return None 58 | elif next_trial_info["trial_id"]: 59 | # copy hparams of given promoted trial and start new trial with it 60 | parent_trial_id = next_trial_info["trial_id"] 61 | parent_trial_hparams = deepcopy( 62 | self.get_hparams_dict(trial_ids=parent_trial_id)[parent_trial_id] 63 | ) 64 | # update trial info dict and create new trial object 65 | next_trial = self.create_trial( 66 | hparams=parent_trial_hparams, 67 | sample_type="promoted", 68 | run_budget=next_trial_info["budget"], 69 | ) 70 | self._log("use hparams from promoted trial {}".format(parent_trial_id)) 71 | else: 72 | # start sampling procedure with given budget 73 | parent_trial_id = None 74 | run_budget = next_trial_info["budget"] 75 | hparams = self.searchspace.get_random_parameter_values(1)[0] 76 | next_trial = self.create_trial( 77 | hparams=hparams, sample_type="random", run_budget=run_budget 78 | ) 79 | 80 | # report new trial id to pruner 81 | self.pruner.report_trial( 82 | original_trial_id=parent_trial_id, new_trial_id=next_trial.trial_id 83 | ) 84 | 85 | self._log( 86 | "start trial {}: {}. info_dict: {} \n".format( 87 | next_trial.trial_id, next_trial.params, next_trial.info_dict 88 | ) 89 | ) 90 | return next_trial 91 | 92 | # sampling routine for pure random search 93 | elif self.config_buffer: 94 | run_budget = 0 95 | next_trial_params = self.config_buffer.pop() 96 | next_trial = self.create_trial( 97 | hparams=next_trial_params, 98 | sample_type="random", 99 | run_budget=run_budget, 100 | ) 101 | 102 | self._log( 103 | "start trial {}: {}, {} \n".format( 104 | next_trial.trial_id, next_trial.params, next_trial.info_dict 105 | ) 106 | ) 107 | 108 | return next_trial 109 | else: 110 | return None 111 | 112 | def finalize_experiment(self, trials): 113 | return 114 | -------------------------------------------------------------------------------- /maggy/optimizer/singlerun.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from maggy.optimizer.abstractoptimizer import AbstractOptimizer 18 | from maggy.trial import Trial 19 | 20 | 21 | class SingleRun(AbstractOptimizer): 22 | def __init__(self): 23 | super().__init__() 24 | self.trial_buffer = [] 25 | 26 | def initialize(self): 27 | for _ in range(self.num_trials): 28 | self.trial_buffer.append(Trial({})) 29 | 30 | def get_suggestion(self, trial=None): 31 | if self.trial_buffer: 32 | return self.trial_buffer.pop() 33 | else: 34 | return None 35 | 36 | def finalize_experiment(self, trials): 37 | return 38 | -------------------------------------------------------------------------------- /maggy/pruner/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from maggy.pruner import hyperband, abstractpruner 18 | 19 | Hyperband = hyperband.Hyperband 20 | AbstractPruner = abstractpruner.AbstractPruner 21 | 22 | __all__ = ["Hyperband", "AbstractPruner"] 23 | -------------------------------------------------------------------------------- /maggy/pruner/abstractpruner.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from abc import ABC, abstractmethod 18 | from datetime import datetime 19 | 20 | from maggy.core.environment.singleton import EnvSing 21 | 22 | 23 | class AbstractPruner(ABC): 24 | def __init__(self, trial_metric_getter): 25 | """ 26 | :param trial_metric_getter: a function that returns a dict with `trial_id` as key and `metric` as value, 27 | with the lowest metric being the "best". 28 | Its only argument is `trial_ids`, which can be either a str for a single trial id or a list of trial ids. 29 | :type trial_metric_getter: function 30 | """ 31 | 32 | self.trial_metric_getter = trial_metric_getter 33 | 34 | # logger variables 35 | self.log_file = None 36 | self.fd = None 37 | 38 | @abstractmethod 39 | def pruning_routine(self): 40 | """ 41 | runs the pruning routine.
42 | interface to the `optimizer` 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def report_trial(self): 48 | """ 49 | hook for reporting trial id of created trial from optimizer to pruner 50 | """ 51 | pass 52 | 53 | @abstractmethod 54 | def finished(self): 55 | """ 56 | checks if experiment is finished 57 | """ 58 | pass 59 | 60 | @abstractmethod 61 | def num_trials(self): 62 | """ 63 | calculates the number of trials in the experiment 64 | 65 | :return: number of trials 66 | :rtype: int 67 | """ 68 | 69 | def name(self): 70 | return str(self.__class__.__name__) 71 | 72 | def initialize_logger(self, exp_dir): 73 | """Initialize logger of the pruner 74 | 75 | :param exp_dir: path of experiment directory 76 | :type exp_dir: str 77 | """ 78 | env = EnvSing.get_instance() 79 | # configure logger 80 | self.log_file = exp_dir + "/pruner.log" 81 | 82 | if not env.exists(self.log_file): 83 | env.dump("", self.log_file) 84 | self.fd = env.open_file(self.log_file, flags="w") 85 | self._log("Initialized Pruner Logger") 86 | 87 | def _log(self, msg): 88 | if self.fd and not self.fd.closed: 89 | msg = datetime.now().isoformat() + ": " + str(msg) 90 | self.fd.write(EnvSing.get_instance().str_or_byte(msg + "\n")) 91 | 92 | def _close_log(self): 93 | if not self.fd.closed: 94 | self.fd.flush() 95 | self.fd.close() 96 | -------------------------------------------------------------------------------- /maggy/tensorboard.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ 18 | Module to encapsulate functionality related to writing to the tensorboard 19 | log dir and programmatically structure the outputs. 20 | """ 21 | 22 | import tensorflow as tf 23 | from tensorboard.plugins.hparams import api as hp 24 | 25 | _tensorboard_dir = None 26 | 27 | 28 | def _register(trial_dir): 29 | global _tensorboard_dir 30 | _tensorboard_dir = trial_dir 31 | 32 | 33 | def logdir(): 34 | """Returns the path to the tensorboard log directory. 35 | 36 | Instead of hardcoding a log dir path in a training function, users should 37 | make use of this function call, which will programmatically create a folder 38 | structure for tensorboard to visualize the machine learning experiment.
39 | 40 | :return: Path of the log directory in HOPSFS 41 | :rtype: str 42 | """ 43 | global _tensorboard_dir 44 | return _tensorboard_dir 45 | 46 | 47 | def _create_hparams_config(searchspace): 48 | hparams = [] 49 | 50 | for key, val in searchspace.names().items(): 51 | if val == "DOUBLE": 52 | hparams.append( 53 | hp.HParam( 54 | key, 55 | hp.RealInterval( 56 | float(searchspace.get(key)[0]), float(searchspace.get(key)[1]) 57 | ), 58 | ) 59 | ) 60 | elif val == "INTEGER": 61 | hparams.append( 62 | hp.HParam( 63 | key, 64 | hp.IntInterval(searchspace.get(key)[0], searchspace.get(key)[1]), 65 | ) 66 | ) 67 | elif val == "DISCRETE": 68 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 69 | elif val == "CATEGORICAL": 70 | hparams.append(hp.HParam(key, hp.Discrete(searchspace.get(key)))) 71 | 72 | return hparams 73 | 74 | 75 | def _write_hparams_config(log_dir, searchspace): 76 | HPARAMS = _create_hparams_config(searchspace) 77 | METRICS = [ 78 | hp.Metric( 79 | "epoch_accuracy", 80 | group="validation", 81 | display_name="accuracy (val.)", 82 | ), 83 | hp.Metric( 84 | "epoch_loss", 85 | group="validation", 86 | display_name="loss (val.)", 87 | ), 88 | hp.Metric( 89 | "epoch_accuracy", 90 | group="train", 91 | display_name="accuracy (train)", 92 | ), 93 | hp.Metric( 94 | "epoch_loss", 95 | group="train", 96 | display_name="loss (train)", 97 | ), 98 | ] 99 | 100 | with tf.summary.create_file_writer(log_dir).as_default(): 101 | hp.hparams_config(hparams=HPARAMS, metrics=METRICS) 102 | 103 | 104 | def _write_hparams(hparams, trial_id): 105 | global _tensorboard_dir 106 | with tf.summary.create_file_writer(_tensorboard_dir).as_default(): 107 | hp.hparams(hparams, trial_id) 108 | -------------------------------------------------------------------------------- /maggy/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /maggy/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | """ pytest fixtures that can be reused across tests.
the filename needs to be conftest.py 18 | """ 19 | 20 | # make sure env variables are set correctly 21 | import findspark # this needs to be the first import 22 | 23 | findspark.init() 24 | 25 | import logging 26 | import pytest 27 | 28 | from pyspark import HiveContext 29 | from pyspark import SparkConf 30 | from pyspark import SparkContext 31 | from pyspark.streaming import StreamingContext 32 | 33 | 34 | def quiet_py4j(): 35 | """turn down spark logging for the test context""" 36 | logger = logging.getLogger("py4j") 37 | logger.setLevel(logging.WARN) 38 | 39 | 40 | def pytest_addoption(parser): 41 | parser.addoption( 42 | "--spark-master", 43 | action="store", 44 | default=None, 45 | help='spark-master: "spark://name.local:7077"', 46 | ) 47 | 48 | 49 | @pytest.fixture(scope="session") 50 | def sc(request): 51 | """fixture for creating a spark context 52 | Args: 53 | request: pytest.FixtureRequest object 54 | """ 55 | 56 | assert ( 57 | request.config.getoption("--spark-master") is not None 58 | ), 'No Spark Master Address provided, use --spark-master: "spark://host:port" ' 59 | 60 | conf = ( 61 | SparkConf() 62 | .setMaster(request.config.getoption("--spark-master")) 63 | .setAppName("pytest-pyspark-local-testing") 64 | .set("spark.dynamicAllocation.maxExecutors", 2) 65 | .set("spark.executor.instances", 2) 66 | ) 67 | scont = SparkContext(conf=conf) 68 | request.addfinalizer(lambda: scont.stop()) 69 | 70 | quiet_py4j() 71 | return scont 72 | 73 | 74 | @pytest.fixture(scope="session") 75 | def hive_context(sc): 76 | """fixture for creating a Hive Context. Creating a fixture enables it to be reused across all 77 | tests in a session 78 | Args: 79 | spark_context: spark_context fixture 80 | Returns: 81 | HiveContext for tests 82 | """ 83 | return HiveContext(sc) 84 | 85 | 86 | @pytest.fixture(scope="session") 87 | def streaming_context(sc): 88 | return StreamingContext(sc, 1) 89 | -------------------------------------------------------------------------------- /maggy/tests/test_maggy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | from maggy.searchspace import Searchspace 19 | from maggy.optimizer import RandomSearch 20 | 21 | # this allows using the fixture in all tests in this module 22 | pytestmark = pytest.mark.usefixtures("sc") 23 | 24 | 25 | def test_nr_executors(sc): 26 | 27 | executor_instances = int(sc._conf.get("spark.executor.instances")) 28 | expected_number = 2 29 | assert executor_instances == expected_number 30 | 31 | 32 | def test_random_search(sc): 33 | 34 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 35 | 36 | rs = RandomSearch() 37 | rs.searchspace = sp 38 | 39 | rs.num_trials = 5 40 | exp_result = {"argument_param": "DOUBLE"} 41 | 42 | assert sp.names() == exp_result 43 | assert rs.num_trials == 5 44 | assert rs.searchspace == sp 45 | -------------------------------------------------------------------------------- /maggy/tests/test_randomsearch.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | import tensorflow as tf 22 | from tensorflow import keras 23 | import numpy as np 24 | 25 | from maggy.searchspace import Searchspace 26 | from maggy.optimizer import RandomSearch 27 | from maggy import experiment 28 | from maggy.config import HyperparameterOptConfig, TfDistributedConfig 29 | 30 | # this allows using the fixture in all tests in this module 31 | pytestmark = pytest.mark.usefixtures("sc") 32 | 33 | 34 | def test_randomsearch_init(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 37 | 38 | rs = RandomSearch(5, sp, []) 39 | 40 | assert rs.num_trials == 5 41 | assert rs.searchspace == sp 42 | 43 | 44 | def test_randomsearch_initialize(): 45 | 46 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 47 | 48 | rs = RandomSearch(5, sp, []) 49 | 50 | rs.initialize() 51 | 52 | assert len(rs.trial_buffer) == 5 53 | 54 | 55 | def test_rs_initialize2(): 56 | 57 | sp = Searchspace(argument_param=("DISCRETE", [1, 5])) 58 | 59 | rs = RandomSearch() 60 | rs.searchspace = sp 61 | 62 | with pytest.raises(NotImplementedError) as excinfo: 63 | rs.initialize() 64 | assert "Searchspace needs at least one continuous parameter" in str(excinfo.value) 65 | 66 | 67 | def test_randomsearch(sc): 68 | def train(model, train_set, test_set, hparams, reporter): 69 | 70 | if "argument_param" in hparams.keys(): 71 | print( 72 | "Entered train function with param {}".format(hparams["argument_param"]) 73 | ) 74 | 75 | for i in range(5): 76 | acc = i + random.random() 77 | reporter.broadcast(metric=acc) 78 | reporter.log("Metric: {}".format(acc)) 79 | 80 | # do something with HP. 
81 | if "argument_param" in hparams.keys(): 82 | time.sleep(hparams["argument_param"]) 83 | 84 | return acc 85 | 86 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 87 | 88 | config = HyperparameterOptConfig( 89 | searchspace=sp, 90 | optimizer="randomsearch", 91 | direction="max", 92 | num_trials=5, 93 | name="test", 94 | hb_interval=1, 95 | es_interval=10, 96 | ) 97 | 98 | result = experiment.lagom(train_fn=train, config=config) 99 | assert type(result) == type({}) 100 | 101 | test_dt_tensorflow(sc) 102 | 103 | 104 | def test_dt_tensorflow(sc): 105 | 106 | mnist = tf.keras.datasets.mnist 107 | 108 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 109 | 110 | x_train, x_test = x_train / 255.0, x_test / 255.0 111 | x_train = np.reshape(x_train, (60000, 28, 28, 1)) 112 | x_test = np.reshape(x_test, (10000, 28, 28, 1)) 113 | 114 | def training_function(model, train_set, test_set, hparams): 115 | from tensorflow import keras 116 | 117 | # Define training parameters 118 | num_epochs = 10 119 | batch_size = 256 120 | learning_rate = 0.1 121 | 122 | criterion = keras.losses.SparseCategoricalCrossentropy() 123 | optimizer = keras.optimizers.SGD( 124 | learning_rate=learning_rate, momentum=0.9, decay=1e-5 125 | ) 126 | 127 | model = model(nlayers=2) 128 | 129 | model.compile(optimizer=optimizer, loss=criterion, metrics=["accuracy"]) 130 | 131 | model.fit( 132 | x_train, 133 | y_train, 134 | # batch_size=batch_size, 135 | # epochs=num_epochs, 136 | ) 137 | 138 | print("Testing") 139 | 140 | loss = model.evaluate(x_test, y_test) 141 | 142 | return loss 143 | 144 | class NeuralNetwork(tf.keras.Model): 145 | def __init__(self, nlayers): 146 | super().__init__() 147 | self.conv1 = keras.layers.Conv2D(28, 2, activation="relu") 148 | self.flatten = keras.layers.Flatten() 149 | self.d1 = keras.layers.Dense(32, activation="relu") 150 | self.d2 = keras.layers.Dense(10, activation="softmax") 151 | 152 | def call(self, x): 153 | x = self.conv1(x) 154 | x = self.flatten(x) 155 | x = self.d1(x) 156 | return self.d2(x) 157 | 158 | model = NeuralNetwork 159 | 160 | # define the constructor parameters of your model 161 | model_parameters = { 162 | "train_batch_size": 30000, 163 | "test_batch_size": 5000, 164 | "nlayers": 2, 165 | } 166 | 167 | # pass the model parameters in the last 168 | config = TfDistributedConfig( 169 | name="tf_test", 170 | model=model, 171 | train_set=None, 172 | test_set=None, 173 | hparams=model_parameters, 174 | ) 175 | 176 | result = experiment.lagom(train_fn=training_function, config=config) 177 | 178 | assert type(result) == list 179 | -------------------------------------------------------------------------------- /maggy/tests/test_searchspace.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Searchspace 22 | 23 | 24 | def test_searchspace_init(): 25 | 26 | sp = Searchspace(argument_param=("DOUBLE", [1, 5]), param2=("integer", [3, 4])) 27 | 28 | exp_get = [1, 5] 29 | 30 | assert sp.get("argument_param") == exp_get 31 | assert sp.argument_param == exp_get # pylint: disable=no-member 32 | 33 | 34 | def test_searchspace_add(): 35 | 36 | sp = Searchspace(argument_param=("DOUBLE", [1, 5])) 37 | 38 | with pytest.raises(ValueError) as excinfo: 39 | sp.add("argument_param", ("DOUBLE", [1, 5])) 40 | assert "Hyperparameter name is reserved" in str(excinfo.value) 41 | 42 | with pytest.raises(ValueError) as excinfo: 43 | # add tuple with too many elements 44 | sp.add("param", ("DOUBLE", [1, 5], "too many")) 45 | assert "Hyperparameter tuple has to be of length two" in str(excinfo.value) 46 | 47 | with pytest.raises(ValueError) as excinfo: 48 | # add unknown type 49 | sp.add("param", ("FLOAT", [1, 5])) 50 | assert "Hyperparameter type is not of type " in str(excinfo.value) 51 | 52 | with pytest.raises(ValueError) as excinfo: 53 | # add empty region list 54 | sp.add("param", ("DOUBLE", [])) 55 | assert "Hyperparameter feasible region list" in str(excinfo.value) 56 | 57 | with pytest.raises(AssertionError) as excinfo: 58 | # add incompatible type and feasible region 59 | sp.add("param", ("DOUBLE", [1, 5, 5])) 60 | sp.add("param2", ("INTEGER", [1, 5, 5])) 61 | assert "For DOUBLE or " in str(excinfo.value) 62 | 63 | with pytest.raises(AssertionError) as excinfo: 64 | # lower bound higher than upper bound 65 | sp.add("param", ("DOUBLE", [5, 1])) 66 | sp.add("param2", ("INTEGER", [4, 1])) 67 | assert "Lower bound " in str(excinfo.value) 68 | 69 | with pytest.raises(ValueError) as excinfo: 70 | # Non integer boundaries for integer type parameter 71 | sp.add("param2", ("INTEGER", [1.5, 5])) 72 | assert "type INTEGER need to be integer:" in str(excinfo.value) 73 | 74 | with pytest.raises(ValueError) as excinfo: 75 | # Non numeric interval boundaries 76 | sp.add("param2", ("DOUBLE", ["lower", 5])) 77 | assert "type DOUBLE need to be integer or float:" in str(excinfo.value) 78 | -------------------------------------------------------------------------------- /maggy/tests/test_trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | import pytest 18 | import time 19 | import random 20 | 21 | from maggy import Trial 22 | 23 | 24 | def test_trial_init(): 25 | 26 | trial = Trial({"param1": 5, "param2": "ada"}) 27 | 28 | exp = {"param1": 5, "param2": "ada"} 29 | 30 | assert trial.params == exp 31 | assert trial.status == Trial.PENDING 32 | assert trial.trial_id == "3d1cc9fdb1d4d001" 33 | 34 | 35 | def test_trial_serialization(): 36 | 37 | trial = Trial({"param1": 5, "param2": "ada"}) 38 | 39 | exp = {"param1": 5, "param2": "ada"} 40 | 41 | json_str = trial.to_json() 42 | 43 | new_trial = Trial.from_json(json_str) 44 | 45 | assert isinstance(new_trial, Trial) 46 | assert new_trial.params == exp 47 | assert new_trial.status == Trial.PENDING 48 | assert new_trial.trial_id == "3d1cc9fdb1d4d001" 49 | -------------------------------------------------------------------------------- /maggy/tests/test_wordcount.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import pytest 18 | from operator import add 19 | 20 | # this allows using the fixture in all tests in this module 21 | pytestmark = pytest.mark.usefixtures("sc") 22 | 23 | # Can also use a decorator such as this to use specific fixtures in specific functions 24 | # @pytest.mark.usefixtures("spark_context", "hive_context") 25 | 26 | 27 | def do_word_counts(lines): 28 | """count of words in an rdd of lines""" 29 | 30 | counts = lines.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(add) 31 | results = {word: count for word, count in counts.collect()} 32 | return results 33 | 34 | 35 | # start function with test_ so pytest can discover them 36 | def test_do_word_counts(sc): 37 | """test that word counts are computed correctly for a small input 38 | Args: 39 | sc: test fixture SparkContext 40 | (session-scoped Spark context fixture from conftest.py) 41 | """ 42 | 43 | test_input = [" hello spark ", " hello again spark spark"] 44 | 45 | input_rdd = sc.parallelize(test_input, 1) 46 | results = do_word_counts(input_rdd) 47 | 48 | expected_results = {"hello": 2, "spark": 3, "again": 1} 49 | assert results == expected_results 50 | -------------------------------------------------------------------------------- /maggy/trial.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import json 18 | import threading 19 | import hashlib 20 | 21 | from maggy import util 22 | 23 | 24 | class Trial(object): 25 | """A Trial object contains all relevant information about the evaluation 26 | of a hyperparameter combination. 27 | 28 | It is used as shared memory between 29 | the worker thread and the rpc server thread. The server thread performs only 30 | lookups on the `early_stop` and `params` attributes. 31 | """ 32 | 33 | PENDING = "PENDING" 34 | SCHEDULED = "SCHEDULED" 35 | RUNNING = "RUNNING" 36 | ERROR = "ERROR" 37 | FINALIZED = "FINALIZED" 38 | 39 | def __init__(self, params, trial_type="optimization", info_dict=None): 40 | """Create a new trial object from a hyperparameter combination 41 | ``params``. 42 | 43 | :param params: A dictionary of Hyperparameters as key value pairs. 44 | :type params: dict 45 | :param info_dict: dict containing additional information about the trial including 46 | - sample_type 47 | - sampling_time 48 | - run_budget 49 | - model_budget (optionally) 50 | see `create_trial()` method of base.py for further reference 51 | :type info_dict: dict 52 | """ 53 | # XXX before merge, we should remove the default value for trial_type 54 | # and make sure everywhere Trial() is called (e.g. in all optimizers) 55 | # trial_type is passed 56 | # @Moritz 57 | 58 | self.trial_type = trial_type 59 | # XXX temp fix, have to come up with abstractions 60 | if self.trial_type == "optimization": 61 | self.trial_id = Trial._generate_id(params) 62 | elif self.trial_type == "ablation": 63 | serializable_params = { 64 | "ablated_feature": params.get("ablated_feature", None), 65 | "ablated_layer": params.get("ablated_layer", None), 66 | } 67 | self.trial_id = Trial._generate_id(serializable_params) 68 | self.params = params 69 | self.status = Trial.PENDING 70 | self.early_stop = False 71 | self.final_metric = None 72 | self.metric_history = [] 73 | self.step_history = [] 74 | self.metric_dict = {} 75 | self.start = None 76 | self.duration = None 77 | self.lock = threading.RLock() 78 | if info_dict is None: 79 | self.info_dict = {} 80 | else: 81 | self.info_dict = info_dict 82 | 83 | def get_early_stop(self): 84 | """Return the early stopping flag of the trial.""" 85 | with self.lock: 86 | return self.early_stop 87 | 88 | def set_early_stop(self): 89 | """Set the early stopping flag of the trial to true.""" 90 | with self.lock: 91 | self.early_stop = True 92 | 93 | def append_metric(self, metric_data): 94 | """Append a metric from the heartbeats to the history.""" 95 | with self.lock: 96 | # from python 3.7 dicts are insertion ordered, 97 | # so two of these data structures can be removed 98 | if ( 99 | metric_data["step"] not in self.metric_dict 100 | and metric_data["value"] is not None 101 | ): 102 | self.metric_dict[metric_data["step"]] = metric_data["value"] 103 | self.metric_history.append(metric_data["value"]) 104 | self.step_history.append(metric_data["step"]) 105 | # return step number to indicate that it was a new unique step 106 | return metric_data["step"] 107 | # return None to indicate that no new step has finished 108 | return None 109 | 110 | @classmethod 111 | def _generate_id(cls, params): 112 | """ 113 | Class method to generate a hash from a hyperparameter dictionary. 114 | 115 | All keys in the dictionary have to be strings. The hash is an md5 hash 116 | truncated to 16 characters and is stable across processes.
117 | 118 | :param params: Hyperparameters 119 | :type params: dictionary 120 | :raises ValueError: All hyperparameter names have to be strings. 121 | :raises ValueError: Hyperparameters need to be a dictionary. 122 | :return: Sixteen character truncated md5 hash 123 | :rtype: str 124 | """ 125 | 126 | # ensure params is a dictionary 127 | if isinstance(params, dict): 128 | # check that all keys are strings 129 | if False in set(isinstance(k, str) for k in params.keys()): 130 | raise ValueError("All hyperparameter names have to be strings.") 131 | 132 | return hashlib.md5( 133 | json.dumps(params, sort_keys=True).encode("utf-8") 134 | ).hexdigest()[:16] 135 | 136 | raise ValueError("Hyperparameters need to be a dictionary.") 137 | 138 | def to_json(self): 139 | return json.dumps(self.to_dict(), default=util.json_default_numpy) 140 | 141 | def to_dict(self): 142 | obj_dict = {"__class__": self.__class__.__name__} 143 | 144 | temp_dict = self.__dict__.copy() 145 | temp_dict.pop("lock") 146 | temp_dict.pop("start") 147 | 148 | obj_dict.update(temp_dict) 149 | 150 | return obj_dict 151 | 152 | @classmethod 153 | def from_json(cls, json_str): 154 | """Creates a Trial instance from a previously json serialized Trial 155 | object instance. 156 | 157 | :param json_str: String containing the object. 158 | :type json_str: str 159 | :raises ValueError: json_str is not a Trial object. 160 | :return: Instantiated object instance of Trial. 161 | :rtype: Trial 162 | """ 163 | 164 | temp_dict = json.loads(json_str) 165 | if temp_dict.get("__class__", None) != "Trial": 166 | raise ValueError("json_str is not a Trial object.") 167 | if temp_dict.get("params", None) is not None: 168 | instance = cls(temp_dict.get("params")) 169 | instance.trial_id = temp_dict["trial_id"] 170 | instance.status = temp_dict["status"] 171 | instance.early_stop = temp_dict.get("early_stop", False) 172 | instance.final_metric = temp_dict["final_metric"] 173 | instance.metric_history = temp_dict["metric_history"] 174 | instance.duration = temp_dict["duration"] 175 | 176 | return instance 177 | -------------------------------------------------------------------------------- /maggy/version.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | __version__ = "1.1.2" 18 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: "MAGGY" 2 | site_description: "Official website and documentation for MAGGY - Distribution transparent Machine Learning experiments on Apache Spark." 
3 | site_author: "Logical Clocks" 4 | site_url: "https://maggy.ai" 5 | 6 | # Repository 7 | repo_name: logicalclocks/maggy 8 | repo_url: https://github.com/logicalclocks/maggy 9 | edit_uri: "" 10 | 11 | nav: 12 | - Home: 13 | - Introduction: README.md 14 | - Blogs: blogs.md 15 | - Publications: publications.md 16 | - Releases: releases.md 17 | - Contributing: CONTRIBUTING.md 18 | - Issues: https://github.com/logicalclocks/maggy/issues 19 | - Hopsworks.ai: https://hopsworks.ai/ 20 | - Getting Started: 21 | - Installation: start/install.md 22 | - Quickstart: start/quickstart.md 23 | - Hyperparameter Optimization: 24 | - Introduction: hpo/intro.md 25 | - Strategies: hpo/strategies.md 26 | - Ablation Studies: 27 | - Introduction: ablation/intro.md 28 | - Distributed Training: 29 | - Introduction: dist_training/intro.md 30 | - TensorFlow: dist_training/tensorflow.md 31 | - PyTorch: dist_training/torch.md 32 | 33 | theme: 34 | name: material 35 | favicon: assets/images/maggyfav.png 36 | logo: assets/images/whitemaggy-eye.svg 37 | icon: 38 | repo: fontawesome/brands/github 39 | font: 40 | text: "Roboto" 41 | palette: 42 | accent: orange 43 | features: 44 | - navigation.tabs 45 | - navigation.tabs.sticky 46 | 47 | extra: 48 | generator: false 49 | social: 50 | - icon: fontawesome/brands/twitter 51 | link: https://twitter.com/logicalclocks 52 | - icon: fontawesome/brands/github 53 | link: https://github.com/logicalclocks 54 | - icon: fontawesome/brands/discourse 55 | link: https://community.hopsworks.ai/ 56 | - icon: fontawesome/brands/linkedin 57 | link: https://www.linkedin.com/company/logicalclocks/ 58 | analytics: 59 | provider: google 60 | property: G-J3F4GSLKE8 61 | 62 | extra_css: 63 | - assets/css/custom.css 64 | - assets/css/version-select.css 65 | 66 | extra_javascript: 67 | - assets/javascript/version-select.js 68 | 69 | plugins: 70 | - search 71 | 72 | markdown_extensions: 73 | - admonition 74 | - codehilite 75 | - footnotes 76 | - pymdownx.tabbed: 77 | alternate_style: true 78 | - pymdownx.arithmatex 79 | - pymdownx.superfences 80 | - pymdownx.details 81 | - pymdownx.caret 82 | - pymdownx.mark 83 | - pymdownx.tilde 84 | - pymdownx.critic 85 | - toc: 86 | permalink: "#" 87 | toc_depth: 3 88 | - pymdownx.tasklist: 89 | custom_checkbox: true 90 | - markdown_include.include: 91 | base_path: docs 92 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = maggy/tests 3 | max-line-length = 80 4 | select = C,E,F,W,B,B950 5 | ignore = E203, E501, W503 6 | per-file-ignores = 7 | maggy/experiment/experiment_python.py:F403, F405 8 | maggy/experiment/experiment_pyspark.py:F403, F405 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Logical Clocks AB 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import os 18 | from setuptools import setup, find_packages 19 | from importlib.machinery import SourceFileLoader 20 | 21 | 22 | version = ( 23 | SourceFileLoader("maggy.version", os.path.join("maggy", "version.py")).load_module().__version__ 24 | ) 25 | 26 | 27 | def read(fname): 28 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 29 | 30 | 31 | setup( 32 | name='maggy', 33 | version=version, 34 | install_requires=[ 35 | 'numpy>=1.19.2', 'scikit-optimize==0.9.0', 'statsmodels==0.12.2', 'scipy==1.10.0' 36 | ], 37 | extras_require={ 38 | 'pydoop': ['pydoop'], 39 | 'tf': ['tensorflow==2.4.1'], 40 | 'torch': ['torch==1.7.1'], # Should be 1.8.1 if we want to support PyTorch's ZeRO. 41 | 'zero': ['deepspeed==0.3.13', 42 | 'fairscale==0.3.0'], 43 | 'docs': [ 44 | 'mkdocs==1.5.3', 45 | 'mike==2.0.0', 46 | 'mkdocs-material==9.5.10', 47 | 'markdown-include==0.8.1', 48 | ], 49 | 'dev': [ 50 | 'black==20.8b1', 51 | 'flake8==3.9.0', 52 | 'pre-commit==2.11.1', 53 | ], 54 | 'spark': ['pyspark==2.4.3'] 55 | }, 56 | author='Moritz Meister', 57 | author_email='moritz@logicalclocks.com', 58 | description='Distribution transparent Machine Learning experiments on Apache Spark ', 59 | license='Apache License 2.0', 60 | keywords='Hyperparameter, Optimization, Distributed, Training, Keras, PyTorch, TensorFlow, Spark', 61 | url='https://github.com/logicalclocks/maggy', 62 | download_url='', 63 | packages=find_packages(), 64 | long_description=read('README.md'), 65 | long_description_content_type="text/markdown", 66 | python_requires=">=3.7", 67 | classifiers=[ 68 | 'Development Status :: 5 - Production/Stable', 69 | 'Topic :: Utilities', 70 | 'License :: OSI Approved :: Apache Software License', 71 | 'Programming Language :: Python :: 3', 72 | 'Programming Language :: Python :: 3.7', 73 | 'Intended Audience :: Developers', 74 | ] 75 | ) 76 | --------------------------------------------------------------------------------
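A minimal usage sketch of the hyperparameter optimization API contained in this tree (Searchspace, HyperparameterOptConfig and experiment.lagom), mirroring the calls exercised in maggy/tests/test_randomsearch.py. The hyperparameter name, the toy metric and the training loop below are illustrative placeholders, and a running Spark/Hopsworks environment (as set up in maggy/tests/conftest.py) is assumed:

from maggy import experiment
from maggy.searchspace import Searchspace
from maggy.config import HyperparameterOptConfig

# Search space with one continuous hyperparameter; RandomSearch.initialize()
# requires at least one DOUBLE or INTEGER parameter.
sp = Searchspace(learning_rate=("DOUBLE", [0.01, 0.1]))


def train(model, train_set, test_set, hparams, reporter):
    # Illustrative training loop: broadcast a heartbeat metric for every step
    # and return the final metric that the optimizer maximizes.
    metric = 0.0
    for step in range(5):
        metric += hparams["learning_rate"]
        reporter.broadcast(metric=metric)
        reporter.log("Metric: {}".format(metric))
    return metric


config = HyperparameterOptConfig(
    searchspace=sp,
    optimizer="randomsearch",
    direction="max",
    num_trials=5,
    name="usage_sketch",
    hb_interval=1,
    es_interval=10,
)

# Returns a dict of experiment results, as asserted in the tests.
result = experiment.lagom(train_fn=train, config=config)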