├── .dir-locals.el ├── .dockerignore ├── .gitattributes ├── .github └── workflows │ ├── coverage.yml │ ├── pre-commit.yml │ ├── release.yml │ └── workflow.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── COMMITTERS.md ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE_SHORT ├── MANIFEST.in ├── Makefile ├── README.md ├── caliban ├── __init__.py ├── __main__.py ├── _version.py ├── cli.py ├── config │ ├── __init__.py │ └── experiment.py ├── docker │ ├── __init__.py │ ├── build.py │ └── push.py ├── expansion.py ├── history │ ├── __init__.py │ ├── cli.py │ ├── submit.py │ ├── types.py │ └── util.py ├── main.py ├── platform │ ├── __init__.py │ ├── cloud │ │ ├── __init__.py │ │ ├── core.py │ │ ├── types.py │ │ └── util.py │ ├── gke │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── cluster.py │ │ ├── constants.py │ │ ├── types.py │ │ └── util.py │ ├── notebook.py │ ├── run.py │ └── shell.py ├── resources │ ├── __init__.py │ ├── caliban_launcher.py │ └── cloud_sql_proxy.py └── util │ ├── __init__.py │ ├── argparse.py │ ├── auth.py │ ├── fs.py │ ├── metrics.py │ ├── schema.py │ └── tqdm.py ├── cloudbuild.json ├── codemeta.json ├── dockerfiles ├── Dockerfile └── Dockerfile.gpu ├── docs ├── Makefile ├── _static │ └── img │ │ ├── cloud │ │ ├── activate.png │ │ ├── create_new_key.png │ │ ├── create_service_account.png │ │ ├── new_project.png │ │ ├── project_id.png │ │ ├── select_project.png │ │ └── service_acct_permissions.png │ │ └── gke │ │ ├── cleanup_job.png │ │ ├── cluster_create_progress.png │ │ ├── cluster_dashboard.png │ │ ├── job_logs.png │ │ ├── node_pool_autoprovision.png │ │ ├── pod_events.png │ │ ├── pre_job_details.png │ │ ├── pre_job_submission.png │ │ ├── stackdriver_logs.png │ │ ├── unschedulable.png │ │ └── unschedulable_details.png ├── cli │ ├── caliban_build.rst │ ├── caliban_cloud.rst │ ├── caliban_cluster.rst │ ├── caliban_notebook.rst │ ├── caliban_resubmit.rst │ ├── caliban_run.rst │ ├── caliban_shell.rst │ ├── caliban_status.rst │ ├── caliban_stop.rst │ └── expansion.rst ├── cloud │ ├── adc.rst │ ├── ai_platform_tpu.rst │ ├── bucket.rst │ ├── gpu_specs.rst │ ├── labels.rst │ ├── rate_limit.rst │ └── service_account.rst ├── conf.py ├── explore │ ├── base_image.rst │ ├── calibanconfig.rst │ ├── custom_docker_run.rst │ ├── custom_script_args.rst │ ├── declaring_requirements.rst │ ├── exp_stdin.rst │ ├── experiment_broadcasting.rst │ ├── experiment_groups.rst │ ├── gcloud.rst │ ├── mac.rst │ ├── script_vs_module.rst │ └── why_caliban.rst ├── getting_started │ ├── cloud.rst │ ├── getting_caliban.rst │ └── prerequisites.rst ├── gke │ ├── cluster_management.rst │ ├── concepts.rst │ ├── job_submission.rst │ └── prereq.rst ├── index.rst ├── make.bat ├── recipes │ ├── dockerignore.rst │ ├── flagfile.rst │ ├── local_dir.rst │ └── single_gpu.rst └── requirements.txt ├── paper ├── 10.21105.joss.02403.pdf ├── paper.bib └── paper.md ├── pylintrc ├── pyproject.toml ├── requirements-dev.txt ├── scripts ├── bashrc ├── build_dockerfiles.sh ├── cloudbuild.py ├── cloudbuild_config.json └── run_tests.sh ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── caliban │ ├── __init__.py │ ├── config │ │ ├── __init__.py │ │ ├── test_config.py │ │ └── test_experiment.py │ ├── docker │ │ ├── __init__.py │ │ ├── test_build.py │ │ └── test_push.py │ ├── history │ │ ├── __init__.py │ │ └── test_history.py │ ├── platform │ │ ├── __init__.py │ │ ├── cloud │ │ │ ├── __init__.py │ │ │ ├── test_types.py │ │ │ └── test_util.py │ │ └── gke │ │ │ ├── __init__.py │ │ │ ├── test_types.py │ │ 
│ └── test_util.py │ ├── resources │ │ ├── __init__.py │ │ └── test_caliban_launcher.py │ ├── test_cli.py │ └── util │ │ ├── __init__.py │ │ ├── test_argparse.py │ │ ├── test_auth.py │ │ ├── test_fs.py │ │ ├── test_metrics.py │ │ ├── test_schema.py │ │ ├── test_tqdm.py │ │ └── test_util.py ├── conftest.py └── context.py ├── tutorials ├── README.md ├── basic │ ├── .calibanconfig.json │ ├── README.md │ ├── experiment.json │ ├── mnist.py │ └── requirements.txt └── uv-metrics │ ├── .calibanconfig.json │ ├── README.md │ ├── cli.py │ ├── experiment.json │ ├── hello_world.sh │ ├── mnist.py │ ├── setup.py │ └── trainer │ ├── __init__.py │ ├── cli.py │ └── train.py └── versioneer.py /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ((python-mode 2 | . ((py-indent-offset . 2)))) 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # ignore .git and .cache folders 2 | .git 3 | .cache 4 | env 5 | tests 6 | *.egg-info 7 | Makefile 8 | pylintrc 9 | setup.cfg 10 | __pycache__ 11 | .coverage 12 | .pytest_cache 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | caliban/_version.py export-subst 2 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: coverage 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.11 17 | - name: Cache pip 18 | uses: actions/cache@v2 19 | with: 20 | # This path is specific to Ubuntu 21 | path: ~/.cache/pip 22 | # Look to see if there is a cache hit for the corresponding requirements file 23 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} 24 | restore-keys: | 25 | ${{ runner.os }}-pip- 26 | ${{ runner.os }}- 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install . 
31 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi 32 | - name: Run pytest, generate coverage 33 | run: | 34 | pytest --doctest-modules -v -s \ 35 | --hypothesis-profile dev \ 36 | --cov-config setup.cfg \ 37 | --cov-report=xml \ 38 | --cov caliban \ 39 | caliban tests 40 | - name: Upload coverage to Codecov 41 | uses: codecov/codecov-action@v1 42 | with: 43 | fail_ci_if_error: true 44 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: pre-commit 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | pre-commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - uses: actions/setup-python@v4 15 | with: 16 | python-version: 3.11 17 | 18 | - uses: pre-commit/action@v3.0.0 19 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 3.11 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: 3.11 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 24 | run: | 25 | python setup.py sdist bdist_wheel 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.github/workflows/workflow.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: ["3.9", "3.10", "3.11"] 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Cache pip 22 | uses: actions/cache@v2 23 | with: 24 | # This path is specific to Ubuntu 25 | path: ~/.cache/pip 26 | # Look to see if there is a cache hit for the corresponding requirements file 27 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} 28 | restore-keys: | 29 | ${{ runner.os }}-pip- 30 | ${{ runner.os }}- 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install . 
35 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi 36 | - name: Run pytest 37 | run: | 38 | pytest --doctest-modules -v -s \ 39 | --hypothesis-profile dev \ 40 | caliban tests 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | output-dist 106 | keras_tensorboard 107 | keras_export 108 | 109 | # emacs backup files 110 | *~ 111 | auto 112 | mlruns 113 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | repos: 16 | - repo: https://github.com/pre-commit/pre-commit-hooks 17 | rev: v2.3.0 18 | hooks: 19 | - id: check-yaml 20 | - id: end-of-file-fixer 21 | - id: trailing-whitespace 22 | 23 | - repo: https://github.com/astral-sh/ruff-pre-commit 24 | rev: v0.1.11 25 | hooks: 26 | - id: ruff 27 | types_or: [ python, pyi, jupyter ] 28 | 29 | - id: ruff-format 30 | types_or: [ python, pyi, jupyter ] 31 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/conf.py 11 | 12 | # Optionally set the version of Python and requirements required to build your docs 13 | python: 14 | version: 3.7 15 | install: 16 | - requirements: docs/requirements.txt 17 | -------------------------------------------------------------------------------- /COMMITTERS.md: -------------------------------------------------------------------------------- 1 | # Committers 2 | 3 | These are the folks who can +1 a pull request and approve it for merge. 4 | 5 | ## Active 6 | 7 | | Name | Handle | 8 | |-----------------|------------------------------------------------------| 9 | | Sam Ritchie | [@sritchie](https://github.com/sritchie) | 10 | | Ambrose Slone | [@ajslone](https://github.com/ajslone) | 11 | | Guy Gur-Ari | [@guygurari](https://github.com/guygurari) | 12 | | Vinay Ramasesh | [@ramasesh](https://github.com/ramasesh) | 13 | 14 | 15 | ## Emeritus 16 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | So you want to add some code to Caliban. Excellent! 4 | 5 | Pull requests and bug reports are always welcome! This guide covers what you 6 | need to know to get started contributing to 7 | Caliban. 8 | 9 | The TL;DR is: 10 | 11 | - send us a pull request, 12 | - iterate on the feedback + discussion, and 13 | - get a +1 from a [Committer](COMMITTERS.md) 14 | 15 | in order to get your PR accepted. 16 | 17 | Issues should be reported on the [GitHub issue 18 | tracker](https://github.com/google/caliban/issues). 19 | 20 | If you want to discuss an idea for a new feature or ask us a question, 21 | discussion occurs primarily in the body of [GitHub 22 | Issues](https://github.com/google/caliban/issues), though the project is growing 23 | large enough that we may start a Gitter channel soon. 24 | 25 | The current list of active committers (who can +1 a pull request) can be found 26 | here: [COMMITTERS.md](COMMITTERS.md) 27 | 28 | A list of contributors to the project can be found at the project's 29 | [Contributors](https://github.com/google/caliban/graphs/contributors) page. 30 | 31 | ## Contributor License Agreement 32 | 33 | Contributions to this project must be accompanied by a Contributor License 34 | Agreement. You (or your employer) retain the copyright to your contribution; 35 | this simply gives us permission to use and redistribute your contributions as 36 | part of the project. Head over to <https://cla.developers.google.com/> to see 37 | your current agreements on file or to sign a new one.
38 | 39 | You generally only need to submit a CLA once, so if you've already submitted one 40 | (even if it was for a different project), you probably don't need to do it 41 | again. 42 | 43 | ## Developing in Caliban 44 | 45 | We use [pre-commit](https://pre-commit.com/) to manage a series of git 46 | pre-commit hooks for the project; for example, each time you commit code, the 47 | hooks will make sure that your Python is formatted properly. If your code isn't, 48 | the hook will format it, so when you try to commit the second time you'll get 49 | past the hook. 50 | 51 | All hooks are defined in `.pre-commit-config.yaml`. To install these hooks, 52 | install `pre-commit` if you don't yet have it. I prefer using 53 | [pipx](https://github.com/pipxproject/pipx) so that `pre-commit` stays globally 54 | available. 55 | 56 | ```bash 57 | pipx install pre-commit 58 | ``` 59 | 60 | Then install the hooks with this command: 61 | 62 | ```bash 63 | pre-commit install 64 | ``` 65 | 66 | Now they'll run on every commit. If you want to run them manually, you can run either of these commands: 67 | 68 | ```bash 69 | pre-commit run --all-files 70 | 71 | # or this, if you've previously run `make build`: 72 | make lint 73 | ``` 74 | 75 | ## Documentation 76 | 77 | We use Sphinx to generate docs. If you want to live-preview your changes to the 78 | documentation as you are editing, you can use 79 | [sphinx-reload](https://pypi.org/project/sphinx-reload/). To get this working: 80 | 81 | ```bash 82 | pipx install sphinx-reload 83 | ``` 84 | 85 | Then, inside the caliban folder: 86 | 87 | ```bash 88 | make build 89 | sphinx-reload docs 90 | ``` 91 | 92 | If all goes well, `sphinx-reload` will tell you it is serving the documentation 93 | on a port, which you can then open in your browser. 94 | 95 | ## Publishing Caliban 96 | 97 | - First, run `make build` to get your virtual environment set up. 98 | - Make sure that you're on the master branch! 99 | - Add a new tag with `git tag 0.2.3` or the equivalent. 100 | - Run `make release` to push the latest code and tags to all relevant 101 | repositories. 102 | -------------------------------------------------------------------------------- /LICENSE_SHORT: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include caliban/_version.py 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ## 2 | # Variables 3 | ## 4 | 5 | ENV_NAME = env 6 | ENV_ACT = . 
env/bin/activate; 7 | PIP = $(ENV_NAME)/bin/pip 8 | PY = $(ENV_NAME)/bin/python 9 | PYTEST_ARGS = --doctest-modules -v -s --hypothesis-profile dev 10 | PYTEST_TARGET = caliban tests 11 | COVERAGE_ARGS = --cov-config setup.cfg --cov-report term-missing --cov 12 | COVERAGE_TARGET = caliban 13 | 14 | ## 15 | # Targets 16 | ## 17 | 18 | .PHONY: build 19 | build: clean install 20 | 21 | .PHONY: clean 22 | clean: clean-env clean-files 23 | 24 | .PHONY: clean-env 25 | clean-env: 26 | rm -rf $(ENV_NAME) 27 | 28 | .PHONY: clean-files 29 | clean-files: 30 | rm -rf .tox 31 | rm -rf .coverage 32 | find . -name \*.pyc -type f -delete 33 | find . -name \*.test.db -type f -delete 34 | find . -depth -name __pycache__ -type d -exec rm -rf {} \; 35 | rm -rf dist *.egg* build 36 | 37 | .PHONY: install 38 | install: 39 | rm -rf $(ENV_NAME) 40 | virtualenv -p python3 $(ENV_NAME) 41 | $(PIP) install -r requirements-dev.txt 42 | $(PIP) install -r docs/requirements.txt 43 | $(PIP) install -e . 44 | 45 | .PHONY: test 46 | test: lint pytest 47 | 48 | .PHONY: pytest 49 | pytest: 50 | $(ENV_ACT) pytest $(PYTEST_ARGS) $(COVERAGE_ARGS) $(COVERAGE_TARGET) $(PYTEST_TARGET) 51 | 52 | .PHONY: test-full 53 | test-full: lint test-setuppy clean-files 54 | 55 | .PHONY: test-setuppy 56 | test-setuppy: 57 | $(PY) setup.py test 58 | 59 | .PHONY: lint 60 | lint: pre-commit 61 | 62 | .PHONY: pre-commit 63 | pre-commit: 64 | $(ENV_ACT) pre-commit run --all-files 65 | 66 | .PHONY: push 67 | push: 68 | git push origin master 69 | git push --tags 70 | 71 | .PHONY: release-egg 72 | release-egg: 73 | $(ENV_ACT) python setup.py sdist bdist_wheel 74 | $(ENV_ACT) twine upload -r pypi dist/* 75 | rm -rf dist *.egg* build 76 | 77 | .PHONY: release 78 | release: push release-egg 79 | -------------------------------------------------------------------------------- /caliban/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from ._version import get_versions 18 | 19 | __version__ = get_versions()["version"] 20 | del get_versions 21 | -------------------------------------------------------------------------------- /caliban/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from .main import main 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /caliban/docker/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/docker/push.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Functions required to interact with Docker to build and run images, shells 17 | and notebooks in a Docker environment. 18 | 19 | """ 20 | 21 | import json 22 | import subprocess 23 | 24 | from absl import logging 25 | 26 | 27 | def _image_tag_for_project( 28 | project_id: str, image_id: str, include_tag: bool = True 29 | ) -> str: 30 | """Generate the GCR Docker image tag for the supplied pair of project_id and 31 | image_id. 32 | 33 | This function properly handles "domain scoped projects", where the project ID 34 | contains a domain name and project ID separated by : 35 | https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects. 36 | 37 | """ 38 | project_s = project_id.replace(":", "/") 39 | base = f"gcr.io/{project_s}/{image_id}" 40 | return f"{base}:latest" if include_tag else base 41 | 42 | 43 | def _gcr_list_tags(project_id: str, image_id: str): 44 | """Returns a sequence of metadata for all tags of the supplied image_id in the 45 | supplied project. 46 | 47 | """ 48 | image_tag = _image_tag_for_project(project_id, image_id, include_tag=False) 49 | cmd = [ 50 | "gcloud", 51 | "container", 52 | "images", 53 | "list-tags", 54 | f"--project={project_id}", 55 | "--format=json", 56 | image_tag, 57 | ] 58 | return json.loads(subprocess.check_output(cmd)) 59 | 60 | 61 | def gcr_image_pushed(project_id: str, image_id: str) -> bool: 62 | """Returns true if the supplied image has been pushed to the container registry 63 | for the supplied project, false otherwise. 
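A (hypothetical) call like gcr_image_pushed("my-project", "trainer") shells out to `gcloud container images list-tags gcr.io/my-project/trainer` and returns True only when the returned tag list is non-empty.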
64 | 65 | """ 66 | return len(_gcr_list_tags(project_id, image_id)) > 0 67 | 68 | 69 | def push_uuid_tag(project_id: str, image_id: str, force: bool = False) -> str: 70 | """Takes a base image and tags it for upload, then pushes it to a remote Google 71 | Container Registry. 72 | 73 | Returns the tag on a successful push. 74 | """ 75 | image_tag = _image_tag_for_project(project_id, image_id) 76 | 77 | def missing_remotely(): 78 | missing = not gcr_image_pushed(project_id, image_id) 79 | if not missing: 80 | logging.info(f"Skipping docker push, as {image_tag} already exists remotely.") 81 | return missing 82 | 83 | if force or missing_remotely(): 84 | subprocess.run(["docker", "tag", image_id, image_tag], check=True) 85 | subprocess.run(["docker", "push", image_tag], check=True) 86 | 87 | return image_tag 88 | -------------------------------------------------------------------------------- /caliban/expansion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Entry point for Caliban's experiment config expansion.""" 17 | 18 | from __future__ import absolute_import, division, print_function 19 | 20 | import json 21 | import logging as ll 22 | from typing import List 23 | 24 | from absl import app, logging 25 | from absl.flags import argparse_flags 26 | 27 | import caliban.config.experiment as ce 28 | from caliban import __version__ 29 | 30 | ll.getLogger("caliban.expansion").setLevel(logging.ERROR) 31 | 32 | 33 | def expansion_parser(): 34 | """Creates and returns the argparse instance for the experiment config 35 | expansion app. 36 | 37 | """ 38 | 39 | parser = argparse_flags.ArgumentParser( 40 | description="Experiment config expander. For documentation, visit https://github.com/google/caliban", 41 | prog="expansion", 42 | ) 43 | parser.add_argument( 44 | "--version", action="version", version="%(prog)s {}".format(__version__) 45 | ) 46 | parser.add_argument( 47 | "--pprint", action="store_true", help="Pretty-print the config to stdout." 48 | ) 49 | parser.add_argument( 50 | "--print_flags", 51 | action="store_true", 52 | help="Print the actual flags generated by each experiment in the expansion, \ 53 | one per line.", 54 | ) 55 | parser.add_argument( 56 | "experiment_config", 57 | type=ce.load_experiment_config, 58 | help="Path to an experiment config, or 'stdin' to read from stdin.", 59 | ) 60 | 61 | return parser 62 | 63 | 64 | def parse_flags(argv): 65 | """Function required by absl.app.run. Internally generates a parser and returns 66 | the results of parsing caliban arguments. 67 | 68 | """ 69 | args = argv[1:] 70 | return expansion_parser().parse_args(args) 71 | 72 | 73 | def _print_flags(expanded: List[ce.Experiment]) -> None: 74 | """Print the flags associated with each experiment in the supplied expansion 75 | list. 
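Each experiment prints as a single space-joined line of flags; a (hypothetical) experiment dict of {"lr": 0.5} would print as `--lr 0.5`.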
76 | 77 | """ 78 | for m in expanded: 79 | flags = ce.experiment_to_args(m) 80 | print(" ".join(flags)) 81 | 82 | 83 | def _print_json(expanded: List[ce.Experiment], pprint: bool = False) -> None: 84 | """Print the list of expanded experiments to stdout; if pprint is true, 85 | pretty-prints each JSON dict using an indent of 2, else prints the list with 86 | no newlines. 87 | 88 | """ 89 | indent = 2 if pprint else None 90 | print(json.dumps(expanded, indent=indent)) 91 | 92 | 93 | def run_app(args): 94 | """Main function to run the Caliban app. Accepts a Namespace-type output of an 95 | argparse argument parser. 96 | 97 | """ 98 | conf = args.experiment_config 99 | expanded = ce.expand_experiment_config(conf) 100 | 101 | if args.print_flags: 102 | _print_flags(expanded) 103 | else: 104 | _print_json(expanded, pprint=args.pprint) 105 | 106 | 107 | def main(): 108 | app.run(run_app, flags_parser=parse_flags) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /caliban/history/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/history/submit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """caliban utilities for job re-submission""" 17 | 18 | from typing import List, Optional 19 | 20 | import caliban.platform.cloud.core as cloud 21 | import caliban.platform.gke.cli as gke_cli 22 | import caliban.platform.run as r 23 | from caliban.history.types import JobSpec, Platform 24 | 25 | 26 | # ---------------------------------------------------------------------------- 27 | def submit_job_specs( 28 | specs: List[JobSpec], 29 | platform: Platform, 30 | project_id: Optional[str] = None, 31 | credentials_path: Optional[str] = None, 32 | ) -> None: 33 | """submits a job spec""" 34 | 35 | if len(specs) == 0: 36 | return 37 | 38 | if platform == Platform.LOCAL: 39 | return r.execute_jobs(job_specs=specs) 40 | 41 | if platform == Platform.CAIP: 42 | return cloud.submit_job_specs( 43 | specs=specs, 44 | project_id=project_id, 45 | credentials_path=credentials_path, 46 | num_specs=len(specs), 47 | ) 48 | 49 | if platform == Platform.GKE: 50 | return gke_cli.submit_job_specs( 51 | args={ 52 | "cloud_key": credentials_path, 53 | "project_id": project_id, 54 | "specs": specs, 55 | }, 56 | ) 57 | 58 | return None 59 | -------------------------------------------------------------------------------- /caliban/platform/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/platform/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/platform/cloud/util.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Utilities relevant to AI Platform. 18 | """ 19 | import re 20 | from typing import Dict, List, Optional, Tuple, Union 21 | 22 | import caliban.util as u 23 | import caliban.util.argparse as ua 24 | 25 | # key and value for labels can be at most this-many-characters long. 26 | AI_PLATFORM_MAX_LABEL_LENGTH = 63 27 | 28 | 29 | def _truncate(s: str, max_length: int) -> str: 30 | """Returns the input string s truncated to be at most max_length characters 31 | long. 32 | 33 | """ 34 | return s if len(s) <= max_length else s[0:max_length] 35 | 36 | 37 | def _clean_label(s: Optional[str], is_key: bool) -> str: 38 | """Processes the string into the sanitized format required by AI platform 39 | labels. 40 | 41 | https://cloud.google.com/ml-engine/docs/resource-labels 42 | 43 | """ 44 | if s is None: 45 | return "" 46 | 47 | # periods are not allowed by AI Platform labels, but often occur in, 48 | # e.g., learning rates 49 | DECIMAL_REPLACEMENT = "_" 50 | s = s.replace(".", DECIMAL_REPLACEMENT) 51 | 52 | # lowercase, letters, - and _ are valid, so strip the leading dashes, make 53 | # everything lowercase and then kill any remaining unallowed characters. 54 | cleaned = re.sub(r"[^a-z0-9_-]", "", s.lower()).lstrip("-") 55 | 56 | # Keys must start with a letter. If is_key is set and the cleaned version 57 | # starts with something else, append `k`. 58 | if is_key and cleaned != "" and not cleaned[0].isalpha(): 59 | cleaned = "k" + cleaned 60 | 61 | return _truncate(cleaned, AI_PLATFORM_MAX_LABEL_LENGTH) 62 | 63 | 64 | def key_label(k: Optional[str]) -> str: 65 | """converts the argument into a valid label, suitable for submission as a label 66 | key to Cloud. 67 | 68 | """ 69 | return _clean_label(k, True) 70 | 71 | 72 | def value_label(v: Optional[str]) -> str: 73 | """converts the argument into a valid label, suitable for submission as a label 74 | value to Cloud. 75 | 76 | """ 77 | return _clean_label(v, False) 78 | 79 | 80 | def script_args_to_labels(script_args: Optional[List[str]]) -> Dict[str, str]: 81 | """Converts the arguments supplied to our scripts into a dictionary usable as 82 | labels valid for Cloud submission. 83 | 84 | """ 85 | ret = {} 86 | 87 | def process_pair(k, v): 88 | if ua.is_key(k): 89 | clean_k = key_label(k) 90 | if clean_k != "": 91 | ret[clean_k] = "" if ua.is_key(v) else value_label(v) 92 | 93 | if script_args is None or len(script_args) == 0: 94 | return ret 95 | 96 | elif len(script_args) == 1: 97 | process_pair(script_args[0], None) 98 | 99 | # Handle the case where the final argument in the list is a boolean flag. 100 | # This won't get picked up by partition. 
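# As a (hypothetical) example, script_args of ["--lr", "0.5", "--verbose"]
# yields {"lr": "0_5", "verbose": ""}: the period is replaced during label
# cleaning, and the trailing solo flag maps to an empty label value.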
101 | elif len(script_args) > 1: 102 | for k, v in u.partition(script_args, 2): 103 | process_pair(k, v) 104 | 105 | process_pair(script_args[-1], None) 106 | 107 | return ret 108 | 109 | 110 | def sanitize_labels( 111 | pairs: Union[Dict[str, str], List[Tuple[str, str]]], 112 | ) -> Dict[str, str]: 113 | """Turns a dict, or a list of unsanitized key-value pairs (each represented by 114 | a tuple) into a dictionary suitable to submit to Cloud as a label dict. 115 | 116 | """ 117 | if isinstance(pairs, dict): 118 | return sanitize_labels(pairs.items()) 119 | 120 | return {key_label(k): value_label(v) for (k, v) in pairs if key_label(k)} 121 | -------------------------------------------------------------------------------- /caliban/platform/gke/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/platform/gke/constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """constants for gke""" 17 | 18 | import re 19 | 20 | from caliban.config import DEFAULT_MACHINE_TYPE, JobMode 21 | from caliban.platform.cloud.types import GPU, GPUSpec 22 | from caliban.platform.gke.types import ReleaseChannel 23 | 24 | COMPUTE_SCOPE_URL = "https://www.googleapis.com/auth/compute" 25 | COMPUTE_READONLY_SCOPE_URL = "https://www.googleapis.com/auth/compute.readonly" 26 | CLOUD_PLATFORM_SCOPE_URL = "https://www.googleapis.com/auth/cloud-platform" 27 | KUBE_SYSTEM_NAMESPACE = "kube-system" 28 | DEFAULT_NAMESPACE = "default" 29 | BATCH_V1_VERSION = "batch/v1" 30 | NODE_SELECTOR_GKE_ACCELERATOR = "cloud.google.com/gke-accelerator" 31 | NODE_SELECTOR_INSTANCE_TYPE = "beta.kubernetes.io/instance-type" 32 | NODE_SELECTOR_PREEMPTIBLE = "cloud.google.com/gke-preemptible" 33 | CONTAINER_RESOURCE_LIMIT_TPU = "cloud-tpus.google.com" 34 | CONTAINER_RESOURCE_LIMIT_GPU = "nvidia.com/gpu" 35 | CONTAINER_RESOURCE_REQUEST_CPU = "cpu" 36 | CONTAINER_RESOURCE_REQUEST_MEM = "memory" 37 | TEMPLATE_META_ANNOTATION_TPU_DRIVER = "tf-version.cloud-tpus.google.com" 38 | DEFAULT_TPU_DRIVER = "1.14" 39 | ZONE_DEFAULT = "-" # all zones 40 | DEFAULT_MACHINE_TYPE_CPU = DEFAULT_MACHINE_TYPE[JobMode.CPU].value 41 | DEFAULT_MACHINE_TYPE_GPU = DEFAULT_MACHINE_TYPE[JobMode.GPU].value 42 | DEFAULT_GPU_SPEC = GPUSpec(GPU.P100, 1) 43 | DASHBOARD_JOB_URL = "https://console.cloud.google.com/kubernetes/job" 44 | DASHBOARD_CLUSTER_URL = "https://console.cloud.google.com/kubernetes/clusters/details" 45 | MAX_GB_PER_CPU = 64 46 | DEFAULT_CLUSTER_NAME = "blueshift" 47 | VALID_JOB_FILE_EXT = (".yaml", ".json") 48 | DEFAULT_RELEASE_CHANNEL = ReleaseChannel.REGULAR 49 | CLUSTER_API_VERSION = "v1beta1" 50 | 51 | # default min_cpu for gpu/tpu -accelerated jobs (in milli-cpu) 52 | DEFAULT_MIN_CPU_ACCEL = 1500 53 | # default min_cpu for cpu-only jobs (in milli-cpu) 54 | DEFAULT_MIN_CPU_CPU = 31000 55 | 56 | # default min_mem for gpu/tpu jobs (in MB) 57 | DEFAULT_MIN_MEM_ACCEL = 7000 58 | # default min_mem for cpu-only jobs (in MB) 59 | DEFAULT_MIN_MEM_CPU = 25000 60 | 61 | # ---------------------------------------------------------------------------- 62 | # The following urls specify kubernetes daemonsets that apply the appropriate 63 | # nvidia drivers to auto-created gpu instances. If this is not running, then your 64 | # gpu jobs will mysteriously fail to schedule, and you will be sad. 65 | # see https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers 66 | 67 | # daemonset for COS instances 68 | NVIDIA_DRIVER_COS_DAEMONSET_URL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml" 69 | 70 | # daemonset for Ubuntu instances 71 | NVIDIA_DRIVER_UBUNTU_DAEMONSET_URL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml" 72 | 73 | # ---------------------------------------------------------------------------- 74 | DNS_1123_RE = re.compile("\A[a-z0-9]([a-z0-9\-\.]*[a-z0-9])?\Z") 75 | -------------------------------------------------------------------------------- /caliban/platform/gke/types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """types relevant to gke""" 17 | 18 | from enum import Enum 19 | from typing import NamedTuple, Optional 20 | 21 | from google.auth.credentials import Credentials 22 | from kubernetes.client import V1Job 23 | 24 | # ---------------------------------------------------------------------------- 25 | # Node image types 26 | # see https://cloud.google.com/kubernetes-engine/docs/concepts/node-images 27 | NodeImage = Enum( 28 | "NODE_IMAGE", 29 | { 30 | "COS": "cos", 31 | "UBUNTU": "ubuntu", 32 | "COS_CONTAINERD": "cos_containerd", 33 | "UBUNTU_CONTAINERD": "ubuntu_containerd", 34 | }, 35 | ) 36 | 37 | # ---------------------------------------------------------------------------- 38 | # GKE operation status, see: 39 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.locations.operations 40 | OpStatus = Enum( 41 | "OP_STATUS", 42 | { 43 | "STATUS_UNSPECIFIED": "STATUS_UNSPECIFIED", 44 | "PENDING": "PENDING", 45 | "RUNNING": "RUNNING", 46 | "DONE": "DONE", 47 | "ABORTING": "ABORTING", 48 | }, 49 | ) 50 | 51 | # ---------------------------------------------------------------------------- 52 | # Credentials data (credentials, project id) 53 | CredentialsData = NamedTuple( 54 | "CredentialsData", 55 | [("credentials", Optional[Credentials]), ("project_id", Optional[str])], 56 | ) 57 | 58 | # ---------------------------------------------------------------------------- 59 | # GKE release channel, see: 60 | # https://cloud.google.com/kubernetes-engine/docs/concepts/release-channels 61 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#Cluster.ReleaseChannel 62 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#channel 63 | ReleaseChannel = Enum( 64 | "RELEASE_CHANNEL", 65 | { 66 | "UNSPECIFIED": "UNSPECIFIED", 67 | "RAPID": "RAPID", 68 | "REGULAR": "REGULAR", 69 | "STABLE": "STABLE", 70 | }, 71 | ) 72 | 73 | 74 | # ---------------------------------------------------------------------------- 75 | class JobStatus(Enum): 76 | """gke job status""" 77 | 78 | STATE_UNSPECIFIED = 0 79 | PENDING = 1 80 | RUNNING = 2 81 | FAILED = 3 82 | SUCCEEDED = 4 83 | UNAVAILABLE = 5 84 | 85 | def is_terminal(self) -> bool: 86 | return self.name in ["FAILED", "SUCCEEDED", "UNAVAILABLE"] 87 | 88 | @classmethod 89 | def from_job_info(cls, job_info: V1Job) -> "JobStatus": 90 | if job_info is None: 91 | return JobStatus.STATE_UNSPECIFIED 92 | 93 | if job_info.status is None: 94 | return JobStatus.STATE_UNSPECIFIED 95 | 96 | # completed 97 | if job_info.status.completion_time is not None: 98 | if job_info.status.succeeded is not None: 99 | if job_info.status.succeeded > 0: 100 | return JobStatus.SUCCEEDED 101 | else: 102 | return JobStatus.FAILED 103 | 104 | # active/pending 105 | if job_info.status.active is not None: 106 | if job_info.status.active > 0: 107 | return JobStatus.RUNNING 108 | else: 109 | return JobStatus.PENDING 110 | 111 | # unknown 112 | return JobStatus.STATE_UNSPECIFIED 113 | 
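# A minimal sketch of how JobStatus.from_job_info classifies jobs (assuming
# standard kubernetes-client objects; the values below are hypothetical):
#
#   from kubernetes.client import V1Job, V1JobStatus
#   done = V1Job(status=V1JobStatus(completion_time="2020-01-01", succeeded=1))
#   JobStatus.from_job_info(done)      # => JobStatus.SUCCEEDED
#   JobStatus.from_job_info(None)      # => JobStatus.STATE_UNSPECIFIED
#   JobStatus.SUCCEEDED.is_terminal()  # => True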
-------------------------------------------------------------------------------- /caliban/platform/notebook.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Functions required to interact with Docker to build and run images, shells 17 | and notebooks in a Docker environment. 18 | 19 | """ 20 | 21 | from typing import List, Optional 22 | 23 | from blessings import Terminal 24 | 25 | import caliban.config as c 26 | import caliban.docker.build as b 27 | import caliban.platform.shell as ps 28 | import caliban.util.fs as ufs 29 | 30 | t = Terminal() 31 | 32 | 33 | def run_notebook( 34 | job_mode: c.JobMode, 35 | port: Optional[int] = None, 36 | lab: Optional[bool] = None, 37 | version: Optional[bool] = None, 38 | run_args: Optional[List[str]] = None, 39 | **run_interactive_kwargs, 40 | ) -> None: 41 | """Start a notebook in the current working directory; the process will run 42 | inside of a Docker container that's identical to the environment available to 43 | Cloud jobs that are submitted by `caliban cloud`, or local jobs run with 44 | `caliban run`. 45 | 46 | If you pass `mount_home=True`, your Jupyter settings will persist across calls. 47 | 48 | Keyword args: 49 | 50 | - port: the port to pass to Jupyter when it boots, useful if you have 51 | multiple instances running on one machine. 52 | - lab: if True, starts jupyter lab, else jupyter notebook. 53 | - version: explicit Jupyter version to install. 54 | 55 | run_interactive_kwargs are all extra arguments taken by run_interactive. 56 | 57 | """ 58 | 59 | if port is None: 60 | port = ufs.next_free_port(8888) 61 | 62 | if lab is None: 63 | lab = False 64 | 65 | if run_args is None: 66 | run_args = [] 67 | 68 | inject_arg = b.NotebookInstall.lab if lab else b.NotebookInstall.jupyter 69 | jupyter_cmd = "lab" if lab else "notebook" 70 | jupyter_args = [ 71 | "-m", 72 | "jupyter", 73 | jupyter_cmd, 74 | "--ip=0.0.0.0", 75 | "--port={}".format(port), 76 | "--no-browser", 77 | ] 78 | docker_args = ["-p", "{}:{}".format(port, port)] + run_args 79 | 80 | ps.run_interactive( 81 | job_mode, 82 | entrypoint="python", 83 | entrypoint_args=jupyter_args, 84 | run_args=docker_args, 85 | inject_notebook=inject_arg, 86 | jupyter_version=version, 87 | **run_interactive_kwargs, 88 | ) 89 | -------------------------------------------------------------------------------- /caliban/platform/shell.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Functions required to interact with Docker to build and run images, shells 17 | and notebooks in a Docker environment. 18 | 19 | """ 20 | 21 | import os 22 | from pathlib import Path 23 | from typing import List, Optional 24 | 25 | import caliban.config as c 26 | import caliban.docker.build as b 27 | import caliban.platform.run as r 28 | 29 | 30 | def _home_mount_cmds(enable_home_mount: bool) -> List[str]: 31 | """Returns the argument needed by Docker to mount a user's local home directory 32 | into the home directory location inside their container. 33 | 34 | If enable_home_mount is false returns an empty list. 35 | 36 | """ 37 | ret = [] 38 | if enable_home_mount: 39 | ret = ["-v", "{}:{}".format(Path.home(), b.container_home())] 40 | return ret 41 | 42 | 43 | def _interactive_opts(workdir: str) -> List[str]: 44 | """Returns the basic arguments we want to run a docker process locally.""" 45 | return [ 46 | "-w", 47 | workdir, 48 | "-u", 49 | "{}:{}".format(os.getuid(), os.getgid()), 50 | "-v", 51 | "{}:{}".format(os.getcwd(), workdir), 52 | ] 53 | 54 | 55 | def run_interactive( 56 | job_mode: c.JobMode, 57 | workdir: Optional[str] = None, 58 | image_id: Optional[str] = None, 59 | run_args: Optional[List[str]] = None, 60 | mount_home: Optional[bool] = None, 61 | shell: Optional[b.Shell] = None, 62 | entrypoint: Optional[str] = None, 63 | entrypoint_args: Optional[List[str]] = None, 64 | **build_image_kwargs, 65 | ) -> None: 66 | """Start a live shell in the terminal, with all dependencies installed and the 67 | current working directory (and optionally the user's home directory) mounted. 68 | 69 | Keyword args: 70 | 71 | - job_mode: c.JobMode. 72 | - image_id: ID of the image to run. Supplying this will skip an image build. 73 | - run_args: extra arguments to supply to `docker run`. 74 | - mount_home: if true, mounts the user's $HOME directory into the container 75 | to `/home/$USERNAME`. If False, nothing. 76 | - shell: name of the shell to install into the container. Also configures the 77 | entrypoint if that's not supplied. 78 | - entrypoint: command to run. Defaults to the executable command for the 79 | supplied shell. 80 | - entrypoint_args: extra arguments to supply to the entrypoint. 81 | 82 | any extra kwargs supplied are passed through to build_image. 83 | 84 | """ 85 | if workdir is None: 86 | workdir = b.DEFAULT_WORKDIR 87 | 88 | if run_args is None: 89 | run_args = [] 90 | 91 | if entrypoint_args is None: 92 | entrypoint_args = [] 93 | 94 | if mount_home is None: 95 | mount_home = True 96 | 97 | if shell is None: 98 | # Only set a default shell if we're also mounting the home volume. 99 | # Otherwise a custom shell won't have access to the user's profile. 
100 | shell = b.default_shell() if mount_home else b.Shell.bash 101 | 102 | if entrypoint is None: 103 | entrypoint = b.SHELL_DICT[shell].executable 104 | 105 | interactive_run_args = ( 106 | _interactive_opts(workdir) 107 | + ["-it", "--entrypoint", entrypoint] 108 | + _home_mount_cmds(mount_home) 109 | + run_args 110 | ) 111 | 112 | r.run( 113 | job_mode=job_mode, 114 | run_args=interactive_run_args, 115 | script_args=entrypoint_args, 116 | image_id=image_id, 117 | shell=shell, 118 | workdir=workdir, 119 | **build_image_kwargs, 120 | ) 121 | -------------------------------------------------------------------------------- /caliban/resources/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /caliban/resources/cloud_sql_proxy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """Python wrapper around Google's cloud_sql_proxy tool that accepts 17 | configuration via a JSON dictionary of the form: 18 | 19 | { 20 | "proxy": "path to cloud_sql_proxy", 21 | "path": "cloud_sql socket path", 22 | "project": "gcp_project", 23 | "region": "gcp_region", 24 | "db": "database_name", 25 | "creds": "path_to_credentials (optional)" 26 | } 27 | 28 | This script lives in a dotfile 29 | """ 30 | 31 | import argparse 32 | import copy 33 | import json 34 | import logging 35 | import os 36 | import subprocess 37 | import sys 38 | 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | 42 | # ---------------------------------------------------------------------------- 43 | def _parser(): 44 | parser = argparse.ArgumentParser( 45 | description="cloud_sql_proxy wrapper that allows JSON configuration.", 46 | prog="cloud_sql_proxy", 47 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 48 | ) 49 | 50 | parser.add_argument("config", type=json.loads) 51 | return parser 52 | 53 | 54 | # ---------------------------------------------------------------------------- 55 | def _parse_flags(argv): 56 | return _parser().parse_args(argv[1:]) 57 | 58 | 59 | # ---------------------------------------------------------------------------- 60 | def main(proxy="", path="", project="", region="", db="", creds=None, debug=False): 61 | cmd = [ 62 | proxy, 63 | "-dir", 64 | path, 65 | "-instances", 66 | f"{project}:{region}:{db}", 67 | ] 68 | 69 | if not debug: 70 | cmd.append("-quiet") 71 | 72 | env = copy.copy(dict(os.environ)) 73 | 74 | if creds is not None: 75 | env["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(os.path.expanduser(creds)) 76 | 77 | subprocess.check_call(cmd, env=env) 78 | 79 | 80 | # ---------------------------------------------------------------------------- 81 | if __name__ == "__main__": 82 | m = _parse_flags(sys.argv) 83 | main(**m.config) 84 | -------------------------------------------------------------------------------- /caliban/util/argparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Utilities for our job runner. 18 | """ 19 | import argparse 20 | import itertools as it 21 | import os 22 | from typing import Dict, List, Optional, Tuple 23 | 24 | from blessings import Terminal 25 | 26 | import caliban.util as u 27 | import caliban.util.fs as ufs 28 | import schema as s 29 | 30 | t = Terminal() 31 | 32 | 33 | def expand_args(items: Dict[str, str]) -> List[str]: 34 | """Converts the input map into a sequence of k, v pair strings. A None value is 35 | interpreted to mean that the key is a solo flag; it's evicted from the 36 | output. 
37 | 38 | """ 39 | pairs = [[k, v] if v is not None else [k] for k, v in items.items()] 40 | return list(it.chain.from_iterable(pairs)) 41 | 42 | 43 | def argparse_schema(schema): 44 | """Wrapper that performs validation and converts SchemaErrors into 45 | ArgumentTypeErrors for better argument error reporting. 46 | 47 | """ 48 | 49 | def check(x): 50 | try: 51 | return schema.validate(x) 52 | except s.SchemaError as e: 53 | raise argparse.ArgumentTypeError(e.code) from None 54 | 55 | return check 56 | 57 | 58 | # TODO: Now that we use schema, validated_package and parse_kv_pair should be 59 | # converted to schema instances. 60 | 61 | 62 | def validated_package(path: str) -> u.Package: 63 | """Similar to generate_package, but raises an argparse validation error if 64 | the package doesn't actually exist in the filesystem. 65 | 66 | """ 67 | p = ufs.generate_package(path) 68 | 69 | if not os.path.isdir(p.package_path): 70 | raise argparse.ArgumentTypeError( 71 | """Directory '{}' doesn't exist. Code must be 72 | nested in a folder that exists in the current directory.""".format(p.package_path) 73 | ) 74 | 75 | filename = p.script_path 76 | if not ufs.file_exists_in_cwd(filename): 77 | raise argparse.ArgumentTypeError( 78 | """File '{}' doesn't exist locally as a script or python module; code 79 | must live inside the current directory.""".format(filename) 80 | ) 81 | 82 | return p 83 | 84 | 85 | def parse_kv_pair(s: str) -> Tuple[str, str]: 86 | """ 87 | Parse a key, value pair separated by '='. 88 | 89 | On the command line (argparse) a declaration will typically look like: 90 | foo=hello 91 | or 92 | foo="hello world" 93 | """ 94 | items = s.split("=") 95 | k = items[0].strip() # Remove whitespace around keys 96 | 97 | if len(items) <= 1: 98 | raise argparse.ArgumentTypeError( 99 | "Couldn't parse label '{}' into k=v format.".format(s) 100 | ) 101 | 102 | v = "=".join(items[1:]) 103 | return (k, v) 104 | 105 | 106 | def is_key(k: Optional[str]) -> bool: 107 | """Returns True if the argument is a valid argparse optional arg input, False 108 | otherwise. 109 | 110 | Strings that start with - or -- are considered valid for now. 111 | 112 | """ 113 | return k is not None and len(k) > 0 and k[0] == "-" 114 | -------------------------------------------------------------------------------- /caliban/util/auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Utilities for interacting with gcloud authentication and credentials. 17 | 18 | """ 19 | 20 | from subprocess import CalledProcessError, check_output 21 | from typing import Optional 22 | 23 | from google.oauth2 import service_account 24 | from google.oauth2.credentials import Credentials 25 | 26 | 27 | def auth_access_token() -> Optional[str]: 28 | """Attempts to fetch the local Oauth2 access token from the user's environment.
29 | Returns the token if it exists, or None if not. 30 | 31 | """ 32 | try: 33 | ret = check_output( 34 | ["gcloud", "auth", "print-access-token"], encoding="utf8" 35 | ).rstrip() 36 | return ret if len(ret) > 0 else None 37 | except CalledProcessError: 38 | return None 39 | 40 | 41 | def gcloud_auth_credentials() -> Optional[Credentials]: 42 | """Attempt to generate credentials from the oauth2 workflow triggered by 43 | `gcloud auth login`. Returns the wrapped credentials if an access token is 44 | available, or None otherwise. 45 | 46 | """ 47 | token = auth_access_token() 48 | if token: 49 | return Credentials(token) 50 | 51 | 52 | def gcloud_credentials(credentials_path: Optional[str] = None) -> Optional[Credentials]: 53 | credentials = None 54 | 55 | if credentials_path is not None: 56 | credentials = service_account.Credentials.from_service_account_file( 57 | credentials_path 58 | ) 59 | else: 60 | # attempt to fetch credentials acquired via `gcloud auth login`. If this 61 | # fails, the following API object will attempt to use application default 62 | # credentials. 63 | credentials = gcloud_auth_credentials() 64 | 65 | return credentials 66 | -------------------------------------------------------------------------------- /caliban/util/schema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Useful shared schemas. 18 | """ 19 | import os 20 | import sys 21 | from contextlib import contextmanager 22 | from typing import Optional 23 | 24 | import commentjson 25 | 26 | import caliban.util as u 27 | import schema as s 28 | 29 | 30 | class FatalSchemaError(Exception): 31 | """Wrapper for an exception that can bubble itself up to the top level of the 32 | program.""" 33 | 34 | def __init__(self, message, context): 35 | self.message = message 36 | self.context = context 37 | super().__init__(self.message) 38 | 39 | 40 | @contextmanager 41 | def error_schema(context: Optional[str] = None): 42 | """Wrap functions that check schemas in this context manager to throw an 43 | appropriate error with a nice message. 44 | 45 | """ 46 | prefix = "" 47 | if context is not None: 48 | prefix = f"\nValidation error while parsing {context}:\n" 49 | 50 | try: 51 | yield 52 | except s.SchemaError as e: 53 | raise FatalSchemaError(e.code, prefix) 54 | 55 | 56 | @contextmanager 57 | def fatal_errors(): 58 | """Context manager meant to wrap an entire program and present schema errors in 59 | an easy-to-read way. 60 | 61 | """ 62 | try: 63 | yield 64 | except FatalSchemaError as e: 65 | u.err(f"{e.context}\n{e.message}\n\n") 66 | sys.exit(1) 67 | except s.SchemaError as e: 68 | u.err(f"\n{e.code}\n\n") 69 | sys.exit(1) 70 | 71 | 72 | def load_json(path): 73 | with open(path) as f: 74 | return commentjson.load(f) 75 | 76 | 77 | Directory = s.Schema( 78 | os.path.isdir, 79 | error="""Directory '{}' doesn't exist in this directory.
Check yourself!""", 80 | ) 81 | 82 | File = s.Schema( 83 | lambda path: os.path.isfile(os.path.expanduser(path)), 84 | error="""File '{}' isn't a valid file on your system. Try again!""", 85 | ) 86 | 87 | Json = s.And( 88 | File, 89 | s.Use( 90 | load_json, error="""File '{}' doesn't seem to contain valid JSON. Try again!""" 91 | ), 92 | ) 93 | -------------------------------------------------------------------------------- /caliban/util/tqdm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Progress bar utilities. 18 | """ 19 | 20 | import contextlib 21 | import sys 22 | 23 | from absl import logging 24 | from blessings import Terminal 25 | 26 | import tqdm 27 | from tqdm.utils import _term_move_up 28 | 29 | t = Terminal() 30 | 31 | 32 | class TqdmFile(object): 33 | """Dummy file-like that will write to tqdm""" 34 | 35 | file = None 36 | prefix = _term_move_up() + "\r" 37 | 38 | def __init__(self, file): 39 | self.file = file 40 | self._carriage_pending = False 41 | 42 | def write(self, line): 43 | if self._carriage_pending: 44 | line = self.prefix + line 45 | self._carriage_pending = False 46 | 47 | if line.endswith("\r"): 48 | self._carriage_pending = True 49 | line = line[:-1] + "\n" 50 | 51 | tqdm.tqdm.write(line, file=self.file, end="") 52 | 53 | def flush(self): 54 | return getattr(self.file, "flush", lambda: None)() 55 | 56 | def isatty(self): 57 | return getattr(self.file, "isatty", lambda: False)() 58 | 59 | def close(self): 60 | return getattr(self.file, "close", lambda: None)() 61 | 62 | 63 | def config_logging(): 64 | """Overrides logging to go through TQDM. 65 | 66 | TODO use this call to kill then restore: 67 | https://github.com/tqdm/tqdm#redirecting-writing 68 | 69 | """ 70 | h = logging.get_absl_handler() 71 | _old = h.python_handler 72 | h._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) 73 | logging.use_python_logging() 74 | 75 | 76 | @contextlib.contextmanager 77 | def tqdm_logging(): 78 | """Overrides logging to go through TQDM. 79 | 80 | https://github.com/tqdm/tqdm#redirecting-writing 81 | 82 | """ 83 | handler = logging.get_absl_handler() 84 | orig = handler.python_handler 85 | 86 | try: 87 | handler._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr)) 88 | 89 | # The changes won't take effect if this hasn't been called. Defensively 90 | # call it again here. 
91 | logging.use_python_logging() 92 | yield orig.stream 93 | except Exception as exc: 94 | raise exc 95 | finally: 96 | handler._python_handler = orig 97 | -------------------------------------------------------------------------------- /codemeta.json: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld", 3 | "@type": "Code", 4 | "author": [ 5 | { 6 | "@id": "http://orcid.org/0000-0002-0545-6360", 7 | "@type": "Person", 8 | "email": "samritchie@google.com", 9 | "name": "Sam Ritchie", 10 | "affiliation": "Google" 11 | }, 12 | { 13 | "@id": "", 14 | "@type": "Person", 15 | "email": "aslone@google.com", 16 | "name": "Ambrose Slone", 17 | "affiliation": "Google" 18 | }, 19 | { 20 | "@id": "http://orcid.org/0000-0003-0625-3327", 21 | "@type": "Person", 22 | "email": "ramasesh@google.com", 23 | "name": "Vinay Ramasesh", 24 | "affiliation": "Google" 25 | } 26 | ], 27 | "identifier": "", 28 | "maintainer": "http://orcid.org/0000-0002-0545-6360", 29 | "codeRepository": "https://github.com/google/caliban", 30 | "issueTracker": "https://github.com/google/caliban/issues", 31 | "datePublished": "2020-06-22", 32 | "dateModified": "2020-06-22", 33 | "dateCreated": "2020-06-22", 34 | "description": "Docker-based job manager for reproducible workflows", 35 | "keywords": "python, docker, machine learning, reproducibility", 36 | "license": "Apache 2.0", 37 | "title": "Caliban", 38 | "version": "0.2.5" 39 | } 40 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This builds the base images that we can use for development at Blueshift. 16 | # Tensorflow 2.1 by default, but we can override the image when we call docker. 17 | # 18 | # docker build -t gcr.io/blueshift-playground/blueshift:cpu -f- . 32 | 33 | ARG GCLOUD_LOC=/usr/local/gcloud 34 | ARG PYTHON_VERSION=3.7 35 | 36 | # miniconda release archive is here: https://repo.anaconda.com/miniconda 37 | # see the docs here for managing python versions with conda: 38 | # https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-python.html 39 | ARG MINICONDA_VERSION=py37_4.8.2 40 | 41 | LABEL maintainer="samritchie@google.com" 42 | 43 | # See http://bugs.python.org/issue19846 44 | ENV LANG C.UTF-8 45 | 46 | # Install git so that users can declare git dependencies, and python3 plus 47 | # python3-virtualenv so we can generate an isolated Python environment inside 48 | # the container. 49 | RUN apt-get update && apt-get install -y --no-install-recommends \ 50 | git \ 51 | python3 \ 52 | python3-virtualenv \ 53 | wget && \ 54 | apt-get clean && \ 55 | rm -rf /var/lib/apt/lists/* 56 | 57 | # Some tools expect a "python" binary.
58 | RUN ln -s $(which python3) /usr/local/bin/python 59 | 60 | # install the google cloud SDK. 61 | RUN wget -nv \ 62 | https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \ 63 | mkdir -m 777 ${GCLOUD_LOC} && \ 64 | tar xvzf google-cloud-sdk.tar.gz -C ${GCLOUD_LOC} && \ 65 | rm google-cloud-sdk.tar.gz && \ 66 | ${GCLOUD_LOC}/google-cloud-sdk/install.sh --usage-reporting=false \ 67 | --path-update=false --bash-completion=false \ 68 | --disable-installation-options && \ 69 | rm -rf /root/.config/* && \ 70 | ln -s /root/.config /config && \ 71 | # Remove the backup directory that gcloud creates 72 | rm -rf ${GCLOUD_LOC}/google-cloud-sdk/.install/.backup 73 | 74 | # Path configuration 75 | ENV PATH $PATH:${GCLOUD_LOC}/google-cloud-sdk/bin 76 | 77 | COPY scripts/bashrc /etc/bash.bashrc 78 | 79 | # Install Miniconda and prep the system to activate our custom environment. 80 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh -O ~/miniconda.sh && \ 81 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \ 82 | rm ~/miniconda.sh && \ 83 | /opt/conda/bin/conda clean -tipsy && \ 84 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ 85 | echo ". /opt/conda/etc/profile.d/conda.sh" >> /etc/bash.bashrc && \ 86 | echo "conda activate caliban" >> /etc/bash.bashrc 87 | 88 | RUN yes | /opt/conda/bin/conda create --name caliban python=${PYTHON_VERSION} && /opt/conda/bin/conda clean --all 89 | 90 | # This allows a user to install packages in the conda environment once it 91 | # launches. 92 | RUN chmod -R 757 /opt/conda/envs/caliban && mkdir /.cache && chmod -R 757 /.cache 93 | 94 | # This is equivalent to activating the env. 95 | ENV PATH /opt/conda/envs/caliban/bin:$PATH 96 | 97 | # This makes pip recognize our conda environment 98 | # as a virtual environment, so it installs editables properly 99 | # See https://github.com/conda/conda/issues/5861 for details 100 | ENV PIP_SRC /opt/conda/envs/caliban/pipsrc 101 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
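14 | 15 | # This is the GPU counterpart of dockerfiles/Dockerfile above; a build 16 | # invocation might look like the following (the tag here is illustrative, 17 | # mirroring the CPU image): 18 | # 19 | #   docker build -t gcr.io/blueshift-playground/blueshift:gpu -f dockerfiles/Dockerfile.gpu .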
14 | 15 | ARG UBUNTU_VERSION=18.04 16 | ARG CUDA=10.1 17 | 18 | FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base 19 | 20 | # ARCH and CUDA are specified again because the FROM directive resets ARGs 21 | # (but their default value is retained if set previously) 22 | ARG ARCH 23 | ARG CUDA 24 | ARG CUDNN=7.6.4.38-1 25 | ARG CUDNN_MAJOR_VERSION=7 26 | ARG LIB_DIR_PREFIX=x86_64 27 | ARG LIBNVINFER=6.0.1-1 28 | ARG LIBNVINFER_MAJOR_VERSION=6 29 | 30 | # Needed for string substitution 31 | SHELL ["/bin/bash", "-c"] 32 | 33 | # These dependencies come from the list at the official Tensorflow GPU base 34 | # image: 35 | # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile 36 | 37 | RUN apt-get update && apt-get install -y --no-install-recommends \ 38 | build-essential \ 39 | cuda-command-line-tools-${CUDA/./-} \ 40 | # There appears to be a regression in libcublas10=10.2.2.89-1 which 41 | # prevents cublas from initializing in TF. See 42 | # https://github.com/tensorflow/tensorflow/issues/9489#issuecomment-562394257 43 | libcublas10=10.2.1.243-1 \ 44 | cuda-nvrtc-${CUDA/./-} \ 45 | cuda-cufft-${CUDA/./-} \ 46 | cuda-curand-${CUDA/./-} \ 47 | cuda-cusolver-${CUDA/./-} \ 48 | cuda-cusparse-${CUDA/./-} \ 49 | curl \ 50 | libcudnn7=${CUDNN}+cuda${CUDA} \ 51 | libfreetype6-dev \ 52 | libhdf5-serial-dev \ 53 | libzmq3-dev \ 54 | pkg-config \ 55 | software-properties-common \ 56 | unzip \ 57 | libnvinfer${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 58 | libnvinfer-plugin${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \ 59 | && apt-get clean \ 60 | && rm -rf /var/lib/apt/lists/* 61 | 62 | # For CUDA profiling, TensorFlow requires CUPTI. 63 | ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH 64 | 65 | # Link the libcuda stub to the location where tensorflow is searching for it and reconfigure 66 | # dynamic linker run-time bindings 67 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ 68 | && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ 69 | && ldconfig 70 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | # You can set these variables from the command line, and also 4 | # from the environment for the first two. 5 | SPHINXOPTS ?= 6 | SPHINXBUILD ?= ../env/bin/sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 20 | -------------------------------------------------------------------------------- /docs/_static/img/cloud/activate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/activate.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/create_new_key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/create_new_key.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/create_service_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/create_service_account.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/new_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/new_project.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/project_id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/project_id.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/select_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/select_project.png -------------------------------------------------------------------------------- /docs/_static/img/cloud/service_acct_permissions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/service_acct_permissions.png -------------------------------------------------------------------------------- /docs/_static/img/gke/cleanup_job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cleanup_job.png -------------------------------------------------------------------------------- /docs/_static/img/gke/cluster_create_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cluster_create_progress.png -------------------------------------------------------------------------------- /docs/_static/img/gke/cluster_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cluster_dashboard.png 
-------------------------------------------------------------------------------- /docs/_static/img/gke/job_logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/job_logs.png -------------------------------------------------------------------------------- /docs/_static/img/gke/node_pool_autoprovision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/node_pool_autoprovision.png -------------------------------------------------------------------------------- /docs/_static/img/gke/pod_events.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pod_events.png -------------------------------------------------------------------------------- /docs/_static/img/gke/pre_job_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pre_job_details.png -------------------------------------------------------------------------------- /docs/_static/img/gke/pre_job_submission.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pre_job_submission.png -------------------------------------------------------------------------------- /docs/_static/img/gke/stackdriver_logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/stackdriver_logs.png -------------------------------------------------------------------------------- /docs/_static/img/gke/unschedulable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/unschedulable.png -------------------------------------------------------------------------------- /docs/_static/img/gke/unschedulable_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/unschedulable_details.png -------------------------------------------------------------------------------- /docs/cli/caliban_build.rst: -------------------------------------------------------------------------------- 1 | caliban build 2 | ^^^^^^^^^^^^^ 3 | 4 | This command builds the Docker image used in :doc:`caliban_run`, 5 | :doc:`caliban_cloud` and friends, without actually executing the container or 6 | submitting it remotely. 7 | 8 | ``caliban build`` supports the following arguments: 9 | 10 | .. code-block:: text 11 | 12 | usage: caliban build [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY] 13 | [--extras EXTRAS] [-d DIR] 14 | module 15 | 16 | positional arguments: 17 | module Code to execute, in either 'trainer.train' or 18 | 'trainer/train.py' format. Accepts python scripts, 19 | modules or a path to an arbitrary script. 
20 | 21 | optional arguments: 22 | -h, --help show this help message and exit 23 | --helpfull show full help message and exit 24 | --nogpu Disable GPU mode and force CPU-only. 25 | --cloud_key CLOUD_KEY 26 | Path to GCloud service account key. (Defaults to 27 | $GOOGLE_APPLICATION_CREDENTIALS.) 28 | --extras EXTRAS setup.py dependency keys. 29 | --no_cache Disable Docker's caching mechanism and force a 30 | rebuild of the container from scratch. 31 | -d DIR, --dir DIR Extra directories to include. List these from large to 32 | small to take full advantage of Docker's build cache. 33 | -------------------------------------------------------------------------------- /docs/cli/caliban_notebook.rst: -------------------------------------------------------------------------------- 1 | caliban notebook 2 | ^^^^^^^^^^^^^^^^ 3 | 4 | This command generates the same isolated environment as the other commands, but 5 | instead of running your code or dropping you into a shell, runs a local instance 6 | of Jupyter based in the folder where you execute the command. 7 | 8 | ``caliban notebook`` supports the following arguments: 9 | 10 | .. code-block:: text 11 | 12 | usage: caliban notebook [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY] 13 | [--extras EXTRAS] [--docker_run_args DOCKER_RUN_ARGS] 14 | [-p PORT] [-jv JUPYTER_VERSION] [--lab] [--bare] 15 | 16 | optional arguments: 17 | -h, --help show this help message and exit 18 | --helpfull show full help message and exit 19 | --nogpu Disable GPU mode and force CPU-only. 20 | --cloud_key CLOUD_KEY 21 | Path to GCloud service account key. (Defaults to 22 | $GOOGLE_APPLICATION_CREDENTIALS.) 23 | --extras EXTRAS setup.py dependency keys. 24 | --docker_run_args DOCKER_RUN_ARGS 25 | String of args to add to Docker. 26 | -p PORT, --port PORT Port to use for Jupyter, inside container and locally. 27 | -jv JUPYTER_VERSION, --jupyter_version JUPYTER_VERSION 28 | Jupyterlab version to install via pip. 29 | --lab run 'jupyter lab', vs the default 'jupyter notebook'. 30 | --bare Skip mounting the $HOME directory; run an isolated 31 | Jupyter lab. 32 | 33 | By default ``caliban notebook`` runs ``jupyter notebook`` inside the container. To 34 | run JupyterLab, pass the ``--lab`` flag: 35 | 36 | .. code-block:: bash 37 | 38 | caliban notebook --lab 39 | 40 | As with the other commands, the only python dependencies available in the 41 | container will be dependencies that you declare explicitly in either: 42 | 43 | 44 | * a ``requirements.txt`` file 45 | * a ``setup.py`` file. 46 | 47 | Your setup file can declare groups of dependencies using the setuptools 48 | `extras_require 49 | <https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras-optional-features-with-extras-require>`_ 50 | feature. (See the :doc:`../explore/declaring_requirements` docs for more detail 51 | on how to use ``extras_require`` to create separate environments for GPU and 52 | CPU.) 53 | 54 | Mounted Home Directory 55 | ~~~~~~~~~~~~~~~~~~~~~~ 56 | 57 | ``caliban notebook`` mounts your ``$HOME`` directory into the container, which 58 | allows your Jupyter settings to persist across sessions. If you don't want this 59 | for some reason, run the command with the ``--bare`` flag. 60 | 61 | Custom Jupyter Port 62 | ~~~~~~~~~~~~~~~~~~~ 63 | 64 | If you'd like to run ``notebook`` using a different port, use the ``--port`` option: 65 | 66 | .. code-block:: bash 67 | 68 | caliban notebook --lab --port 8889 69 | 70 | On the Mac you'll have to pass ``--nogpu`` to ``notebook``\ , as the NVIDIA runtime 71 | isn't supported on non-Linux machines.
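72 | 73 | For example, a CPU-only JupyterLab session on a custom port combines the flags 74 | documented above: 75 | 76 | .. code-block:: bash 77 | 78 | caliban notebook --lab --nogpu --port 8889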
72 | -------------------------------------------------------------------------------- /docs/cli/caliban_resubmit.rst: -------------------------------------------------------------------------------- 1 | caliban resubmit 2 | ^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Often one needs to re-run an experiment after making code changes, or to run the 5 | same code with a different random seed. Caliban supports this with its 6 | ``resubmit`` command. 7 | 8 | This command allows you to resubmit jobs in an experiment group without having 9 | to remember or re-enter all of the parameters for your experiments. For example, 10 | suppose you run a set of experiments in an experiment group on CAIP: 11 | 12 | .. code-block:: 13 | 14 | caliban cloud --xgroup resubmit_test --nogpu --experiment_config experiment.json cpu.py -- --foo 3 15 | 16 | You then realize that you made a coding error, causing some of your jobs to 17 | fail: 18 | 19 | .. code-block:: 20 | 21 | $ caliban status --xgroup resubmit_test 22 | xgroup resubmit_test: 23 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None 24 | experiment id 37: cpu.py --foo 3 --sleep 2 25 | job 69 SUCCEEDED CAIP 2020-05-29 10:53:41 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105340_2 26 | experiment id 38: cpu.py --foo 3 --sleep 1 27 | job 68 FAILED CAIP 2020-05-29 10:53:40 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105338_1 28 | 29 | You then go and modify your code, and now you can use the ``resubmit`` command to 30 | run the jobs that failed: 31 | 32 | .. code-block:: 33 | 34 | $ caliban resubmit --xgroup resubmit_test 35 | the following jobs would be resubmitted: 36 | cpu.py --foo 3 --sleep 1 37 | job 68 FAILED CAIP 2020-05-29 10:53:40 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105338_1 38 | 39 | do you wish to resubmit these 1 jobs? [yN]: y 40 | rebuilding containers... 41 | ... 42 | Submitting request! 43 | ... 44 | 45 | Checking back in with ``caliban status`` shows that the code change worked, and 46 | now all of the experiments in the group have succeeded, and you can see that the 47 | container hash has changed for the previously failed jobs, reflecting your code 48 | change: 49 | 50 | .. code-block:: 51 | 52 | $ caliban status --xgroup resubmit_test 53 | xgroup resubmit_test: 54 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None 55 | experiment id 37: cpu.py --foo 3 --sleep 2 56 | job 69 SUCCEEDED CAIP 2020-05-29 10:53:41 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105340_2 57 | experiment id 38: cpu.py --foo 3 --sleep 1 58 | job 70 SUCCEEDED CAIP 2020-05-29 11:03:01 container: gcr.io/totoro-project/81b2087b5026:latest name: caliban_totoro_20200529_110259_1 59 | 60 | The ``resubmit`` command supports the following arguments: 61 | 62 | .. code-block:: 63 | 64 | $ caliban resubmit --help 65 | usage: caliban resubmit [-h] [--helpfull] [--xgroup XGROUP] [--dry_run] [--all_jobs] [--project_id PROJECT_ID] [--cloud_key CLOUD_KEY] 66 | 67 | optional arguments: 68 | -h, --help show this help message and exit 69 | --helpfull show full help message and exit 70 | --xgroup XGROUP experiment group 71 | --dry_run Don't actually submit; log everything that's going to happen. 
72 | --all_jobs resubmit all jobs regardless of current state, otherwise only jobs that are in FAILED or STOPPED state will be resubmitted 73 | --project_id PROJECT_ID 74 | ID of the GCloud AI Platform/GKE project to use for Cloud job submission and image persistence. (Defaults to $PROJECT_ID; errors if both the argument and $PROJECT_ID are empty.) 75 | --cloud_key CLOUD_KEY 76 | Path to GCloud service account key. (Defaults to $GOOGLE_APPLICATION_CREDENTIALS.) 77 | -------------------------------------------------------------------------------- /docs/cli/caliban_shell.rst: -------------------------------------------------------------------------------- 1 | caliban shell 2 | ^^^^^^^^^^^^^ 3 | 4 | This command is designed for fast, iterative workflows on scripts in an 5 | environment that's guaranteed to match the environment available to your code on 6 | Cloud. 7 | 8 | ``caliban shell`` supports the following arguments: 9 | 10 | .. code-block:: text 11 | 12 | usage: caliban shell [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY] 13 | [--extras EXTRAS] [--image_id IMAGE_ID] 14 | [--docker_run_args DOCKER_RUN_ARGS] [--shell {bash,zsh}] 15 | [--bare] 16 | 17 | optional arguments: 18 | -h, --help show this help message and exit 19 | --helpfull show full help message and exit 20 | --nogpu Disable GPU mode and force CPU-only. 21 | --cloud_key CLOUD_KEY 22 | Path to GCloud service account key. (Defaults to 23 | $GOOGLE_APPLICATION_CREDENTIALS.) 24 | --extras EXTRAS setup.py dependency keys. 25 | --image_id IMAGE_ID Docker image ID accessible in the local Docker 26 | registry. If supplied, Caliban will skip the 'docker 27 | build' step and use this image. 28 | --docker_run_args DOCKER_RUN_ARGS 29 | String of args to add to Docker. 30 | --shell {bash,zsh} This argument sets the shell used inside the container 31 | to one of Caliban's supported shells. Defaults to the 32 | shell specified by the $SHELL environment variable, or 33 | 'bash' if your shell isn't supported. 34 | --bare Skip mounting the $HOME directory; load a bare shell. 35 | 36 | Running ``caliban shell`` in any directory will generate a Docker image 37 | containing the minimal environment necessary to execute Python ML workflows and 38 | drop you into an interactive shell inside of that image. 39 | 40 | Caliban will copy in your Cloud credentials and set the required 41 | ``$GOOGLE_APPLICATION_CREDENTIALS`` env variable, so all Cloud interaction from 42 | Python should Just Work. (See the :doc:`guide on gcloud authentication 43 | <../explore/gcloud>` for more detail.) 44 | 45 | The base Caliban images also have ``gcloud`` installed; all ``gcloud`` and ``gsutil`` 46 | commands will work with the same permissions granted to the key found at 47 | ``$GOOGLE_APPLICATION_CREDENTIALS``. 48 | 49 | .. NOTE:: If you run ``caliban shell --bare``\ , your gcloud and gsutil will 50 | have the same permissions that they'll have in the cloud - the permissions 51 | granted by your JSON key file. If you just run ``caliban shell``\ , which 52 | mounts your home directory, ``gcloud`` and ``gsutil`` will preferentially 53 | load the config you have on your local machine. 54 | 55 | The only python dependencies available in the container will be dependencies 56 | that you declare explicitly in either: 57 | 58 | 59 | * a ``requirements.txt`` file 60 | * a ``setup.py`` file. 61 | 62 | Your setup file can declare groups of dependencies using the setuptools 63 | `extras_require 64 | <https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-extras-optional-features-with-extras-require>`_ 65 | feature.
(See the :doc:`../explore/declaring_requirements` docs for more detail 66 | on how to use ``extras_require`` to create separate environments for GPU and 67 | CPU.) 68 | 69 | By default your home directory will mount into the container, along with the 70 | folder you're in when you run ``caliban shell``. This means that: 71 | 72 | 73 | * your default ``bash`` (or ``zsh``\ ) environment will be available to you at the 74 | ``caliban shell``. 75 | * Any changes you make to files in the mounted directory will be immediately 76 | available to you to run with, say, ``python -m trainer.train`` or some similar 77 | command. 78 | 79 | On the Mac you'll have to pass ``--nogpu`` to ``shell``\ , as the NVIDIA runtime isn't 80 | supported on non-Linux machines. If you forget ``caliban`` will remind you and 81 | prevent you from getting too far. 82 | 83 | .. NOTE:: Caliban currently supports ``bash`` and ``zsh`` shells. The command 84 | will use your ``$SHELL`` environment variable to pick a default; to override 85 | the default, you can always pass the ``--shell`` argument, like this: 86 | ``caliban shell --shell bash``. 87 | 88 | .. WARNING:: One potential issue resulting from the fact that your home directory will mount 89 | into the container is that some binaries from your ``$HOME`` directory might 90 | leak into the container. For example, we have seen a case in which, in trying 91 | to run a CUDA binary to communicate with the GPU, ``caliban shell`` called a 92 | binary from the home directory rather than the one which the container should 93 | have used. This issue can be mitigated simply by using the ``--bare`` option, 94 | which will not mount the ``$HOME`` directory inside the container. 95 | -------------------------------------------------------------------------------- /docs/cli/caliban_status.rst: -------------------------------------------------------------------------------- 1 | caliban status 2 | ^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | The ``caliban status`` command allows you to check on the status of jobs submitted 5 | via caliban. There are two primary modes for this command. The first returns 6 | your most recent job submissions across all experiment groups: 7 | 8 | .. 
code-block:: 9 | 10 | $ caliban status --max_jobs 5 11 | most recent 5 jobs for user totoro: 12 | 13 | xgroup totoro-xgroup-2020-05-28-11-33-35: 14 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None 15 | experiment id 28: cpu.py --foo 3 --sleep 2 16 | job 56 STOPPED GKE 2020-05-28 11:33:35 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-rssqq 17 | experiment id 29: cpu.py --foo 3 --sleep 600 18 | job 57 STOPPED GKE 2020-05-28 11:33:36 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-c5x6v 19 | 20 | xgroup totoro-xgroup-2020-05-28-11-40-52: 21 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None 22 | experiment id 30: cpu.py --foo 3 --sleep -1 23 | job 58 STOPPED CAIP 2020-05-28 11:40:54 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114052_1 24 | experiment id 31: cpu.py --foo 3 --sleep 2 25 | job 59 STOPPED CAIP 2020-05-28 11:40:55 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114054_2 26 | experiment id 32: cpu.py --foo 3 --sleep 600 27 | job 60 RUNNING CAIP 2020-05-28 11:40:56 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114055_3 28 | 29 | Here we can see five jobs that we recently submitted, in two experiment groups. 30 | The first experiment group has jobs submitted to GKE, while the second has jobs 31 | submitted to CAIP. You can specify the maximum number of jobs to return using 32 | the ``--max_jobs`` flag. 33 | 34 | The second mode for the ``caliban status`` command returns jobs in a given 35 | experiment group, using the ``--xgroup`` flag: 36 | 37 | .. code-block:: 38 | 39 | $ caliban status --xgroup xg2 --max_jobs 2 40 | xgroup xg2: 41 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None 42 | experiment id 1: cpu.py --foo 3 --sleep -1 43 | job 34 FAILED CAIP 2020-05-08 18:26:56 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_1_20200508_182654 44 | job 37 FAILED CAIP 2020-05-08 19:01:08 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_1_20200508_190107 45 | experiment id 2: cpu.py --foo 3 --sleep 2 46 | job 30 SUCCEEDED LOCAL 2020-05-08 09:59:04 container: e2a0b8fca1dc 47 | job 35 SUCCEEDED CAIP 2020-05-08 18:26:57 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_2_20200508_182656 48 | experiment id 5: cpu.py --foo 3 --sleep 600 49 | job 36 STOPPED CAIP 2020-05-08 18:26:58 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_3_20200508_182657 50 | job 38 SUCCEEDED CAIP 2020-05-08 19:01:09 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_3_20200508_190108 51 | 52 | Here we can see the jobs that have been submitted as part of the ``xg2`` 53 | experiment group. By specifying ``--max_jobs 2`` in the call, we can see the two 54 | most recent job submissions for each experiment in the group. In this case, we 55 | can see that experiment 2 was submitted both locally and to CAIP at different 56 | times. We can also see that experiment 1 failed (due to an invalid parameter), 57 | and that the first submission to CAIP of experiment 5 was stopped by the user. 58 | 59 | Another interesting thing to note here is that the container hash is the same 60 | for each of these job submissions, so we can tell that the underlying code did 61 | not change between submissions.
62 | 63 | This command supports the following arguments: 64 | 65 | .. code-block:: 66 | 67 | $ caliban status --help 68 | usage: caliban status [-h] [--helpfull] [--xgroup XGROUP] 69 | [--max_jobs MAX_JOBS] 70 | 71 | optional arguments: 72 | -h, --help show this help message and exit 73 | --helpfull show full help message and exit 74 | --xgroup XGROUP experiment group 75 | --max_jobs MAX_JOBS Maximum number of jobs to view. If you specify an 76 | experiment group, then this specifies the maximum 77 | number of jobs per experiment to view. If you do not 78 | specify an experiment group, then this specifies the 79 | total number of jobs to return, ordered by creation 80 | date, or all jobs if max_jobs==0. 81 | -------------------------------------------------------------------------------- /docs/cli/caliban_stop.rst: -------------------------------------------------------------------------------- 1 | caliban stop 2 | ^^^^^^^^^^^^^^^^^^^^ 3 | 4 | This command allows you to stop running jobs submitted using caliban. 5 | 6 | For example, suppose you submit a group of experiments to GKE using an 7 | experiment config file like the following: 8 | 9 | .. code-block:: 10 | 11 | $ caliban cluster job submit --xgroup my-xgroup ... --experiment_config exp.json cpu.py -- 12 | 13 | After a bit, you realize that you made a coding error, so you'd like to stop 14 | these jobs so that you can fix your error without wasting cloud resources (and 15 | money). The ``caliban stop`` command makes this relatively simple: 16 | 17 | .. code-block:: 18 | 19 | $ caliban stop --xgroup my-xgroup 20 | the following jobs would be stopped: 21 | cpu.py --foo 3 --sleep -1 22 | job 61 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-57pr9 23 | cpu.py --foo 3 --sleep 2 24 | job 62 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-s67jt 25 | cpu.py --foo 3 --sleep 600 26 | job 63 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-gg9zm 27 | 28 | do you wish to stop these 3 jobs? [yN]: y 29 | 30 | stopping job: 61 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-57pr9 31 | stopping job: 62 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-s67jt 32 | stopping job: 63 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-gg9zm 33 | 34 | requested job cancellation, please be patient as it may take a short while for this status change to be reflected in the gcp dashboard or from the `caliban status` command. 35 | 36 | This command will stop all jobs that are in a ``RUNNING`` or ``SUBMITTED`` state, 37 | and checks with you to make sure this is what you *really* intend, as 38 | accidentally stopping a job that has been running for days is a particularly 39 | painful experience if your checkpointing is less than perfect. Similar to other 40 | caliban commands, you can use the ``--dry_run`` flag to just print what jobs would 41 | be stopped. 42 | 43 | This command supports the following arguments: 44 | 45 | .. 
code-block:: 46 | 47 | $ caliban stop --help 48 | usage: caliban stop [-h] [--helpfull] [--xgroup XGROUP] [--dry_run] 49 | 50 | optional arguments: 51 | -h, --help show this help message and exit 52 | --helpfull show full help message and exit 53 | --xgroup XGROUP experiment group 54 | --dry_run Don't actually submit; log everything that's going to 55 | happen. 56 | -------------------------------------------------------------------------------- /docs/cloud/adc.rst: -------------------------------------------------------------------------------- 1 | Application Default Credentials 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Instead of a service account key, you might also generate "Application Default 5 | Credentials" on your machine. 6 | 7 | To install these on your workstation, run 8 | 9 | .. code-block:: bash 10 | 11 | gcloud auth application-default login 12 | 13 | at your terminal, as described in `these gcloud docs 14 | <https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login>`_. 15 | That's it! 16 | -------------------------------------------------------------------------------- /docs/cloud/ai_platform_tpu.rst: -------------------------------------------------------------------------------- 1 | TPUs on AI Platform 2 | ^^^^^^^^^^^^^^^^^^^ 3 | 4 | .. NOTE:: This documentation is currently quite sparse; expect a tutorial soon. 5 | 6 | .. IMPORTANT:: Unlike on Cloud, TPUs on AI Platform only support (as of 7 | Dec 2019) Tensorflow versions 1.13 and 1.14. No JAX, no Pytorch. 8 | 9 | Caliban has Tensorflow version 2.1 hardcoded internally. Once the range of 10 | possible values expands we'll make this customizable. 11 | 12 | See `AI Platform's runtime version list 13 | <https://cloud.google.com/ai-platform/training/docs/runtime-version-list>`_ for more 14 | detail. 15 | 16 | 17 | If you supply the ``--tpu_spec NUM_TPUSxTPU_TYPE`` argument to your ``caliban 18 | cloud`` job, AI Platform will configure a worker node with that number of TPUs 19 | and attach it to the master node where your code runs. 20 | 21 | ``--tpu_spec`` is compatible with ``--gpu_spec``\ ; the latter configures the master 22 | node where your code lives, while the former sets up a separate worker instance. 23 | 24 | CPU mode by Default 25 | ~~~~~~~~~~~~~~~~~~~ 26 | 27 | Normally, all jobs default to GPU mode unless you supply ``--nogpu`` explicitly. 28 | This default flips when you supply a ``--tpu_spec`` and no explicit ``--gpu_spec``. 29 | In that case, ``caliban cloud`` will NOT attach a default GPU to your master 30 | instance. You have to ask for it explicitly. 31 | 32 | A CPU mode default also means that by default Caliban will try to install the 33 | ``'cpu'`` extra dependency set in your ``setup.py``\ , as described in the 34 | :doc:`../explore/declaring_requirements` guide. 35 | 36 | Authorizing TPU Access 37 | ~~~~~~~~~~~~~~~~~~~~~~ 38 | 39 | Before you can pass ``--tpu_spec`` to a job you'll need to authorize your Cloud 40 | TPU to access your service account. Check out `the AI Platform TPU tutorial 41 | <https://cloud.google.com/ai-platform/training/docs/using-tpus>`_ 42 | for detailed steps on how to achieve this. 43 | 44 | Example Workflows 45 | ~~~~~~~~~~~~~~~~~ 46 | 47 | Next you'll need to get the repository of TPU examples on your machine. 48 | 49 | .. code-block:: bash 50 | 51 | mkdir tpu-demos && cd tpu-demos 52 | curl https://codeload.github.com/tensorflow/tpu/tar.gz/r1.14 -o r1.14.tar.gz 53 | tar -xzvf r1.14.tar.gz && rm r1.14.tar.gz 54 | 55 | Check out the 56 | `AI Platform TPU tutorial <https://cloud.google.com/ai-platform/training/docs/using-tpus>`_ 57 | for the next steps, and check back for more detail about how to use that 58 | tutorial with Caliban.
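59 | 60 | For example, a sketch of a submission requesting a worker with 8 TPUs, assuming 61 | ``V2`` as the TPU type and ``trainer.train`` as a placeholder module name: 62 | 63 | .. code-block:: bash 64 | 65 | caliban cloud --tpu_spec 8xV2 trainer.train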
59 | -------------------------------------------------------------------------------- /docs/cloud/bucket.rst: -------------------------------------------------------------------------------- 1 | Creating a Bucket 2 | ^^^^^^^^^^^^^^^^^ 3 | 4 | If you need to store data that you generate during a :doc:`../cli/caliban_cloud` 5 | run, storing data in a Cloud bucket is the easiest choice. 6 | 7 | Your bucket is a reserved "folder" on the Cloud filesystem; you'll use this to 8 | save models and measurements, and as a staging ground for model workflows you're 9 | submitting to Cloud. 10 | 11 | To create your bucket, add the following lines to your ``~/.bashrc`` file: 12 | 13 | .. code-block:: bash 14 | 15 | export BUCKET_NAME="totoro_bucket" 16 | export REGION="us-central1" 17 | 18 | Run ``source ~/.bashrc`` to pick up the changes, then run the following command 19 | to create your new bucket: 20 | 21 | .. code-block:: bash 22 | 23 | gsutil mb -l $REGION gs://$BUCKET_NAME 24 | 25 | That's it. 26 | -------------------------------------------------------------------------------- /docs/cloud/gpu_specs.rst: -------------------------------------------------------------------------------- 1 | Customizing Machines and GPUs 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | This section discusses the default configurations for accelerators and machine 5 | types that Caliban requests when it submits jobs to Cloud. You'll also find 6 | instructions on how to request different GPUs or machine types for your job. 7 | 8 | Default GPU and Machine Types 9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | By default, if you don't supply ``--gpu_spec`` or ``--machine_type`` (both discussed 12 | below), Caliban will configure your jobs on the following hardware for each 13 | mode: 14 | 15 | 16 | * GPU mode (default): a single P100 GPU on an ``n1-standard-8`` machine 17 | * CPU mode: an ``n1-highcpu-32`` machine with no GPU attached 18 | 19 | You can read more about the various machine types available on AI platform `here 20 | <https://cloud.google.com/ml-engine/docs/machine-types>`_\ , or scan the 21 | following sections. 22 | 23 | 24 | Custom GPU Specs 25 | ~~~~~~~~~~~~~~~~ 26 | 27 | The optional ``--gpu_spec`` argument allows you to attach a custom number and type 28 | of GPU to the Cloud node that will run your containerized job on AI Platform. 29 | The required format is ``GPU_COUNTxGPU_TYPE``\ , as in this example: 30 | 31 | .. code-block:: bash 32 | 33 | caliban cloud --gpu_spec 2xV100 trainer.train 34 | 35 | This will submit your job to a node configured with 2 V100 GPUs to a machine in 36 | the region you specify via: 37 | 38 | 39 | * your ``$REGION`` environment variable, 40 | * the ``--region`` CLI argument 41 | * or, in the absence of either of those, the safe default of ``us-central1``. 42 | 43 | When you run any ``caliban cloud`` command, the program will immediately validate 44 | that the combination of GPU count, region, GPU type and machine type are 45 | compatible and error quickly if they're not. If you make the impossible request 46 | for 3 V100 GPUs: 47 | 48 | .. code-block:: bash 49 | 50 | caliban cloud --gpu_spec 3xV100 trainer.train 51 | 52 | you'll see this error message: 53 | 54 | .. code-block:: 55 | 56 | caliban cloud: error: argument --gpu_spec: 3 GPUs of type V100 aren't available 57 | for any machine type.
Try one of the following counts: {1, 2, 4, 8} 58 | 59 | For more help, consult this page for valid combinations of GPU count, GPU type 60 | and machine type: https://cloud.google.com/ml-engine/docs/using-gpus 61 | 62 | If you ask for a valid count, but a count that's not possible on the machine 63 | type you specified - 2 V100s on an ``n1-standard-96`` machine, for example: 64 | 65 | .. code-block:: bash 66 | 67 | caliban cloud --gpu_spec 2xV100 --machine_type n1-standard-96 trainer.train 68 | 69 | You'll see this error: 70 | 71 | .. code-block:: 72 | 73 | 'n1-standard-96' isn't a valid machine type for 2 V100 GPUs. 74 | 75 | Try one of these: ['n1-highcpu-16', 'n1-highmem-16', 'n1-highmem-2', 76 | 'n1-highmem-4', 'n1-highmem-8', 'n1-standard-16', 'n1-standard-4', 'n1-standard-8'] 77 | 78 | For more help, consult this page for valid combinations of GPU count, GPU type 79 | and machine type: https://cloud.google.com/ml-engine/docs/using-gpus 80 | 81 | If you know that your combination is correct, but Caliban's internal 82 | compatibility table hasn't been updated to support some new combination, you can 83 | skip all of these validations by providing ``--force`` as an option. 84 | 85 | Custom Machine Types 86 | ~~~~~~~~~~~~~~~~~~~~ 87 | 88 | The ``--machine_type`` option allows you to specify a custom node type for the 89 | master node where your containerized job will run. ``caliban cloud --help`` will 90 | show you all available choices. You can also read about the various machine 91 | types available on AI platform 92 | `here <https://cloud.google.com/ml-engine/docs/machine-types>`_. 93 | 94 | As an example, the following command will configure your job to run on an 95 | ``n1-highcpu-96`` instance with 8 V100 GPUs attached: 96 | 97 | .. code-block:: bash 98 | 99 | caliban cloud --gpu_spec 8xV100 --machine_type n1-highcpu-96 trainer.train 100 | 101 | As described above in :ref:`Custom GPU Specs`, ``--machine_type`` works with 102 | ``--gpu_spec`` to validate that the combination of GPU count, GPU type and 103 | machine type are all valid, and returns an error immediately if the combination 104 | is invalid. 105 | -------------------------------------------------------------------------------- /docs/cloud/labels.rst: -------------------------------------------------------------------------------- 1 | Job Labels 2 | ^^^^^^^^^^ 3 | 4 | AI Platform provides you with the ability to label your jobs with key-value 5 | pairs. Any arguments you provide using either :doc:`custom script arguments 6 | <../explore/custom_script_args>` or an :doc:`experiment broadcast 7 | <../explore/experiment_broadcasting>` will be added to your job as 8 | labels. 9 | 10 | In addition to these arguments, Caliban will add the following labels to each job: 11 | 12 | 13 | * **job_name**: ``caliban_totoro`` by default, or the argument you pass 14 | using ``caliban cloud --name custom_name`` 15 | * **gpu_enabled**\ : ``true`` by default, or ``false`` if you ran your job with 16 | ``--nogpu`` 17 | 18 | Cloud has fairly strict requirements on the format of each label's key and 19 | value; Caliban will transform your arguments into labels with the proper 20 | formatting, so you don't have to think about these. 21 | 22 | Additional Custom Labels 23 | ~~~~~~~~~~~~~~~~~~~~~~~~ 24 | 25 | You can also pass extra custom labels using ``-l`` or ``--label``\ : 26 | 27 | .. code-block:: bash 28 | 29 | caliban cloud -l key:value --label another_k:my_value ...
30 | 31 | These labels will be applied to every job if you're running an :doc:`experiment 32 | broadcast <../explore/experiment_broadcasting>`, or to the single job you're 33 | submitting otherwise. 34 | 35 | If you provide a label that conflicts with a user argument or experiment flag, 36 | your label will get knocked out. 37 | 38 | .. NOTE:: periods aren't allowed in labels, but are often quite meaningful; 39 | because of this caliban replaces periods with underscores before stripping 40 | out any restricted characters. 41 | -------------------------------------------------------------------------------- /docs/cloud/rate_limit.rst: -------------------------------------------------------------------------------- 1 | Rate Limiting 2 | ^^^^^^^^^^^^^ 3 | 4 | ``caliban cloud`` relies on AI Platform for rate limiting, so you can submit many, 5 | many jobs using an ``--experiment_config`` (up to ~1500 total, I believe?) and AI 6 | Platform will throttle submissions to the default limit of 60 submissions per 7 | minute. If your project's been granted higher quotas, you won't be throttled 8 | until you hit your project's rate limit. 9 | 10 | Job submission on Cloud presents a nice progress bar, with terminal colors and 11 | more. The log commands, URLs, jobIds and custom arguments are highlighted so 12 | it's clear which jobs are going through. On a failure the error message prints 13 | in red. 14 | -------------------------------------------------------------------------------- /docs/cloud/service_account.rst: -------------------------------------------------------------------------------- 1 | Creating a Service Account Key 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | This page describes how to generate and install a `Service Account Key 5 | <https://cloud.google.com/iam/docs/creating-managing-service-account-keys>`_. 6 | A service account key is a sort of "passport" that your code can use to 7 | authenticate itself during communication with Google's Cloud services. 8 | 9 | You can also provide Caliban with a service account key via the ``--cloud_key`` 10 | flag. If you do, Caliban will use this service account to authenticate itself 11 | with AI Platform when submitting jobs. (You would do this if you wanted to 12 | submit to some project you didn't own, for example.) 13 | 14 | To create a service account key, visit the `Service Accounts page 15 | <https://console.cloud.google.com/iam-admin/serviceaccounts>`_ 16 | and select the project you created earlier. 17 | 18 | Click "Create Service Account" at the top of the page: 19 | 20 | .. image:: /_static/img/cloud/activate.png 21 | :width: 600 22 | :align: center 23 | :alt: Activate Billing 24 | 25 | At the next form, under **"Service Account Name"**, type something like 26 | **totoro_key** and click **"Create"**. 27 | 28 | This will bring up a page titled **"Service Account Permissions"**. Select 29 | **Project > Owner** from the list: 30 | 31 | .. image:: /_static/img/cloud/service_acct_permissions.png 32 | :width: 600 33 | :align: center 34 | :alt: Service Account Permissions 35 | 36 | Then click **"Continue"** and **"Done"**. You now have a service account. You'll 37 | need to download it to your machine for Caliban to use it. 38 | 39 | Downloading the Service Account Key 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | Click on the hyperlinked name of the key - something like 43 | ``totoro-key@totoro-lives.iam.gserviceaccount.com`` - in the service accounts 44 | list. 45 | 46 | Near the bottom of the page, click "Add Key" > "Create New Key": 47 | 48 | ..
image:: /_static/img/cloud/create_new_key.png 49 | :width: 600 50 | :align: center 51 | :alt: Create New Key 52 | 53 | Select **"JSON"** for key type and click **"Create"**. This will download a file 54 | with a name like ``totoro-lives-3df07b8c97a0.json`` to your machine. 55 | 56 | Find the file in your terminal (probably in your Downloads folder) and run the 57 | following command to move it to a convenient, easy-to-remember location: 58 | 59 | .. code-block:: bash 60 | 61 | mv [NEW_FILENAME].json ~/.config/service_key.json 62 | 63 | To make this key accessible to Caliban, you'll need to set an environment variable called 64 | ``GOOGLE_APPLICATION_CREDENTIALS`` in your shell to the path of your new service 65 | account key. Add the following line to your ``~/.bashrc``: 66 | 67 | .. code-block:: bash 68 | 69 | export GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/service_key.json 70 | 71 | If Caliban sees this environment variable set, it will go ahead and bake these 72 | credentials into your container, making them accessible to your code even inside 73 | the Docker environment. 74 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | # -- Project information ----------------------------------------------------- 18 | 19 | project = "Caliban" 20 | copyright = "2020, Google LLC" 21 | author = "The Caliban authors" 22 | 23 | # The short X.Y version 24 | version = "" 25 | # The full version, including alpha/beta/rc tags 26 | release = "" 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | "sphinx.ext.autodoc", 35 | "sphinx.ext.autosectionlabel", 36 | "sphinx.ext.autosummary", 37 | "sphinx.ext.intersphinx", 38 | "sphinx.ext.mathjax", 39 | "sphinx.ext.napoleon", 40 | "sphinx.ext.viewcode", 41 | ] 42 | 43 | intersphinx_mapping = {"python": ("https://docs.python.org/3/", None)} 44 | 45 | source_suffix = {".rst": "restructuredtext", ".txt": "restructuredtext"} 46 | 47 | # Add any paths that contain templates here, relative to this directory. 48 | templates_path = ["_templates"] 49 | 50 | # List of patterns, relative to source directory, that match files and 51 | # directories to ignore when looking for source files. 52 | # This pattern also affects html_static_path and html_extra_path. 53 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "requirements.txt"] 54 | 55 | # The name of the Pygments (syntax highlighting) style to use.
56 | pygments_style = None 57 | autosummary_generate = True 58 | napoleon_use_rtype = False 59 | 60 | mathjax_config = { 61 | "TeX": {"equationNumbers": {"autoNumber": "AMS", "useLabelIds": True}}, 62 | } 63 | 64 | # -- Options for HTML output ------------------------------------------------- 65 | 66 | # The theme to use for HTML and HTML Help pages. See the documentation for 67 | # a list of builtin themes. 68 | # 69 | html_theme = "sphinx_rtd_theme" 70 | 71 | # Theme options are theme-specific and customize the look and feel of a theme 72 | # further. For a list of options available for each theme, see the 73 | # documentation. 74 | html_theme_options = { 75 | "logo_only": True, 76 | } 77 | 78 | # Add any paths that contain custom static files (such as style sheets) here, 79 | # relative to this directory. They are copied after the builtin static files, 80 | # so a file named "default.css" will overwrite the builtin "default.css". 81 | html_static_path = ["_static"] 82 | 83 | htmlhelp_basename = "Calibandoc" 84 | epub_title = project 85 | epub_exclude_files = ["search.html"] 86 | -------------------------------------------------------------------------------- /docs/explore/calibanconfig.rst: -------------------------------------------------------------------------------- 1 | calibanconfig 2 | ^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Caliban supports customization through a file called ``.calibanconfig.json`` 5 | that lives in your project's directory. Features are limited for now, but stay 6 | tuned for more. 7 | 8 | Custom Apt Packages 9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 10 | 11 | Caliban provides support for custom apt packages inside your container. To 12 | require custom apt packages, create a file called ``.calibanconfig.json`` inside 13 | your project's directory. 14 | 15 | The ``.calibanconfig.json`` should contain a single JSON dictionary with an 16 | ``"apt_packages"`` key. The value under this key can be either a list, or a 17 | dictionary with ``"gpu"`` and ``"cpu"`` keys. For example, any of the following are 18 | valid: 19 | 20 | .. code-block:: 21 | 22 | # This is a list by itself. Comments are fine, by the way. 23 | { 24 | "apt_packages": ["libsm6", "libxext6", "libxrender-dev"] 25 | } 26 | 27 | This works too: 28 | 29 | .. code-block:: 30 | 31 | # You can also include a dictionary with different deps 32 | # for gpu and cpu modes. It's fine to leave either of these blank, 33 | # or not include it. 34 | { 35 | "apt_packages": { 36 | "gpu": ["libsm6", "libxext6", "libxrender-dev"], 37 | "cpu": ["some_other_package"] 38 | } 39 | } 40 | 41 | These values will do what you expect and run ``apt-get install <package>`` 42 | for each package. Packages are alphabetized, so changing the order won't 43 | invalidate Docker's build cache. 44 | 45 | Custom Base Images 46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 47 | For details on Caliban's base images, see :ref:`What's the Base Docker Image?`. 48 | 49 | You can specify a custom base image for Caliban to use in your ``.calibanconfig.json`` file 50 | by adding an entry with the ``base_image`` key as follows: 51 | 52 | .. code-block:: json 53 | 54 | { 55 | "base_image": "gcr.io/blueshift-playground/blueshift:gpu-ubuntu1804-py38-cuda101" 56 | } 57 | 58 | You can also specify different base images for ``cpu`` and ``gpu`` modes as follows: 59 | 60 | .. 
code-block:: json 61 | 62 | { 63 | "base_image": { 64 | "cpu": "gcr.io/blueshift-playground/blueshift:cpu-ubuntu1804-py38", 65 | "gpu": "gcr.io/blueshift-playground/blueshift:gpu-ubuntu1804-py38-cuda101" 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /docs/explore/custom_docker_run.rst: -------------------------------------------------------------------------------- 1 | Custom Docker Run Arguments 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | ``caliban {shell, notebook, run}`` all perform some combination of ``docker build`` 5 | and ``docker run`` to provide their functionality. Each provides various sane 6 | defaults that should be fine for most use cases; sometimes, however, you might 7 | need to break through the ``caliban`` abstraction layer and pass arguments to 8 | ``docker run`` directly. 9 | 10 | One example would be if you need to set environment variables inside the 11 | container, or limit which GPUs are mounted into the container. 12 | 13 | To pass custom options to ``docker run``\ , use ``--docker_run_args``\ , like this: 14 | 15 | .. code-block:: bash 16 | 17 | caliban run --docker_run_args "--env MY_VARIABLE" trainer.train 18 | 19 | This particular command will set ``MY_VARIABLE`` inside the container to its 20 | current value in the shell where you run the above command, as described in the 21 | `docker run `_ 22 | documentation. (The 23 | ``docker run`` docs 24 | have information on all possible options.) 25 | 26 | This argument is available in ``caliban run``\ , ``caliban shell`` and ``caliban 27 | notebook``. 28 | 29 | You may see an error if you pass some flag or argument that ``caliban`` already 30 | supplies. Caliban prints the ``docker run`` command it executes on each 31 | invocation, so if you need full control you can always use ``docker run`` 32 | directly. 33 | -------------------------------------------------------------------------------- /docs/explore/custom_script_args.rst: -------------------------------------------------------------------------------- 1 | Custom Script Arguments 2 | ^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | In ``caliban run`` or ``caliban cloud`` modes, if you pass ``--`` to the CLI, Caliban 5 | will stop parsing arguments and pass everything after ``--`` through to your 6 | script, untouched. If you run: 7 | 8 | .. code-block:: bash 9 | 10 | caliban cloud trainer.train -- --epochs 2 --job_dir my_directory 11 | 12 | Your script will execute inside the container environment with the following 13 | command: 14 | 15 | .. code-block:: bash 16 | 17 | python -m trainer.train --epochs 2 --job_dir my_directory 18 | 19 | This feature is compatible with :doc:`experiment_broadcasting` in ``cloud``, 20 | ``run`` or ``cluster`` mode; arguments are prepended to the list generated by 21 | the specific experiment being executed from your experiment config. 22 | -------------------------------------------------------------------------------- /docs/explore/declaring_requirements.rst: -------------------------------------------------------------------------------- 1 | Declaring Requirements 2 | ^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | To use a Python library in your Caliban-based workflow you'll need to declare it 5 | in either a 6 | 7 | 8 | * ``requirements.txt`` file in the directory, or a 9 | * ``setup.py`` file, or 10 | * both of these together. 11 | 12 | If you run any of the Caliban commands in a directory without these, your image 13 | will have access to bare Python alone with no dependencies.
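As a quick illustration - the package names and repository URL below are placeholders, not anything Caliban requires - a minimal ``requirements.txt`` might look like:

.. code-block:: text

    absl-py
    numpy
    git+https://github.com/example-org/example-lib.git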
14 | 15 | A ``requirements.txt`` file is the simplest way to get started. See the 16 | `pip docs `_ for more 17 | information on the structure here. You've got ``git`` inside the container, so 18 | ``git`` dependencies will work fine. 19 | 20 | Setup.py and Extra Dependency Sets 21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 22 | 23 | Declaring your dependencies in a ``setup.py`` file gives you the ability to 24 | declare different sets of dependencies for the different Caliban modes (CPU vs 25 | GPU), in addition to your own custom dependency sets. 26 | 27 | This solves the problem of depending on, say, ``tensorflow-gpu`` for a GPU job, 28 | and ``tensorflow`` for normal, CPU-only jobs, without having to modify your 29 | dependency file. 30 | 31 | Here's an example ``setup.py`` file: 32 | 33 | .. code-block:: python 34 | 35 | from setuptools import find_packages 36 | from setuptools import setup 37 | 38 | setup( 39 | name='hello-tensorflow', 40 | version='0.1', 41 | install_requires=['absl-py', 'google-cloud-storage'], 42 | extras_require={ 43 | 'cpu': ['tensorflow==2.0.*'], 44 | 'gpu': ['tensorflow-gpu==2.0.*'], 45 | }, 46 | packages=find_packages(), 47 | description='Hello Tensorflow setup file.') 48 | 49 | This project has two normal dependencies - ``'absl-py'`` for flags, and 50 | ``'google-cloud-storage'`` to interact with Cloud buckets. 51 | 52 | The ``setup.py`` file declares its Tensorflow dependencies in a dictionary under 53 | the ``extras_require`` key. If you're using pip, you would install dependencies 54 | from just ``install_requires`` by running 55 | 56 | .. code-block:: bash 57 | 58 | pip install . 59 | 60 | If you instead ran 61 | 62 | .. code-block:: bash 63 | 64 | pip install .[gpu] 65 | 66 | ``pip`` would install 67 | 68 | 69 | * the entries under ``install_requires``\ , 70 | * AND, additionally, the entries under the ``'gpu'`` key of the ``extras_require`` 71 | dictionary. 72 | 73 | By default, if you have a ``setup.py`` file in your directory, caliban will do the 74 | latter and attempt to install a ``'gpu'`` set of extras, like 75 | 76 | .. code-block:: 77 | 78 | pip install .[gpu] 79 | 80 | If you pass ``--nogpu`` to any of the commands, Caliban will similarly attempt to 81 | run 82 | 83 | .. code-block:: 84 | 85 | pip install .[cpu] 86 | 87 | If you don't declare these keys, don't worry. You'll see a warning that the 88 | extras dependencies didn't exist, and everything will proceed, no problem. 89 | 90 | If you have some other set of dependencies you want to install, you can pass 91 | ``--extras my_deps``\ , or ``-e my_deps``\ , to any of the caliban modes to install 92 | those in addition to the ``cpu`` or ``gpu`` dependency set. 93 | 94 | You can provide many sets, like this: 95 | 96 | .. code-block:: bash 97 | 98 | caliban cloud -e my_deps -e logging_extras 99 | 100 | And Caliban will install the dependencies from all declared sets inside of the 101 | containerized environment. 102 | -------------------------------------------------------------------------------- /docs/explore/exp_stdin.rst: -------------------------------------------------------------------------------- 1 | Experiment Config via stdin, pipes 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | In addition to passing an explicit JSON file to ``caliban cloud 5 | --experiment_config``\ , if you pass the string ``stdin`` as the flag's value, 6 | ``caliban cloud`` will attempt to read the experiment config from ``stdin``.
7 | 8 | As an example, this command pipes in a config and also passes ``--dry_run`` to 9 | show the series of jobs that WILL be submitted when the ``--dry_run`` flag is 10 | removed: 11 | 12 | .. code-block:: bash 13 | 14 | cat experiment.json | caliban cloud --experiment_config stdin --dry_run trainer.train 15 | 16 | Because ``experiment.json`` is a file on disk, the above command is not that 17 | interesting, and is equivalent to running: 18 | 19 | .. code-block:: bash 20 | 21 | caliban cloud --experiment_config experiment.json --dry_run trainer.train 22 | 23 | Things get more interesting when you need to dynamically generate an experiment 24 | config. 25 | 26 | Imagine you've written some python script ``generate_config.py`` that builds up a 27 | list of complex, interdependent experiments. If you modify that script to print 28 | a ``json`` list of ``json`` dicts when executed, you can pipe the results of the 29 | script directly into ``caliban cloud``\ : 30 | 31 | .. code-block:: bash 32 | 33 | python generate_config.py --turing_award 'winning' | \ 34 | caliban cloud --experiment_config stdin --dry_run trainer.train 35 | 36 | And see immediately (thanks to ``--dry_run``\ ) the list of jobs that would be 37 | executed on AI Platform with a real run. 38 | 39 | 40 | Experiment File Expansion and Pipes 41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | The :doc:`../cli/expansion` command described :doc:`above <../cli/expansion>` 44 | allows you to expand an experiment config into its component JSON objects. 45 | Because these are printed to ``stdout``\ , you can pipe them directly into 46 | Caliban's commands, like this: 47 | 48 | .. code-block:: bash 49 | 50 | expansion experiment.json | caliban cloud --experiment_config stdin trainer.train 51 | 52 | You can also insert your own script into the middle of this pipeline. Imagine a 53 | script called ``my_script.py`` that: 54 | 55 | 56 | * reads a JSON list of experiments in via ``stdin`` 57 | * modifies each entry by inserting a new key whose value is a function of one 58 | or more existing entries 59 | * prints the resulting JSON list back out to ``stdout`` 60 | 61 | You could sequence these steps together like so: 62 | 63 | .. code-block:: bash 64 | 65 | expansion experiment.json | \ 66 | my_script.py | \ 67 | caliban cloud --experiment_config stdin --dry_run trainer.train 68 | 69 | 70 | If you supply ``--dry_run`` to caliban, as in the example above, caliban will 71 | print out all of the jobs that this particular command will kick off when you 72 | remove ``--dry_run``. This is a great way to generate complex experiments and test 73 | everything out before submitting your jobs. 74 | -------------------------------------------------------------------------------- /docs/explore/experiment_groups.rst: -------------------------------------------------------------------------------- 1 | Experiment Groups 2 | ^^^^^^^^^^^^^^^^^ 3 | 4 | Caliban supports grouping experiments into a collection called an *experiment 5 | group*. This allows you to do things like monitor all of the jobs in a given 6 | group, stop all running jobs in a group, or re-run all of the jobs in a group. 7 | 8 | Each of the caliban compute backends supports specifying an experiment group via 9 | the ``--xgroup`` flag: 10 | 11 | .. code-block:: 12 | 13 | $ caliban run --xgroup my-xgroup ... 14 | $ caliban cloud --xgroup my-xgroup ... 15 | $ caliban cluster job submit --xgroup my-xgroup ...
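Once a group exists, you can act on all of its jobs together with Caliban's job-management commands. As a sketch - we're assuming here that ``caliban status``, ``caliban stop`` and ``caliban resubmit`` accept the same ``--xgroup`` flag; see each command's CLI docs for the authoritative flags:

.. code-block::

    $ caliban status --xgroup my-xgroup    # monitor every job in the group
    $ caliban stop --xgroup my-xgroup      # stop any jobs still running
    $ caliban resubmit --xgroup my-xgroup  # re-run the group's jobs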
16 | 17 | If you don't specify an experiment group when submitting jobs via caliban, a new 18 | experiment group will be generated for you, so you don't need to use them if you 19 | don't want to. Also, the existence of this group should be transparent to you. 20 | 21 | You can add new jobs to an existing experiment group simply by specifying the 22 | same group on different caliban job submission calls: 23 | 24 | .. code-block:: 25 | 26 | caliban cloud --xgroup my-xgroup ... foo.py -- 27 | ... 28 | (some time later...) 29 | caliban cloud --xgroup my-xgroup ... bar.py -- 30 | 31 | The experiment group ``my-xgroup`` will contain the experiments generated by both 32 | of the caliban calls, and you can then perform different operations on these as 33 | described in the sections below. 34 | -------------------------------------------------------------------------------- /docs/explore/mac.rst: -------------------------------------------------------------------------------- 1 | Caliban on a Mac 2 | ^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | If you're developing on your Macbook, you'll be able to build GPU containers, 5 | but you won't be able to run them locally. You can still submit GPU jobs to AI 6 | Platform! 7 | 8 | To use Caliban's ``shell``\ , ``notebook`` and ``run``\ , you'll have to pass 9 | ``--nogpu`` as a keyword argument. If you don't do this you'll see the following 10 | error: 11 | 12 | .. code-block:: text 13 | 14 | [totoro@totoro-macbookpro hello-tensorflow (master)]$ caliban run trainer.train 15 | 16 | 'caliban run' doesn't support GPU usage on Macs! Please pass --nogpu to use this command. 17 | 18 | (GPU mode is fine for 'caliban cloud' from a Mac; just nothing that runs locally.) 19 | 20 | The :doc:`../getting_started/prerequisites` page covers Macbook installation of 21 | Docker and other dependencies. 22 | -------------------------------------------------------------------------------- /docs/explore/script_vs_module.rst: -------------------------------------------------------------------------------- 1 | What can Caliban Execute? 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Caliban's commands can run python files as modules or scripts. If you need more 5 | customization, you can run arbitrary shell scripts with Caliban. 6 | 7 | Script vs Module 8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 9 | 10 | Inside the containerized environment, your Python script will run as a module or 11 | a script, depending on the format of the argument you supply to caliban. If you 12 | explicitly pass a python module, with components separated by dots: 13 | 14 | .. code-block:: bash 15 | 16 | caliban cloud trainer.train -- --epochs 2 --job_dir my_directory 17 | 18 | Your script will execute inside the container environment with the following 19 | command: 20 | 21 | .. code-block:: bash 22 | 23 | python -m trainer.train --epochs 2 --job_dir my_directory 24 | 25 | If instead you supply a relative path to the python file, like this: 26 | 27 | .. code-block:: bash 28 | 29 | caliban cloud trainer/train.py -- --epochs 2 --job_dir my_directory 30 | 31 | Caliban will execute your code as a python *script* by passing it directly to 32 | python without the ``-m`` flag, like this: 33 | 34 | .. code-block:: bash 35 | 36 | python trainer/train.py --epochs 2 --job_dir my_directory 37 | 38 | What does this mean for you? 
Concretely it means that if you execute your code 39 | as a module, all imports inside of your script have to be declared relative to 40 | the root directory, i.e., the directory where you run the caliban command. If you 41 | have other files inside of the ``trainer`` directory, you'll have to import them 42 | from ``trainer/train.py`` like this: 43 | 44 | .. code-block:: python 45 | 46 | import trainer.util 47 | from trainer.cloud import load_bucket 48 | 49 | We do this because it enforces a common structure for all code. The reproducible 50 | unit is the directory that holds all of the code. The script doesn't live in 51 | isolation; it's part of a project, and depends on the other files in the code 52 | tree as well as the dependencies declared in the root directory. 53 | 54 | If you run your code as a script, imports will only work if they're relative to 55 | the file itself, not to the running code. 56 | 57 | I highly recommend running code as a module! 58 | 59 | Using Caliban with Shell Scripts 60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | 62 | Caliban can build containers for you that will execute arbitrary shell scripts, 63 | in addition to python code. 64 | 65 | If you pass a relative path that points to any file other than: 66 | 67 | 68 | * a python module, or 69 | * an explicit path to a python file ending with ``.py``\ , 70 | 71 | to ``caliban cloud``\ , ``caliban run`` or one of the other modes that accept 72 | modules, caliban will execute the code as a bash script. 73 | 74 | This feature is compatible with :doc:`custom script arguments 75 | ` or an :doc:`experiment broadcast 76 | `; your shell script will receive the same flags that 77 | any python module would receive. 78 | -------------------------------------------------------------------------------- /docs/explore/why_caliban.rst: -------------------------------------------------------------------------------- 1 | Why Caliban and Docker? 2 | ^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Caliban uses Docker to build isolated environments for your research code. What 5 | does this mean, and why would you want to do this? 6 | 7 | One major source of friction in machine learning research is the potential 8 | mismatch between the environment where your code runs during local development 9 | and the environment in AI Platform or Cloud. Here's a typical situation: 10 | 11 | 12 | * You run your code locally against some set of dependencies you installed 13 | months ago in the virtual environment you use for all your code. 14 | * You get everything working and submit it to Cloud. Minutes later you see a 15 | failure - your specified Tensorflow version is wrong. You submit again, 16 | specifying the beta of TF 2.0 that you've been using... and the job fails. 17 | That version's not available in Cloud. 18 | * Finally the submission works, but the job fails again. The ``gsutil`` command 19 | you've been shelling out to in order to save your models locally isn't available on 20 | AI Platform. 21 | * You sigh and look at the clock. It's 4pm. Should I have another cup of 22 | coffee? What am I even doing? Is this what my life has become? 23 | 24 | Each of these issues is small, but they stack up and turn you into a broken, 25 | cautious person, afraid to flex the wings you've forgotten are attached to your 26 | back. 27 | 28 | Docker is the answer to this problem. 
`Docker `_ is a 29 | piece of software that allows you to build and run "containers"; you can think 30 | of a container as a tiny Linux machine that you can run on your Mac or 31 | workstation, or ship off to execute on AI platform. The container gets access to 32 | the resources of the machine where it's running, but can't affect that machine 33 | in any other way. 34 | 35 | If you design your Python code to run inside of a container, you can move that 36 | container between different environments and know that the code's behavior won't 37 | change. 38 | 39 | The Trouble with Bare Docker 40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 41 | 42 | To build a Docker container for your code you need to write a ``Dockerfile``. If 43 | you try this you'll realize that you actually need many ``Dockerfile`` copies... 44 | one for GPU mode. One for CPU mode locally. Slight tweaks show up every time you 45 | want to add some environment variable; locally, you don't want to copy your code 46 | into the container, since you can live-mount the directory using ``docker run``\ , 47 | but on AI Platform you DO need a copy. 48 | 49 | Soon your ``Dockerfile`` is infested with comments and instructions to a future, 50 | less patient version of yourself, even less capable of remembering all of this 51 | than you are now. 52 | 53 | Caliban + Docker = <3 54 | ~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | If you've felt this pain, you now understand the motivation for Caliban. Caliban 57 | is a tool that dynamically builds docker images (by dynamically generating 58 | ``Dockerfile`` instances) for the various modes you rely on for machine learning 59 | research: 60 | 61 | 62 | * Jupyter notebook development 63 | * Local, interactive development at the shell 64 | * Local execution on your workstation on GPU 65 | * AI platform execution of 100s of jobs for some experiment 66 | 67 | By developing your research workflows inside of Docker containers (made easy by 68 | Caliban) you're much closer to that noble goal of reproducible research. 69 | 70 | Theoretically, you could publish the container that Caliban builds along with 71 | the range of experiment parameters you used to produce your data. 72 | -------------------------------------------------------------------------------- /docs/getting_started/getting_caliban.rst: -------------------------------------------------------------------------------- 1 | Getting Caliban 2 | --------------- 3 | 4 | .. warning:: If you're currently in a ``virtualenv``\ , please run ``deactivate`` 5 | to disable it before proceeding. 6 | 7 | We recommend installing ``caliban`` using `pipx 8 | `_. `pipx `_ is 9 | a tool that lets you install command line utilities written in Python into their 10 | own virtual environments, completely isolated from your system python packages. 11 | 12 | You don't HAVE to do this - you can install caliban in your global environment, 13 | or in a virtualenv - but ``pipx`` is the sanest way we've found to install 14 | Python CLI command tools. 15 | 16 | .. NOTE:: Before you install Caliban, you'll need to visit the 17 | :doc:`prerequisites` page and make sure you have Docker installed and 18 | the correct version of Python 3. 19 | 20 | Install ``pipx`` into your global python environment like this: 21 | 22 | .. code-block:: bash 23 | 24 | python3 -m pip install --user pipx 25 | python3 -m pipx ensurepath 26 | 27 | Once ``pipx`` is installed, use it to install ``caliban``: 28 | 29 | .. 
code-block:: bash 30 | 31 | pipx install caliban 32 | 33 | If you don't want to use ``pipx``, install Caliban via pip: 34 | 35 | .. code-block:: bash 36 | 37 | pip install -U caliban 38 | 39 | Upgrading Caliban 40 | ^^^^^^^^^^^^^^^^^ 41 | 42 | With ``pipx``\ , upgrading Caliban is simple. The following command will do it: 43 | 44 | .. code-block:: bash 45 | 46 | pipx upgrade caliban 47 | 48 | If you've installed Caliban with pip: 49 | 50 | .. code-block:: bash 51 | 52 | pip install --upgrade caliban 53 | 54 | Check your Installation 55 | ^^^^^^^^^^^^^^^^^^^^^^^ 56 | 57 | To check if all is well, run 58 | 59 | .. code-block:: bash 60 | 61 | caliban --help 62 | 63 | To take Caliban through its paces, visit the `"Getting Started with Caliban" 64 | `_ tutorial on 65 | the main page of `Caliban's github repository 66 | `_. 67 | -------------------------------------------------------------------------------- /docs/getting_started/prerequisites.rst: -------------------------------------------------------------------------------- 1 | Prerequisites 2 | ------------- 3 | 4 | Before you can use Caliban, you'll need to install Docker and make sure your 5 | Python 3 is up to date. Follow these steps to get set up. 6 | 7 | Python 3 8 | ^^^^^^^^ 9 | 10 | Caliban requires Python >= 3.6. Check your current version at the terminal: 11 | 12 | .. code-block:: bash 13 | 14 | $ python3 --version 15 | Python 3.6.9 # Or something above 3.6.0 16 | 17 | If you need to upgrade: 18 | 19 | - On MacOS, download `the latest Python from python.org 20 | `_. 21 | - On Linux, make sure your ``python3`` is up to date by running the following 22 | command at your terminal: 23 | 24 | .. code-block:: bash 25 | 26 | sudo apt-get install python3 python3-venv python3-pip 27 | 28 | Once that's all set, run ``python3 --version`` again to verify that you're running 29 | Python 3.6 or above. 30 | 31 | Docker 32 | ^^^^^^ 33 | 34 | To use Caliban, you'll need a working Docker installation. If you have a GPU and 35 | want to run jobs that use it, you'll have to install ``nvidia-docker2``, as 36 | described below in :ref:`GPU Support on Linux Machines`. 37 | 38 | - On MacOS, install `Docker Desktop for Mac 39 | `_. You'll 40 | only be able to run in CPU mode, as MacOS doesn't support Docker's nvidia 41 | runtime. You will, however, be able to build GPU containers and submit them to 42 | Google Cloud. 43 | - On Linux, install Docker with `these instructions 44 | `_. 45 | 46 | Add your username to the docker group so that you can run Docker without using 47 | ``sudo``: 48 | 49 | .. code-block:: bash 50 | 51 | sudo usermod -a -G docker ${USER} 52 | 53 | GPU Support on Linux Machines 54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 55 | 56 | On Linux, Caliban can run jobs locally that take advantage of a GPU you may have installed. 57 | 58 | To use this feature, install the ``nvidia-docker2`` runtime by following the 59 | instructions at the `nvidia-docker2 60 | `_ 61 | page. 62 | 63 | .. NOTE:: It's important that you install ``nvidia-docker2``, not 64 | ``nvidia-docker``! The `nvidia-docker2 65 | `_ 66 | instructions discuss how to upgrade if you accidentally install 67 | ``nvidia-docker``. 68 | 69 | .. NOTE:: The most recent versions of docker don't need the ``nvidia-docker2`` 70 | dependency. In a future version of Caliban we'll remove this 71 | dependency and upgrade the documentation.
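Once the runtime is installed, it's worth confirming that containers can actually see your GPU. A minimal check - assuming your Docker version supports the ``--gpus`` flag, and using an illustrative CUDA image tag - is to run ``nvidia-smi`` inside a container:

.. code-block:: bash

    docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi

If this prints the same table you get from running ``nvidia-smi`` directly on the host, Docker's GPU support is working.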
72 | -------------------------------------------------------------------------------- /docs/gke/cluster_management.rst: -------------------------------------------------------------------------------- 1 | Cluster Management 2 | ^^^^^^^^^^^^^^^^^^ 3 | 4 | This section describes how to create and delete clusters. We'll add 5 | documentation on other relevant cluster lifecycle tasks as we go. 6 | 7 | Cluster Creation 8 | ~~~~~~~~~~~~~~~~ 9 | 10 | As described in the ``create`` section of :doc:`../cli/caliban_cluster`, you 11 | will typically create a cluster once for a given project and leave it running. 12 | 13 | You can create a cluster for your project as follows: 14 | 15 | .. code-block:: bash 16 | 17 | totoro@totoro:$ caliban cluster create --cluster_name cluster_name --zone us-central1-a 18 | I0204 09:24:08.710866 139910209476416 cli.py:165] creating cluster cluster_name in project totoro-project in us-central1-a... 19 | I0204 09:24:08.711183 139910209476416 cli.py:166] please be patient, this may take several minutes 20 | I0204 09:24:08.711309 139910209476416 cli.py:167] visit https://console.cloud.google.com/kubernetes/clusters/details/us-central1-a/cluster_name?project=totoro-project to monitor cluster creation progress 21 | I0204 09:28:05.274621 139910209476416 cluster.py:1091] created cluster cluster_name successfully 22 | I0204 09:28:05.274888 139910209476416 cluster.py:1092] applying nvidia driver daemonset... 23 | 24 | The command will typically take several minutes to complete, and will 25 | provide you with a URL you can follow to monitor the creation process. The page 26 | will look something like the following: 27 | 28 | .. image:: /_static/img/gke/cluster_create_progress.png 29 | :width: 600 30 | :align: center 31 | :alt: Cluster creation progress 32 | 33 | Once your cluster is created and running, you can view and inspect it from the 34 | cloud dashboard via the ``Kubernetes Engine > Clusters`` menu option: 35 | 36 | .. image:: /_static/img/gke/cluster_dashboard.png 37 | :width: 600 38 | :align: center 39 | :alt: Cluster dashboard 40 | 41 | Cluster Deletion 42 | ~~~~~~~~~~~~~~~~ 43 | 44 | In most cases you will bring up your cluster and leave it running. The cluster 45 | master does consume resources, however, so if you know that you are not going to 46 | be submitting jobs to your cluster for some length of time, you may want to 47 | delete your cluster to save money. Before doing this, please make sure that all 48 | of your jobs are complete, as deleting the cluster will also kill any running 49 | jobs. Deleting the cluster is very straightforward: simply use the 50 | :doc:`../cli/caliban_cluster` ``delete`` command. 51 | -------------------------------------------------------------------------------- /docs/gke/concepts.rst: -------------------------------------------------------------------------------- 1 | GKE Concepts 2 | ^^^^^^^^^^^^ 3 | 4 | Caliban makes it easy to create your own GKE Cluster - similar to your own 5 | personal copy of AI Platform - in your Cloud project, and submit jobs to that 6 | cluster. The advantage over AI Platform currently is that you can get more 7 | quota, often 10x what you have available in AI Platform, and many features are 8 | supported in GKE much earlier than they are in AI Platform. 9 | 10 | The quota disparity is particularly notable with TPUs. AI Platform currently 11 | only allows 8 TPUs, while a GKE cluster lets you specify 32, 64, or more TPUs for a 12 | given job.
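For instance, a submission along these lines - the ``--tpu_spec`` value is sketched from the flag's use elsewhere in these docs, and the count is purely illustrative - would request 32 v3 TPU cores for a single job:

.. code-block:: bash

    caliban cluster job submit --tpu_spec 32xV3 trainer.train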
13 | 14 | A good collection of GKE documentation can be found 15 | `here `_. 16 | 17 | Cluster 18 | ~~~~~~~ 19 | 20 | A 21 | `cluster `_ 22 | is a collection of cloud machines, combining a set of *nodes* that run your 23 | processing jobs, and a *control plane* (also referred to as a *cluster master*\ ) 24 | that manages these worker nodes and handles scheduling your jobs and creating 25 | worker nodes to run them. 26 | 27 | Cluster Master 28 | ~~~~~~~~~~~~~~ 29 | 30 | A 31 | `cluster master `_ 32 | is the controller for the cluster and all its resources. It handles creating and 33 | deleting worker nodes, and scheduling jobs submitted by users. 34 | 35 | Nodes 36 | ~~~~~ 37 | 38 | A 39 | `node `_ 40 | is a worker machine (a cloud compute engine instance) that actually performs the 41 | work your job requires. The cluster control plane creates and manages these 42 | instances. 43 | 44 | Node Pool 45 | ~~~~~~~~~ 46 | 47 | A 48 | `node pool `_ 49 | is a collection of identical nodes (same CPU, memory, GPU and TPU configuration). 50 | 51 | Job 52 | ~~~ 53 | 54 | A 55 | `job `_ 56 | is a task that is to be run to completion using cluster resources. The cluster 57 | control plane manages the resources the job needs and handles restarting the job 58 | in case of failure or preemption. A job probably matches the concept you have in 59 | mind when you think of a job you submit to AI platform. A job is a top-level 60 | task, which may be run on multiple machines/containers, which in GKE are 61 | referred to as *pods*\ , described below. 62 | 63 | Pod 64 | ~~~ 65 | 66 | A `pod `_ is a 67 | single, ephemeral, running execution of your container. A job may run on several 68 | pods. 69 | -------------------------------------------------------------------------------- /docs/gke/prereq.rst: -------------------------------------------------------------------------------- 1 | GKE Prerequisites 2 | ^^^^^^^^^^^^^^^^^ 3 | 4 | There are a few prerequisites for creating and submitting jobs to a GKE cluster. 5 | 6 | Required Permissions 7 | ~~~~~~~~~~~~~~~~~~~~ 8 | 9 | To create and use a GKE cluster, you'll need to modify your service account 10 | to give it Owner permissions. Those instructions live at the 11 | :doc:`/cloud/service_account` docs page. Note that this only applies if you are 12 | using a service account key. 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/recipes/dockerignore.rst: -------------------------------------------------------------------------------- 1 | dockerignore speeds up builds 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Many of Caliban's commands begin their work by triggering a ``docker build`` 5 | command; this command has a side effect of bundling up the entire directory 6 | where you run the command into a "build context", which is zipped up and sent 7 | off to the Docker build process on your machine. 8 | 9 | In a directory containing machine learning code, it's not unusual that you might 10 | also have subdirectories that contain, for example: 11 | 12 | 13 | * large datasets that you've cached locally 14 | * tensorboard output from local runs 15 | * metrics 16 | 17 | If you don't want to include any of these things in the Docker container that 18 | caliban builds for you, you can significantly speed up your builds by creating a 19 | file called ``.dockerignore`` in the directory of your project. 20 | 21 | Here's an example ``.dockerignore`` file, with comments explaining each line: 22 | 23 | .. code-block:: 24 | 25 | # ignore the git repository info and the pip installation cache 26 | .git 27 | .cache 28 | 29 | # this is huge - ignore the virtualenv we've created inside the folder! 30 | env 31 | 32 | # tests don't belong inside the image. 33 | tests 34 | 35 | # no need to package info about the packaged-up code in egg form. 36 | *.egg-info 37 | 38 | # These files are here for local development, but have nothing 39 | # to do with the code itself, and don't belong on the docker image. 40 | Makefile 41 | pylintrc 42 | setup.cfg 43 | __pycache__ 44 | .coverage 45 | .pytest_cache 46 | 47 | As a starting point, you might take your project's ``.gitignore`` file, copy 48 | everything over to ``.dockerignore`` and then delete any entries that you 49 | actually DO need inside your Docker container. An example might be some data you 50 | don't control with ``git``\ , but that you do want to include in the container using 51 | Caliban's ``-d`` flag. 52 | -------------------------------------------------------------------------------- /docs/recipes/flagfile.rst: -------------------------------------------------------------------------------- 1 | Passing Flags via --flagfile 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | If you find yourself passing lots of flags to some caliban subcommand, you 5 | might consider Abseil's ``--flagfile`` feature. 6 | 7 | .. NOTE:: `Abseil `_ is a Google library that we 8 | use to generate Caliban's CLI. You can see the options `Abseil 9 | `_ provides on top of Caliban's arguments by 10 | passing ``--helpfull`` to any command; ``caliban cloud --helpfull``\ , for 11 | example. 12 | 13 | ``--flagfile`` allows you to put any number of flags or arguments to caliban into 14 | a file, one pair per line. Given some file like ``my_args.txt`` with the following 15 | contents: 16 | 17 | .. code-block:: 18 | 19 | --docker_run_args "CUDA_VISIBLE_DEVICES=0" 20 | --experiment_config experiment_one.json 21 | --cloud_key my_key.json 22 | --extras extra_deps 23 | 24 | You could run the following command: 25 | 26 | .. 
code-block:: bash 27 | 28 | caliban run --flagfile my_args.txt trainer.train 29 | 30 | All arguments expand in-line, so the above command would be equivalent to 31 | running: 32 | 33 | .. code-block:: bash 34 | 35 | caliban run --docker_run_args "CUDA_VISIBLE_DEVICES=0" \ 36 | --experiment_config experiment_one.json \ 37 | --cloud_key my_key.json \ 38 | --extras extra_deps \ 39 | trainer.train 40 | 41 | One major benefit is that you can share groups of arguments between various 42 | subcommand invocations, like ``caliban run`` and ``caliban cloud``\ , without having 43 | to store large duplicated strings of arguments. 44 | 45 | Nested Flagfiles 46 | ~~~~~~~~~~~~~~~~ 47 | 48 | You can supply ``--flagfile some_file`` arguments inside flag files! This allows 49 | you to build up trees of arguments in a fine grained way. Imagine some flagfile 50 | called ``v100_project.flags``\ : 51 | 52 | .. code-block:: text 53 | 54 | # Definition for big iron GPUs. 55 | --gpu_spec 8xV100 56 | --machine_type n1-highcpu-64 57 | --cloud_key my_key.json 58 | 59 | And then some further file called ``tpu_plus_gpu.flags``\ : 60 | 61 | .. code-block:: text 62 | 63 | --flagfile v100_project.flags 64 | --tpu_spec 8xV3 65 | --region us-central1 66 | 67 | The command: 68 | 69 | .. code-block:: bash 70 | 71 | caliban cloud --flagfile tpu_plus_gpu.flags trainer.train 72 | 73 | Would expand out **both** sets of flags, as expected. (I don't know what would 74 | happen if each file referenced the other... feel free to try!) 75 | 76 | For more information, check out the 77 | `Abseil docs on ``--flagfile`` `_. 78 | -------------------------------------------------------------------------------- /docs/recipes/local_dir.rst: -------------------------------------------------------------------------------- 1 | Mounting a Local Directory for Data Persistence 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | Let's say you're using ``caliban run`` with an experiment configuration to run 5 | many experiments locally. Because ``caliban run`` attempts to look just like the 6 | environment you'll see in the Cloud, the command doesn't mount any local 7 | directories by default; the container is completely isolated, and you (usually) 8 | have to persist data by writing it to a Cloud bucket. 9 | 10 | It's possible to avoid this, however, and use Caliban to mount a local directory 11 | into the Docker container. If you do this, you can take advantage of local 12 | experiment broadcasting to loop through many experimental runs on your 13 | workstation, and still persist all results and models to your local machine. 14 | 15 | The answer comes from the :doc:`../explore/custom_docker_run` feature. If you 16 | pass 17 | 18 | .. code-block:: bash 19 | 20 | --docker_run_args "--volume workstation_dir:/foo" 21 | 22 | to ``caliban run``\ , Caliban will mount the directory at ``workstation_dir`` into 23 | your container at ``/foo``. (You can use any name or directory you choose instead 24 | of ``/foo``\ , of course.) 25 | 26 | Let's look at an example. The following command will mount a folder called 27 | ``data`` in your workstation's home directory into your container. 28 | 29 | .. code-block:: bash 30 | 31 | caliban run \ 32 | --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo" 33 | --experiment_config exp_config.json \ 34 | trainer.train 35 | 36 | When you look at ``/foo`` inside the container, you'll see all of the files on 37 | your workstation at ``/usr/local/google/home/totoro/data``. 
If you create or 38 | edit any files, those changes will happen to the files on your workstation as 39 | well. 40 | 41 | .. WARNING:: For some reason I don't understand, if you pass ``-v`` instead of 42 | ``--volume``\ , as in ``--docker_run_args "-v mydir:containerdir"``\ , the 43 | argument parser in Caliban will break. Use ``--volume`` and you'll be set! 44 | 45 | If you want to play around with volume mounting, you can pass the same argument 46 | to ``caliban shell`` to get an interactive view of the filesystem your container 47 | will have access to when you run the above command: 48 | 49 | .. code-block:: bash 50 | 51 | # "--bare" prevents your home directory from mounting. 52 | caliban shell --bare \ 53 | --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo" 54 | 55 | In the shell that launches you'll see the directory mirrored: 56 | 57 | .. code-block:: 58 | 59 | $ caliban shell --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo" --nogpu --bare 60 | I0122 14:30:24.923780 4445842880 docker.py:438] Running command: docker build --rm -f- /Users/totoro/code/python/tutorials/hello-tensorflow 61 | Sending build context to Docker daemon 36.56MB 62 | <....lots of Docker output....> 63 | Successfully built f2ba6fb7b628 64 | I0122 14:30:33.125234 4445842880 docker.py:666] Running command: docker run --ipc host -w /usr/app -u 735994:89939 -v /Users/totoro/code/python/tutorials/hello-tensorflow:/usr/app -it --entrypoint /bin/bash --volume /usr/local/google/home/totoro/data:/foo f2ba6fb7b628 65 | _________ __ ________ ___ _ __ __ __ 66 | / ____/ | / / / _/ __ )/ | / | / / \ \ \ \ 67 | / / / /| | / / / // __ / /| | / |/ / \ \ \ \ 68 | / /___/ ___ |/ /____/ // /_/ / ___ |/ /| / / / / / 69 | \____/_/ |_/_____/___/_____/_/ |_/_/ |_/ /_/ /_/ 70 | 71 | You are running caliban shell as user with ID 735994 and group 89939, 72 | which should map to the ID and group for your user on the Docker host. Great! 73 | 74 | caliban-shell /usr/app > ls -al /foo 75 | total 9788 76 | drwx------ 21 totoro 89939 672 Jan 22 20:35 . 77 | drwxr-xr-x 1 root root 4096 Jan 22 21:30 .. 78 | -rw-r--r-- 1 totoro 89939 41689 Jan 20 21:48 sets.png 79 | -rw-r--r-- 1 totoro 89939 82811 Jan 20 21:48 tree.png 80 | caliban-shell /usr/app > 81 | -------------------------------------------------------------------------------- /docs/recipes/single_gpu.rst: -------------------------------------------------------------------------------- 1 | Using a Single GPU 2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 3 | 4 | By default, ``docker run`` will make all GPUs on your workstation available 5 | inside of the container. This means that in ``caliban shell``\ , ``caliban 6 | notebook`` or ``caliban run``\ , any jobs executed on your workstation will 7 | attempt to use: 8 | 9 | 10 | * your huge GPU, custom-built and installed for ML Supremacy 11 | * the dinky GPU that exists solely to power your monitor, NOT to help train 12 | models 13 | 14 | The second GPU will slow down everything. 15 | 16 | To stop this from happening you need to set the ``CUDA_VISIBLE_DEVICES`` 17 | environment variable equal to ``0``\ , as described on this 18 | `nvidia blog `_ 19 | about the issue. 20 | 21 | You can set the environment variable inside your container by passing 22 | ``--docker_run_args`` to caliban, like this: 23 | 24 | .. code-block:: bash 25 | 26 | caliban run --docker_run_args "--env CUDA_VISIBLE_DEVICES=0" trainer.train 27 | 28 | .. 
NOTE:: you may have noticed that this problem doesn't happen when you run a 29 | job inside ``caliban shell``. Your local environment may have 30 | ``CUDA_VISIBLE_DEVICES`` set. ``caliban shell`` and ``caliban notebook`` 31 | mount your home directory by default, which loads all of your local 32 | environment variables into the container and, if you've set this environment 33 | variable, modifies this setting inside your container. This doesn't happen 34 | with ``caliban run`` or ``caliban cloud``. You will always need to use this 35 | trick with those modes. 36 | 37 | There are two other ways to solve this problem using the 38 | `custom ``docker run`` arguments detailed here `_. 39 | You can directly limit the GPUs that mount into the container using the ``--gpus`` 40 | argument: 41 | 42 | .. code-block:: bash 43 | 44 | caliban run --docker_run_args "--gpus device=0" trainer.train 45 | 46 | If you run ``nvidia-smi`` in the container after passing this argument you won't 47 | see more than 1 GPU. This is useful if you know that some library you're using 48 | doesn't respect the ``CUDA_VISIBLE_DEVICES`` environment variable for any reason. 49 | 50 | You could also pass this and other environment variables using an env file. 51 | Given some file, say, ``myvars.env``\ , whose contents look like this: 52 | 53 | .. code-block:: text 54 | 55 | CUDA_VISIBLE_DEVICES=0 56 | IS_THIS_A_VARIABLE=yes 57 | 58 | The ``--env-file`` argument will load all of the referenced variables into the 59 | docker environment: 60 | 61 | .. code-block:: bash 62 | 63 | caliban run --docker_run_args "--env-file myvars.env" trainer.train 64 | 65 | Check out :doc:`../explore/custom_docker_run` for more information. 66 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.0.4 2 | sphinx_rtd_theme 3 | -------------------------------------------------------------------------------- /paper/10.21105.joss.02403.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/paper/10.21105.joss.02403.pdf -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @article{merkel2014docker, 2 | author = {Merkel, Dirk}, 3 | title = {Docker: {L}ightweight {L}inux {C}ontainers for {C}onsistent {D}evelopment and {D}eployment}, 4 | year = {2014}, 5 | issue_date = {March 2014}, 6 | publisher = {Belltown Media}, 7 | address = {Houston, TX}, 8 | volume = {2014}, 9 | number = {239}, 10 | issn = {1075-3583}, 11 | abstract = {Docker promises the ability to package applications and their dependencies into lightweight containers that move easily between different distros, start up quickly and are isolated from each other.}, 12 | journal = {Linux J.}, 13 | month = mar, 14 | articleno = {2}, 15 | numpages = {1} 16 | } 17 | 18 | @inproceedings{cito2016, 19 | author = {Cito, J\"{u}rgen and Gall, Harald C.}, 20 | title = {Using {D}ocker {C}ontainers to {I}mprove {R}eproducibility in {S}oftware {E}ngineering {R}esearch}, 21 | year = {2016}, 22 | isbn = {9781450342056}, 23 | publisher = {Association for Computing Machinery}, 24 | address = {New York, NY, USA}, 25 | url = {https://doi.org/10.1145/2889160.2891057}, 26 | doi = {10.1145/2889160.2891057}, 27 | booktitle = 
{Proceedings of the 38th International Conference on Software Engineering Companion}, 28 | pages = {906–907}, 29 | numpages = {2}, 30 | keywords = {containers, reproducibility, cloud}, 31 | location = {Austin, Texas}, 32 | series = {ICSE ’16} 33 | } 34 | 35 | @inproceedings{deng2009imagenet, 36 | title={Imagenet: A large-scale hierarchical image database}, 37 | author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li}, 38 | booktitle={2009 {IEEE} conference on computer vision and pattern recognition}, 39 | pages={248--255}, 40 | url={https://doi.org/10.1109/cvpr.2009.5206848}, 41 | doi={10.1109/cvpr.2009.5206848}, 42 | year={2009}, 43 | organization={IEEE} 44 | } 45 | 46 | @article{zaharia2018accelerating, 47 | title={Accelerating the {M}achine {L}earning {L}ifecycle with {MLflow}}, 48 | author={M. Zaharia and Andrew Chen and A. Davidson and A. Ghodsi and S. Hong and A. Konwinski and Siddharth Murching and Tomas Nykodym and P. Ogilvie and Mani Parkhe and F. Xie and Corey Zumar}, 49 | journal={{IEEE} Data Eng. Bull.}, 50 | year={2018}, 51 | volume={41}, 52 | pages={39-45} 53 | } 54 | 55 | @inproceedings{Forde2018ReproducingML, 56 | title={Reproducing {M}achine {L}earning {R}esearch on {B}inder}, 57 | author={Forde, Jessica and Bussonnier, Matthias and Fortin, F{\'e}lix-Antoine and Granger, Brian and Head, Tim and Holdgraf, Chris and Ivanov, Paul and Kelley, Kyle and Pacer, M and Panda, Yuvi and others}, 58 | booktitle={{NIPS} {W}orkshop on {M}achine {L}earning {O}pen {S}ource {S}oftware}, 59 | year={2018} 60 | } 61 | 62 | @article{DBLP:journals/corr/JonasVSR17, 63 | author = {Eric Jonas and 64 | Shivaram Venkataraman and 65 | Ion Stoica and 66 | Benjamin Recht}, 67 | title = {Occupy the {C}loud: {D}istributed {C}omputing for the 99{\%}}, 68 | journal = {CoRR}, 69 | volume = {abs/1702.04024}, 70 | year = {2017}, 71 | url = {http://arxiv.org/abs/1702.04024}, 72 | archivePrefix = {arXiv}, 73 | eprint = {1702.04024}, 74 | timestamp = {Mon, 13 Aug 2018 16:49:06 +0200}, 75 | biburl = {https://dblp.org/rec/journals/corr/JonasVSR17.bib}, 76 | bibsource = {dblp computer science bibliography, https://dblp.org} 77 | } 78 | 79 | @inproceedings{adam_richie-halford-proc-scipy-2018, 80 | author = { {A}dam {R}ichie-{H}alford and {A}riel {R}okem }, 81 | title = { {C}loudknot: {A} {P}ython {L}ibrary to {R}un your {E}xisting {C}ode on {A}{W}{S} {B}atch }, 82 | booktitle = { {P}roceedings of the 17th {P}ython in {S}cience {C}onference }, 83 | pages = { 8 - 14 }, 84 | year = { 2018 }, 85 | editor = { {F}atih {A}kici and {D}avid {L}ippa and {D}illon {N}iederhut and {M} {P}acer }, 86 | doi = { 10.25080/Majora-4af1f417-001 } 87 | } 88 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | indent-width = 2 3 | 4 | [tool.ruff.lint.pydocstyle] 5 | convention = "google" 6 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Required for development, not publication. 
2 | hypothesis 3 | ipython 4 | pre-commit 5 | pytest==7.3.2 6 | pytest-cov==4.1.0 7 | pytest-subprocess==1.5.0 8 | twine 9 | -------------------------------------------------------------------------------- /scripts/bashrc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Basic bash commands for Caliban's shell. 4 | 5 | export PS1="\[\e[31m\]caliban-shell\[\e[m\] \[\e[33m\]\w\[\e[m\] > " 6 | export TERM=xterm-256color 7 | alias grep="grep --color=auto" 8 | alias ls="$(which ls) --color=auto" 9 | alias ll="ls -al" 10 | 11 | 12 | printf "\e[1;34m" 13 | cat<=4.45.0", 48 | "kubernetes>=10.0.1", 49 | "google-auth>=1.19.0", 50 | "google-cloud-core>=1.0.3", 51 | "google-cloud-container>=0.3.0", 52 | "psycopg2-binary==2.9.6", 53 | "schema==0.7.5", 54 | "urllib3>=1.25.7", 55 | "yaspin>=0.16.0", 56 | "SQLAlchemy==1.3.11", 57 | "pg8000==1.16.1", 58 | ] 59 | 60 | setup( 61 | name="caliban", 62 | version=with_versioneer(lambda v: v.get_version()), 63 | cmdclass=with_versioneer(lambda v: v.get_cmdclass(), {}), 64 | description="Docker-based job runner for AI research.", 65 | long_description=readme(), 66 | long_description_content_type="text/markdown", 67 | python_requires=">=3.6.0", 68 | author="Caliban Team", 69 | author_email="samritchie@google.com", 70 | url="https://github.com/google/caliban", 71 | license="Apache-2.0", 72 | packages=find_packages(exclude=("tests", "docs")), 73 | install_requires=REQUIRED_PACKAGES, 74 | include_package_data=True, 75 | entry_points={ 76 | "console_scripts": [ 77 | "caliban = caliban.main:main", 78 | "expansion = caliban.expansion:main", 79 | ] 80 | }, 81 | ) 82 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 
-------------------------------------------------------------------------------- /tests/caliban/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/config/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/docker/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/docker/test_build.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | 17 | import caliban.docker.build as b 18 | 19 | 20 | def test_shell_dict(): 21 | """Tests that the shell dict has an entry for all possible Shell values.""" 22 | 23 | assert set(b.Shell) == set(b.SHELL_DICT.keys()) 24 | 25 | 26 | def test_copy_command(): 27 | multiline = b.copy_command( 28 | 1, 1, "face", "cake", "This is an example\nof a multiline comment." 29 | ) 30 | 31 | assert ( 32 | multiline 33 | == """# This is an example 34 | # of a multiline comment. 35 | COPY --chown=1:1 face cake 36 | """ 37 | ) 38 | 39 | # single lines don't append comments. 40 | oneline = b.copy_command(1, 1, "face", "cake.py") 41 | assert ( 42 | oneline 43 | == """COPY --chown=1:1 face cake.py 44 | """ 45 | ) 46 | 47 | # single comments work. 48 | oneline_comment = b.copy_command(1, 1, "face", "cake.py", comment="Comment!") 49 | assert ( 50 | oneline_comment 51 | == """# Comment! 52 | COPY --chown=1:1 face cake.py 53 | """ 54 | ) 55 | -------------------------------------------------------------------------------- /tests/caliban/docker/test_push.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import caliban.docker.push as p 18 | 19 | 20 | def register_list_tags(process, project_id, tag, **kwargs): 21 | process.register_subprocess( 22 | [ 23 | "gcloud", 24 | "container", 25 | "images", 26 | "list-tags", 27 | f"--project={project_id}", 28 | "--format=json", 29 | tag, 30 | ], 31 | **kwargs, 32 | ) 33 | 34 | 35 | def test_image_tag_for_project(): 36 | """Tests that we generate a valid image tag for domain-scoped and modern 37 | project IDs. 
38 | 39 | """ 40 | assert p._image_tag_for_project("face", "imageid") == "gcr.io/face/imageid:latest" 41 | 42 | assert ( 43 | p._image_tag_for_project("google.com:face", "imageid") 44 | == "gcr.io/google.com/face/imageid:latest" 45 | ) 46 | 47 | 48 | def test_force_push_uuid_tag(fake_process): 49 | """Check that the push command actually attempts to tag and push.""" 50 | project_id = "project" 51 | image_id = "imageid" 52 | 53 | tag = p._image_tag_for_project(project_id, image_id) 54 | 55 | fake_process.register_subprocess(["docker", "tag", image_id, tag]) 56 | fake_process.register_subprocess(["docker", "push", tag]) 57 | 58 | assert p.push_uuid_tag(project_id, image_id, force=True) == tag 59 | 60 | 61 | def test_already_pushed_uuid_tag(fake_process): 62 | """Check that push_uuid_tag does NOT attempt to push if the image already 63 | exists in the remote registry.""" 64 | project_id = "project" 65 | image_id = "imageid" 66 | 67 | base_tag = p._image_tag_for_project(project_id, image_id, include_tag=False) 68 | tag = p._image_tag_for_project(project_id, image_id) 69 | 70 | register_list_tags(fake_process, project_id, base_tag, stdout='[{"metadata": []}]') 71 | 72 | assert p.push_uuid_tag(project_id, image_id) == tag 73 | 74 | 75 | def test_push_uuid_tag_if_no_remote_image(fake_process): 76 | """Check that push_uuid_tag DOES attempt to push if the image doesn't exist in 77 | the remote container registry already. 78 | 79 | """ 80 | project_id = "project" 81 | image_id = "imageid" 82 | 83 | base_tag = p._image_tag_for_project(project_id, image_id, include_tag=False) 84 | tag = p._image_tag_for_project(project_id, image_id) 85 | 86 | register_list_tags(fake_process, project_id, base_tag, stdout="[]") 87 | 88 | fake_process.register_subprocess(["docker", "tag", image_id, tag]) 89 | fake_process.register_subprocess(["docker", "push", tag]) 90 | 91 | assert p.push_uuid_tag(project_id, image_id) == tag 92 | -------------------------------------------------------------------------------- /tests/caliban/history/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/platform/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/platform/cloud/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/platform/cloud/test_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import unittest 18 | from argparse import ArgumentTypeError 19 | 20 | import hypothesis.strategies as st 21 | from hypothesis import given 22 | 23 | import caliban.platform.cloud.types as ct 24 | 25 | 26 | class TypesTestSuite(unittest.TestCase): 27 | """Tests for caliban.platform.cloud.types.""" 28 | 29 | @given( 30 | st.integers(min_value=0, max_value=40), st.sampled_from(list(ct.GPU) + list(ct.TPU)) 31 | ) 32 | def test_validate_accelerator_count(self, i, accel): 33 | valid_counts = ct.accelerator_counts(accel) 34 | if i in valid_counts: 35 | self.assertEqual(i, ct.validate_accelerator_count(accel, i)) 36 | else: 37 | with self.assertRaises(ArgumentTypeError): 38 | ct.validate_accelerator_count(accel, i) 39 | 40 | def test_parse_machine_type(self): 41 | """Test that strings parse into machine types using the Google Cloud strings, 42 | NOT the name string for the enum. 43 | 44 | """ 45 | self.assertEqual(ct.MachineType.standard_8, ct.parse_machine_type("n1-standard-8")) 46 | 47 | with self.assertRaises(ArgumentTypeError): 48 | ct.parse_machine_type("random-string") 49 | 50 | def test_gpuspec_parse_arg(self): 51 | with self.assertRaises(ArgumentTypeError): 52 | # invalid format string, no x separator. 53 | ct.GPUSpec.parse_arg("face") 54 | 55 | with self.assertRaises(ArgumentTypeError): 56 | # Invalid number. 57 | ct.GPUSpec.parse_arg("randomxV100") 58 | 59 | with self.assertRaises(ArgumentTypeError): 60 | # invalid GPU type. 61 | ct.GPUSpec.parse_arg("8xNONSTANDARD") 62 | 63 | with self.assertRaises(ArgumentTypeError): 64 | # Invalid number for the valid GPU type. 
65 | ct.GPUSpec.parse_arg("15xV100") 66 | 67 | self.assertEqual( 68 | ct.GPUSpec(ct.GPU.V100, 7), ct.GPUSpec.parse_arg("7xV100", validate_count=False) 69 | ) 70 | 71 | # Valid! 72 | self.assertEqual(ct.GPUSpec(ct.GPU.V100, 8), ct.GPUSpec.parse_arg("8xV100")) 73 | 74 | def test_tpuspec_parse_arg(self): 75 | with self.assertRaises(ArgumentTypeError): 76 | # invalid format string, no x separator. 77 | ct.TPUSpec.parse_arg("face") 78 | 79 | with self.assertRaises(ArgumentTypeError): 80 | # Invalid number. 81 | ct.TPUSpec.parse_arg("randomxV3") 82 | 83 | with self.assertRaises(ArgumentTypeError): 84 | # invalid TPU type. 85 | ct.TPUSpec.parse_arg("8xNONSTANDARD") 86 | 87 | with self.assertRaises(ArgumentTypeError): 88 | # Invalid number for the valid TPU type. 89 | ct.TPUSpec.parse_arg("15xV3") 90 | 91 | self.assertEqual( 92 | ct.TPUSpec(ct.TPU.V3, 7), ct.TPUSpec.parse_arg("7xV3", validate_count=False) 93 | ) 94 | 95 | # Valid! 96 | self.assertEqual(ct.TPUSpec(ct.TPU.V3, 8), ct.TPUSpec.parse_arg("8xV3")) 97 | -------------------------------------------------------------------------------- /tests/caliban/platform/gke/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/platform/gke/test_types.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """unit tests for gke utilities""" 17 | import unittest 18 | 19 | from datetime import datetime 20 | import hypothesis.strategies as st 21 | from hypothesis import given 22 | from kubernetes.client import V1Job, V1JobStatus 23 | 24 | from caliban.platform.gke.types import ReleaseChannel, JobStatus 25 | 26 | 27 | # ---------------------------------------------------------------------------- 28 | class TypesTestSuite(unittest.TestCase): 29 | """tests for caliban.platform.gke.types""" 30 | 31 | # -------------------------------------------------------------------------- 32 | @given( 33 | st.from_regex("\A(?!UNSPECIFIED\Z|RAPID\Z|REGULAR\Z|STABLE\Z).*\Z"), 34 | st.sampled_from(ReleaseChannel), 35 | ) 36 | def test_release_channel(self, invalid: str, valid: ReleaseChannel): 37 | """test ReleaseChannel""" 38 | 39 | with self.assertRaises(ValueError): 40 | _x = ReleaseChannel(invalid) 41 | 42 | self.assertEqual(valid, ReleaseChannel(valid.value)) 43 | 44 | 45 | # ---------------------------------------------------------------------------- 46 | def test_job_status(): 47 | for s in JobStatus: 48 | terminal = s.is_terminal() 49 | if s.name in ["FAILED", "SUCCEEDED", "UNAVAILABLE"]: 50 | assert terminal 51 | else: 52 | assert not terminal 53 | 54 | # completed jobs 55 | status = V1JobStatus(completion_time=datetime.now(), succeeded=1) 56 | job_info = V1Job(status=status) 57 | job_status = JobStatus.from_job_info(job_info) 58 | assert job_status == JobStatus.SUCCEEDED 59 | 60 | status = V1JobStatus(completion_time=datetime.now(), succeeded=0) 61 | job_info = V1Job(status=status) 62 | job_status = JobStatus.from_job_info(job_info) 63 | assert job_status == JobStatus.FAILED 64 | 65 | # active jobs 66 | status = V1JobStatus(completion_time=None, active=1) 67 | job_info = V1Job(status=status) 68 | job_status = JobStatus.from_job_info(job_info) 69 | assert job_status == JobStatus.RUNNING 70 | 71 | # pending jobs 72 | status = V1JobStatus(completion_time=None, active=0) 73 | job_info = V1Job(status=status) 74 | job_status = JobStatus.from_job_info(job_info) 75 | assert job_status == JobStatus.PENDING 76 | 77 | # unknown state 78 | status = V1JobStatus() 79 | job_info = V1Job(status=status) 80 | job_status = JobStatus.from_job_info(job_info) 81 | assert job_status == JobStatus.STATE_UNSPECIFIED 82 | 83 | job_info = V1Job() 84 | job_status = JobStatus.from_job_info(job_info) 85 | assert job_status == JobStatus.STATE_UNSPECIFIED 86 | 87 | job_status = JobStatus.from_job_info(None) 88 | assert job_status == JobStatus.STATE_UNSPECIFIED 89 | -------------------------------------------------------------------------------- /tests/caliban/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/tests/caliban/resources/__init__.py -------------------------------------------------------------------------------- /tests/caliban/resources/test_caliban_launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import argparse 18 | import builtins 19 | from google.auth import credentials 20 | import json 21 | import os 22 | import pytest 23 | import tempfile 24 | from typing import Any 25 | 26 | from caliban.resources import caliban_launcher 27 | 28 | 29 | @pytest.mark.parametrize("obj", [["a", 2, 3], {"a": 1, "b": 2}]) 30 | def test_parse_json(obj: Any): 31 | # valid json, type 32 | j = caliban_launcher._parse_json("foo", json.dumps(obj), type(obj)) 33 | assert j == obj 34 | 35 | # valid json, invalid type 36 | with pytest.raises(argparse.ArgumentTypeError): 37 | j = caliban_launcher._parse_json("bar", json.dumps(None), int) 38 | 39 | # invalid json 40 | with pytest.raises(argparse.ArgumentTypeError): 41 | j = caliban_launcher._parse_json("baz", "[", int) 42 | 43 | 44 | def test_start_services(): 45 | with tempfile.TemporaryDirectory() as tmpdir: 46 | outfile = os.path.join(tmpdir, "bar") 47 | svc = [["bash", "-c", "touch $FOO"]] 48 | env = {"FOO": outfile} 49 | caliban_launcher._start_services(svc, env, delay=1) 50 | 51 | assert os.path.exists(outfile) 52 | 53 | 54 | def test_execute_command(): 55 | with tempfile.TemporaryDirectory() as tmpdir: 56 | outfile = os.path.join(tmpdir, "bar") 57 | cmd = ["bash", "-c"] 58 | args = ["touch $FOO"] 59 | env = {"FOO": outfile} 60 | caliban_launcher._execute_command(cmd, args, env) 61 | 62 | assert os.path.exists(outfile) 63 | 64 | 65 | def test_load_config_file(monkeypatch): 66 | monkeypatch.setattr(os.path, "exists", lambda x: False) 67 | assert caliban_launcher._load_config_file() == {} 68 | 69 | cfg = {"foo": 7} 70 | 71 | class MockFile: 72 | def __enter__(self): 73 | pass 74 | 75 | def __exit__(self, a, b, c): 76 | pass 77 | 78 | monkeypatch.setattr(os.path, "exists", lambda x: True) 79 | monkeypatch.setattr(builtins, "open", lambda x: MockFile()) 80 | monkeypatch.setattr(json, "load", lambda x: cfg) 81 | assert caliban_launcher._load_config_file() == cfg 82 | 83 | 84 | def test_get_config(monkeypatch): 85 | cfg = {"foo": 3, "env": {"a": 0}, "services": ["ls"]} 86 | 87 | class MockArgs: 88 | def __init__(self): 89 | self.caliban_config = cfg 90 | 91 | class MockFile: 92 | def __enter__(self): 93 | pass 94 | 95 | def __exit__(self, a, b, c): 96 | pass 97 | 98 | monkeypatch.setattr(os.path, "exists", lambda x: True) 99 | monkeypatch.setattr(builtins, "open", lambda x: MockFile()) 100 | monkeypatch.setattr(json, "load", lambda x: {"env": {}, "services": []}) 101 | assert caliban_launcher._get_config(MockArgs()) == cfg 102 | 103 | 104 | def test_ensure_non_null_project(monkeypatch): 105 | # test case where GOOGLE_CLOUD_PROJECT is already set 106 | env = {"foo": "bar", "GOOGLE_CLOUD_PROJECT": "project"} 107 | 108 | new_env = caliban_launcher._ensure_non_null_project(env) 109 | assert env == new_env 110 | 111 | # GOOGLE_CLOUD_PROJECT not set, but valid project from default() 112 | def mock_default(scopes=None, request=None, quota_project_id=None): 113 | return (credentials.AnonymousCredentials(), "foo") 114 | 115 | monkeypatch.setattr("google.auth.default", mock_default) 116 | env = {"foo": "bar"} 117 | 
assert caliban_launcher._ensure_non_null_project(env) == env 118 | 119 | # GOOGLE_CLOUD_PROJECT not set, no valid project from default() 120 | def mock_default(scopes=None, request=None, quota_project_id=None): 121 | return (credentials.AnonymousCredentials(), None) 122 | 123 | monkeypatch.setattr("google.auth.default", mock_default) 124 | env = {"foo": "bar"} 125 | new_env = caliban_launcher._ensure_non_null_project(env) 126 | for k, v in env.items(): 127 | assert new_env.get(k) == v 128 | 129 | assert new_env.get("GOOGLE_CLOUD_PROJECT") is not None 130 | -------------------------------------------------------------------------------- /tests/caliban/test_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import unittest 18 | 19 | import caliban.cli as c 20 | import caliban.platform.cloud.types as ct 21 | from caliban.config import JobMode 22 | 23 | 24 | class CLITestSuite(unittest.TestCase): 25 | """Tests for caliban.cli.""" 26 | 27 | def test_job_mode(self): 28 | """Tests for all possible combinations of the three arguments to 29 | resolve_job_mode. 30 | 31 | """ 32 | gpu_spec = ct.GPUSpec(ct.GPU.P100, 4) 33 | tpu_spec = ct.TPUSpec(ct.TPU.V2, 8) 34 | 35 | def assertMode(expected_mode, use_gpu, gpu_spec, tpu_spec): 36 | mode = c._job_mode(use_gpu, gpu_spec, tpu_spec) 37 | self.assertEqual(mode, expected_mode) 38 | 39 | # --nogpu and no override. 40 | assertMode(JobMode.CPU, False, None, None) 41 | 42 | # TPU doesn't need GPUs 43 | assertMode(JobMode.CPU, False, None, tpu_spec) 44 | 45 | # Default GPUSpec filled in. 46 | assertMode(JobMode.GPU, True, None, None) 47 | 48 | # Explicit GPU spec, so GPU gets attached. 49 | assertMode(JobMode.GPU, True, gpu_spec, None) 50 | assertMode(JobMode.GPU, True, gpu_spec, tpu_spec) 51 | 52 | # If NO explicit GPU is supplied but a TPU is supplied, execute in CPU 53 | # mode, ie, don't attach a GPU. 54 | assertMode(JobMode.CPU, True, None, tpu_spec) 55 | 56 | # explicit GPU spec is incompatible with --nogpu in both of the following 57 | # cases, irrespective of TPU spec. 58 | with self.assertRaises(AssertionError): 59 | c._job_mode(False, gpu_spec, None) 60 | 61 | with self.assertRaises(AssertionError): 62 | c._job_mode(False, gpu_spec, tpu_spec) 63 | -------------------------------------------------------------------------------- /tests/caliban/util/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /tests/caliban/util/test_argparse.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from collections import OrderedDict 18 | 19 | import caliban.util.argparse as ua 20 | 21 | 22 | def test_expand_args(): 23 | m = OrderedDict([("a", "item"), ("b", None), ("c", "d")]) 24 | expanded = ua.expand_args(m) 25 | 26 | # None is excluded from the results. 27 | assert expanded == ["a", "item", "b", "c", "d"] 28 | 29 | 30 | def test_is_key(): 31 | """A key is anything that starts with a dash; nothing else!""" 32 | assert ua.is_key("--face") 33 | assert ua.is_key("-f") 34 | assert not ua.is_key("") 35 | assert not ua.is_key("face") 36 | assert not ua.is_key("f") 37 | 38 | # this should never happen, but what the heck, why not test that it's a 39 | # fine thing, accepted yet strange. 40 | assert ua.is_key("-----face") 41 | -------------------------------------------------------------------------------- /tests/caliban/util/test_auth.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from subprocess import CalledProcessError 18 | 19 | from google.oauth2.credentials import Credentials 20 | 21 | import caliban.util.auth as a 22 | 23 | 24 | def register_auth(process, **kwargs): 25 | process.register_subprocess(["gcloud", "auth", "print-access-token"], **kwargs) 26 | 27 | 28 | def fail_process(process): 29 | process.returncode = 1 30 | raise CalledProcessError(1, "cmd", "exception! Not logged in!") 31 | 32 | 33 | def test_auth_access_token(fake_process): 34 | """Check that if the user has logged in with `gcloud auth login`, 35 | `auth_access_token` returns the correct token.
36 | 37 | """ 38 | token = "token" 39 | register_auth(fake_process, stdout=token) 40 | assert a.auth_access_token() == token 41 | 42 | 43 | def test_missing_auth_access_token(fake_process): 44 | """Check that if the user has NOT logged in with `gcloud auth login`, 45 | `auth_access_token` returns None. 46 | 47 | """ 48 | register_auth(fake_process, callback=fail_process) 49 | assert a.auth_access_token() is None 50 | 51 | 52 | def test_gcloud_auth_credentials(fake_process): 53 | """Check that if the user has logged in with `gcloud auth login`, 54 | a proper instance of Credentials is returned. 55 | 56 | """ 57 | token = "token" 58 | register_auth(fake_process, stdout=token) 59 | assert isinstance(a.gcloud_auth_credentials(), Credentials) 60 | 61 | 62 | def test_missing_gcloud_auth_credentials(fake_process): 63 | """Check that if the user has NOT logged in with `gcloud auth login`, 64 | `gcloud_auth_credentials` returns None. 65 | 66 | """ 67 | register_auth(fake_process, callback=fail_process) 68 | assert a.gcloud_auth_credentials() is None 69 | -------------------------------------------------------------------------------- /tests/caliban/util/test_schema.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import tempfile 18 | 19 | import schema as s 20 | 21 | import caliban.util.schema as us 22 | import pytest 23 | 24 | 25 | def test_directory(tmpdir): 26 | # Proper directories pass validation. 27 | assert us.Directory.validate(tmpdir) == tmpdir 28 | 29 | # random dirs that I made up don't! 30 | with pytest.raises(s.SchemaError) as e: 31 | assert us.Directory.validate("random") 32 | 33 | # Check that the formatting string works. 34 | assert e.match("Directory 'random' doesn't exist") 35 | 36 | 37 | def test_file(): 38 | with tempfile.NamedTemporaryFile() as tmp: 39 | # Existing files pass validation. 40 | assert us.File.validate(tmp.name) == tmp.name 41 | 42 | # random paths that I made up don't! 43 | with pytest.raises(s.SchemaError) as e: 44 | assert us.File.validate("random") 45 | 46 | # Check that the formatting string works. 47 | assert e.match("File 'random' isn't") 48 | -------------------------------------------------------------------------------- /tests/caliban/util/test_tqdm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import io 18 | 19 | from tqdm.utils import _term_move_up 20 | 21 | import caliban.util.tqdm as ut 22 | 23 | 24 | def test_carriage_return(): 25 | def through(xs): 26 | buf = io.StringIO() 27 | f = ut.TqdmFile(file=buf) 28 | 29 | for x in xs: 30 | f.write(x) 31 | f.flush() 32 | 33 | return buf.getvalue() 34 | 35 | # Strings pass through tqdmfile with no newline attached. 36 | assert through(["Yo!"]) == "Yo!" 37 | 38 | # Empty lines do nothing. 39 | assert through(["", "", ""]) == "" 40 | 41 | # A carriage return is converted to a newline, but the next line, if it's 42 | # written, will have the proper prefix to trigger a carriage return. 43 | assert through(["Yo!\r"]) == "Yo!\n" 44 | 45 | # Boom, triggered. 46 | assert through(["Yo!\r", "continue"]) == f"Yo!\n{_term_move_up()}\rcontinue" 47 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """Configuration for Hypothesis tests.""" 17 | 18 | import os 19 | 20 | from hypothesis import Verbosity, settings 21 | 22 | settings.register_profile("ci", max_examples=1000) 23 | settings.register_profile("dev", max_examples=10) 24 | settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose) 25 | settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "default")) 26 | -------------------------------------------------------------------------------- /tests/context.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 
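17 | # Prepend the repository root to sys.path so the test suite can import the 18 | # in-repo caliban package even when it isn't installed.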
19 | import os 20 | import sys 21 | 22 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 23 | -------------------------------------------------------------------------------- /tutorials/README.md: -------------------------------------------------------------------------------- 1 | # Caliban Tutorials 2 | 3 | This directory contains a number of tutorials that show off various aspects of 4 | [Caliban](https://github.com/google/caliban). 5 | 6 | The `basic` directory contains the code for the ["Getting Started with 7 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban) 8 | tutorial on the main page of [Caliban's GitHub 9 | repository](https://github.com/google/caliban). 10 | 11 | More coming soon! 12 | -------------------------------------------------------------------------------- /tutorials/basic/.calibanconfig.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /tutorials/basic/README.md: -------------------------------------------------------------------------------- 1 | # Basic Tutorial 2 | 3 | This directory contains the code for the ["Getting Started with 4 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban) 5 | tutorial on the main page of [Caliban's GitHub 6 | repository](https://github.com/google/caliban). 7 | 8 | Visit ["Getting Started with 9 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban) for 10 | the full tutorial, and instructions on how to run the code in this folder. 11 | -------------------------------------------------------------------------------- /tutorials/basic/experiment.json: -------------------------------------------------------------------------------- 1 | {"learning_rate": [0.01, 0.001, 0.0001]} 2 | -------------------------------------------------------------------------------- /tutorials/basic/mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """This tutorial comes from the TensorFlow MNIST quickstart at 17 | https://www.tensorflow.org/tutorials/quickstart/beginner.
18 | 19 | """ 20 | import warnings 21 | 22 | import tensorflow as tf 23 | from absl import app, flags 24 | 25 | warnings.filterwarnings("ignore", category=DeprecationWarning) 26 | 27 | FLAGS = flags.FLAGS 28 | 29 | # Define a command-line argument using the Abseil library: 30 | # https://abseil.io/docs/python/guides/flags 31 | flags.DEFINE_float("learning_rate", 0.1, "Learning rate.") 32 | flags.DEFINE_integer("epochs", 3, "Epochs to train.") 33 | 34 | 35 | def get_keras_model(width=128, activation="relu"): 36 | """Returns an instance of a Keras Sequential model. 37 | https://www.tensorflow.org/api_docs/python/tf/keras/Sequential""" 38 | return tf.keras.models.Sequential( 39 | [ 40 | tf.keras.layers.Flatten(input_shape=(28, 28)), 41 | tf.keras.layers.Dense(width, activation=activation), 42 | tf.keras.layers.Dense(width, activation=activation), 43 | tf.keras.layers.Dense(10, activation=None), 44 | ] 45 | ) 46 | 47 | 48 | def main(_): 49 | """Train a model against the MNIST dataset and print performance metrics.""" 50 | mnist = tf.keras.datasets.mnist 51 | 52 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 53 | x_train, x_test = x_train / 255.0, x_test / 255.0 54 | 55 | model = get_keras_model() 56 | 57 | loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) 58 | optimizer = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate) 59 | 60 | model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) 61 | 62 | print( 63 | f"Training model with learning rate={FLAGS.learning_rate} for {FLAGS.epochs} epochs." 64 | ) 65 | model.fit(x_train, y_train, epochs=FLAGS.epochs) 66 | 67 | print("Model performance: ") 68 | model.evaluate(x_test, y_test, verbose=2) 69 | 70 | 71 | if __name__ == "__main__": 72 | app.run(main) 73 | -------------------------------------------------------------------------------- /tutorials/basic/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-cpu 2 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/.calibanconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "apt_packages" : ["openssh-client", "curl"], 3 | "mlflow_config" : {"project": "blueshift-research", 4 | "region": "us-central1", 5 | "db": "mlflow", 6 | "user": "postgres", 7 | "password": "mlflow", 8 | "artifact_root": "gs://blueshift-research/mlflow", 9 | "debug" : false} 10 | } 11 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/README.md: -------------------------------------------------------------------------------- 1 | # UV + MLFlow Tutorial [ALPHA!] 2 | 3 | This directory contains a demo of a model training workflow that uses the 4 | [uv-metrics](https://github.com/google/uv-metrics) library to persist metrics to 5 | an [MLFlow](https://mlflow.org/) tracking server. 6 | 7 | This is mostly here for testing and reference. Check back for a documentation 8 | update once the API settles down. 9 | 10 | ## Prerequisites 11 | 12 | Right now this tutorial only supports logging metrics to a SQL-based backing 13 | store, though we will add support for local storage in the future. For now you 14 | will need a Google Cloud SQL instance configured for this purpose, and an 15 | MLFlow server set up to serve results from 16 | this instance.
17 | 18 | To run this tutorial, you will need to edit the `.calibanconfig.json` 19 | file in this directory to reflect your database settings so that the training 20 | script can connect to the database and log metrics. The specific entries to 21 | edit here are in the `mlflow_config` entry in `.calibanconfig.json`: 22 | 23 | ``` 24 | { 25 | "apt_packages" : ["openssh-client", "curl"], 26 | "mlflow_config" : {"project": <project name>, 27 | "region": <region>, 28 | "db": <database name>, 29 | "user": <database user>, 30 | "password": <database password>, 31 | "artifact_root": <artifact storage root, e.g. gs://bucket/path>, 32 | "debug" : false} 33 | } 34 | ``` 35 | 36 | One note: artifact storage is not yet working completely, but please specify 37 | this entry anyway; we will update this tutorial once it is working properly. 38 | 39 | Once you have set these parameters properly, you should be able to run the tutorial code. 40 | 41 | ## Sanity Check (optional) 42 | 43 | A quick sanity check to test your database connection is to set the `debug` flag in 44 | the `.calibanconfig.json` file to `true`, and then use Caliban to run the `hello_world.sh` 45 | script. This script simply prints "hello, world", but by enabling the `debug` flag, we 46 | can check the status of the database connection. 47 | 48 | To run this test: 49 | 50 | ``` 51 | caliban run --nogpu hello_world.sh 52 | ``` 53 | 54 | If your database settings are configured properly, you should see output like the following: 55 | 56 | ``` 57 | Successfully built 5eb8dcef14ce 58 | I0807 13:02:53.008464 139963939288896 tqdm.py:90] Restoring pure python logging 59 | I0807 13:02:53.010536 139963939288896 run.py:74] 60 | I0807 13:02:53.010816 139963939288896 run.py:75] Job 1 - Experiment args: [] 61 | I0807 13:02:53.010974 139963939288896 run.py:198] Running command: docker run --ipc host -e PYTHONUNBUFFERED=1 -e COLUMNS=211 -e LINES=19 5eb8dcef14ce ... 62 | 2020/08/07 20:02:53 current FDs rlimit set to 1048576, wanted limit is 8500. Nothing to do here. 63 | 2020/08/07 20:02:53 using credential file for authentication; path="/home/<user>/.config/gcloud/application_default_credentials.json" 64 | 2020/08/07 20:02:54 Listening on /tmp/cloudsql/<project>:<region>:<instance>/.s.PGSQL.5432 for <project>:<region>:<instance> 65 | 2020/08/07 20:02:54 Ready for new connections 66 | INFO:root:/bin/bash hello_world.sh 67 | hello, world 68 | I0807 13:03:04.015075 139963939288896 run.py:111] Job 1 succeeded! 69 | ``` 70 | 71 | As long as you see `Ready for new connections`, then your configuration should be OK, and you 72 | can disable the `debug` flag and continue with the rest of the tutorial. 73 | 74 | ## Running a Job 75 | 76 | In the Caliban repository: 77 | 78 | ``` 79 | git checkout master && git pull 80 | cd tutorials/uv-metrics 81 | ``` 82 | 83 | Run a single job: 84 | 85 | ``` 86 | caliban run --nogpu trainer.train 87 | ``` 88 | 89 | Name the experiment group and run 3: 90 | 91 | ``` 92 | caliban run --experiment_config experiment.json --xgroup mlflow_tutorial --nogpu trainer.train 93 | ``` 94 | 95 | ## Check the MLFlow UI 96 | 97 | You may need to refresh, but the UI should now show multiple experiments. You can view the 98 | status and metrics for your jobs from the UI while your jobs are in progress, which is 99 | useful for long-running jobs.
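100 | 101 | ## Inspecting Runs Programmatically 102 | 103 | As a rough sketch (this is not part of the tutorial code), you can also query the 104 | tracking server directly with the `mlflow` client API. The tracking URI and 105 | experiment ID below are placeholders; substitute the values for your own server: 106 | 107 | ``` 108 | import mlflow 109 | 110 | # Point the client at your tracking server (placeholder address). 111 | mlflow.set_tracking_uri("http://your-mlflow-server:5000") 112 | 113 | # search_runs returns a pandas DataFrame with one row per run; logged 114 | # params and metrics appear as columns. 115 | runs = mlflow.search_runs(experiment_ids=["0"]) 116 | print(runs[["run_id", "status"]]) 117 | ```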
118 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/cli.py: -------------------------------------------------------------------------------- 1 | """CLI Interface for the UV-metrics tutorial example.""" 2 | 3 | import argparse 4 | 5 | from absl.flags import argparse_flags 6 | 7 | 8 | def create_parser(): 9 | """Creates and returns the argparse instance for the uv-metrics tutorial 10 | CLI. 11 | 12 | """ 13 | 14 | parser = argparse_flags.ArgumentParser( 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 16 | description="""Configurable arguments for the uv-metrics Caliban tutorial.""", 17 | prog="python -m mnist", 18 | ) 19 | 20 | parser.add_argument( 21 | "--gcloud_path", 22 | help="""Path for gcloud logs; if supplied, used for persisting logs. This must be of 23 | the form gs://BUCKET_NAME/subfolder. Logs will be stored in the supplied 24 | folder in a subfolder named after the current job run.""", 25 | ) 26 | 27 | parser.add_argument( 28 | "--local_path", 29 | help="""Path for local logs; if supplied, this location on the local filesystem is 30 | used for persisting logs in jsonl format. The path can be relative. Logs 31 | will be stored in the supplied folder in a subfolder named after the 32 | current job run.""", 33 | ) 34 | 35 | parser.add_argument( 36 | "--tensorboard_path", 37 | help="""project-local path for tensorboard logs; if supplied, this location on the 38 | local filesystem is used for persisting logs that tensorboard can 39 | read.""", 40 | ) 41 | 42 | parser.add_argument( 43 | "--learning_rate", "--lr", type=float, default=0.01, help="Learning rate." 44 | ) 45 | parser.add_argument("--epochs", type=int, default=3, help="Epochs to train.") 46 | 47 | return parser 48 | 49 | 50 | def parse_flags(argv): 51 | """Function required by absl.app.run. Internally generates a parser and returns 52 | the results of parsing the uv-metrics tutorial arguments. 53 | 54 | """ 55 | args = argv[1:] 56 | return create_parser().parse_args(args) 57 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/experiment.json: -------------------------------------------------------------------------------- 1 | {"learning_rate": [0.01, 0.001, 0.0001]} 2 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/hello_world.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo "hello, world" 3 | sleep 5 4 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2020 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
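16 | 17 | # NOTE: several dependencies below are pinned to exact versions to keep this 18 | # demo environment reproducible; loosen the pins with care.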
19 | from setuptools import find_packages, setup 20 | 21 | REQUIRED_PACKAGES = [ 22 | "alembic==1.4.2", 23 | "google-cloud-storage", 24 | "matplotlib", 25 | "mlflow==1.10.0", 26 | "pg8000==1.16.1", 27 | "sqlalchemy==1.3.13", 28 | "tensorflow-cpu", 29 | "tensorflow_datasets", 30 | "uv-metrics>=0.4.2", 31 | ] 32 | 33 | setup( 34 | version="0.0.1", 35 | name="uv-metrics-tutorial", 36 | description="UV Metrics example.", 37 | packages=find_packages(exclude=("tests", "docs")), 38 | install_requires=REQUIRED_PACKAGES, 39 | ) 40 | -------------------------------------------------------------------------------- /tutorials/uv-metrics/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/tutorials/uv-metrics/trainer/__init__.py -------------------------------------------------------------------------------- /tutorials/uv-metrics/trainer/cli.py: -------------------------------------------------------------------------------- 1 | """CLI Interface for the uv-metrics tutorial trainer.""" 2 | 3 | import argparse 4 | 5 | from absl.flags import argparse_flags 6 | 7 | 8 | def create_parser(): 9 | """Creates and returns the argparse instance for the uv-metrics trainer 10 | CLI. 11 | 12 | """ 13 | 14 | parser = argparse_flags.ArgumentParser( 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 16 | description="""Configurable arguments for the UV Metrics demo.""", 17 | prog="python -m trainer.train", 18 | ) 19 | 20 | parser.add_argument( 21 | "--gcloud_path", 22 | help="""Path for gcloud logs; if supplied, used for persisting logs. This must be of 23 | the form gs://BUCKET_NAME/subfolder. Logs will be stored in the supplied 24 | folder in a subfolder named after the current job run.""", 25 | ) 26 | 27 | parser.add_argument( 28 | "--local_path", 29 | help="""Path for local logs; if supplied, this location on the local filesystem is 30 | used for persisting logs in jsonl format. The path can be relative. Logs 31 | will be stored in the supplied folder in a subfolder named after the 32 | current job run.""", 33 | ) 34 | 35 | parser.add_argument( 36 | "--tensorboard_path", 37 | help="""project-local path for tensorboard logs; if supplied, this location on the 38 | local filesystem is used for persisting logs that tensorboard can 39 | read.""", 40 | ) 41 | 42 | parser.add_argument( 43 | "--activation", 44 | help="""Activation strings. Choose from the options at 45 | https://www.tensorflow.org/api_docs/python/tf/keras/activations""", 46 | default="relu", 47 | ) 48 | parser.add_argument( 49 | "--width", type=int, default=1000, help="Width of the network to train." 50 | ) 51 | parser.add_argument( 52 | "--depth", type=int, default=2, help="Depth of the network to train." 53 | ) 54 | parser.add_argument( 55 | "--learning_rate", 56 | "--lr", 57 | type=float, 58 | default=0.1, 59 | help="Learning rate to use while training.", 60 | ) 61 | 62 | return parser 63 | 64 | 65 | def parse_flags(argv): 66 | """Function required by absl.app.run. Internally generates a parser and returns 67 | the results of parsing the uv-metrics trainer arguments. 68 | 69 | """ 70 | args = argv[1:] 71 | return create_parser().parse_args(args) 72 | --------------------------------------------------------------------------------