├── .dir-locals.el
├── .dockerignore
├── .gitattributes
├── .github
│   └── workflows
│       ├── coverage.yml
│       ├── pre-commit.yml
│       ├── release.yml
│       └── workflow.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── COMMITTERS.md
├── CONTRIBUTING.md
├── LICENSE
├── LICENSE_SHORT
├── MANIFEST.in
├── Makefile
├── README.md
├── caliban
│   ├── __init__.py
│   ├── __main__.py
│   ├── _version.py
│   ├── cli.py
│   ├── config
│   │   ├── __init__.py
│   │   └── experiment.py
│   ├── docker
│   │   ├── __init__.py
│   │   ├── build.py
│   │   └── push.py
│   ├── expansion.py
│   ├── history
│   │   ├── __init__.py
│   │   ├── cli.py
│   │   ├── submit.py
│   │   ├── types.py
│   │   └── util.py
│   ├── main.py
│   ├── platform
│   │   ├── __init__.py
│   │   ├── cloud
│   │   │   ├── __init__.py
│   │   │   ├── core.py
│   │   │   ├── types.py
│   │   │   └── util.py
│   │   ├── gke
│   │   │   ├── __init__.py
│   │   │   ├── cli.py
│   │   │   ├── cluster.py
│   │   │   ├── constants.py
│   │   │   ├── types.py
│   │   │   └── util.py
│   │   ├── notebook.py
│   │   ├── run.py
│   │   └── shell.py
│   ├── resources
│   │   ├── __init__.py
│   │   ├── caliban_launcher.py
│   │   └── cloud_sql_proxy.py
│   └── util
│       ├── __init__.py
│       ├── argparse.py
│       ├── auth.py
│       ├── fs.py
│       ├── metrics.py
│       ├── schema.py
│       └── tqdm.py
├── cloudbuild.json
├── codemeta.json
├── dockerfiles
│   ├── Dockerfile
│   └── Dockerfile.gpu
├── docs
│   ├── Makefile
│   ├── _static
│   │   └── img
│   │       ├── cloud
│   │       │   ├── activate.png
│   │       │   ├── create_new_key.png
│   │       │   ├── create_service_account.png
│   │       │   ├── new_project.png
│   │       │   ├── project_id.png
│   │       │   ├── select_project.png
│   │       │   └── service_acct_permissions.png
│   │       └── gke
│   │           ├── cleanup_job.png
│   │           ├── cluster_create_progress.png
│   │           ├── cluster_dashboard.png
│   │           ├── job_logs.png
│   │           ├── node_pool_autoprovision.png
│   │           ├── pod_events.png
│   │           ├── pre_job_details.png
│   │           ├── pre_job_submission.png
│   │           ├── stackdriver_logs.png
│   │           ├── unschedulable.png
│   │           └── unschedulable_details.png
│   ├── cli
│   │   ├── caliban_build.rst
│   │   ├── caliban_cloud.rst
│   │   ├── caliban_cluster.rst
│   │   ├── caliban_notebook.rst
│   │   ├── caliban_resubmit.rst
│   │   ├── caliban_run.rst
│   │   ├── caliban_shell.rst
│   │   ├── caliban_status.rst
│   │   ├── caliban_stop.rst
│   │   └── expansion.rst
│   ├── cloud
│   │   ├── adc.rst
│   │   ├── ai_platform_tpu.rst
│   │   ├── bucket.rst
│   │   ├── gpu_specs.rst
│   │   ├── labels.rst
│   │   ├── rate_limit.rst
│   │   └── service_account.rst
│   ├── conf.py
│   ├── explore
│   │   ├── base_image.rst
│   │   ├── calibanconfig.rst
│   │   ├── custom_docker_run.rst
│   │   ├── custom_script_args.rst
│   │   ├── declaring_requirements.rst
│   │   ├── exp_stdin.rst
│   │   ├── experiment_broadcasting.rst
│   │   ├── experiment_groups.rst
│   │   ├── gcloud.rst
│   │   ├── mac.rst
│   │   ├── script_vs_module.rst
│   │   └── why_caliban.rst
│   ├── getting_started
│   │   ├── cloud.rst
│   │   ├── getting_caliban.rst
│   │   └── prerequisites.rst
│   ├── gke
│   │   ├── cluster_management.rst
│   │   ├── concepts.rst
│   │   ├── job_submission.rst
│   │   └── prereq.rst
│   ├── index.rst
│   ├── make.bat
│   ├── recipes
│   │   ├── dockerignore.rst
│   │   ├── flagfile.rst
│   │   ├── local_dir.rst
│   │   └── single_gpu.rst
│   └── requirements.txt
├── paper
│   ├── 10.21105.joss.02403.pdf
│   ├── paper.bib
│   └── paper.md
├── pylintrc
├── pyproject.toml
├── requirements-dev.txt
├── scripts
│   ├── bashrc
│   ├── build_dockerfiles.sh
│   ├── cloudbuild.py
│   ├── cloudbuild_config.json
│   └── run_tests.sh
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── caliban
│   │   ├── __init__.py
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   ├── test_config.py
│   │   │   └── test_experiment.py
│   │   ├── docker
│   │   │   ├── __init__.py
│   │   │   ├── test_build.py
│   │   │   └── test_push.py
│   │   ├── history
│   │   │   ├── __init__.py
│   │   │   └── test_history.py
│   │   ├── platform
│   │   │   ├── __init__.py
│   │   │   ├── cloud
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_types.py
│   │   │   │   └── test_util.py
│   │   │   └── gke
│   │   │       ├── __init__.py
│   │   │       ├── test_types.py
│   │   │       └── test_util.py
│   │   ├── resources
│   │   │   ├── __init__.py
│   │   │   └── test_caliban_launcher.py
│   │   ├── test_cli.py
│   │   └── util
│   │       ├── __init__.py
│   │       ├── test_argparse.py
│   │       ├── test_auth.py
│   │       ├── test_fs.py
│   │       ├── test_metrics.py
│   │       ├── test_schema.py
│   │       ├── test_tqdm.py
│   │       └── test_util.py
│   ├── conftest.py
│   └── context.py
├── tutorials
│   ├── README.md
│   ├── basic
│   │   ├── .calibanconfig.json
│   │   ├── README.md
│   │   ├── experiment.json
│   │   ├── mnist.py
│   │   └── requirements.txt
│   └── uv-metrics
│       ├── .calibanconfig.json
│       ├── README.md
│       ├── cli.py
│       ├── experiment.json
│       ├── hello_world.sh
│       ├── mnist.py
│       ├── setup.py
│       └── trainer
│           ├── __init__.py
│           ├── cli.py
│           └── train.py
└── versioneer.py
/.dir-locals.el:
--------------------------------------------------------------------------------
1 | ((python-mode
2 | . ((py-indent-offset . 2))))
3 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | # ignore .git and .cache folders
2 | .git
3 | .cache
4 | env
5 | tests
6 | *.egg-info
7 | Makefile
8 | pylintrc
9 | setup.cfg
10 | __pycache__
11 | .coverage
12 | .pytest_cache
13 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | caliban/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/.github/workflows/coverage.yml:
--------------------------------------------------------------------------------
1 | name: coverage
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches: [main]
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.11
17 | - name: Cache pip
18 | uses: actions/cache@v2
19 | with:
20 | # This path is specific to Ubuntu
21 | path: ~/.cache/pip
22 | # Look to see if there is a cache hit for the corresponding requirements file
23 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }}
24 | restore-keys: |
25 | ${{ runner.os }}-pip-
26 | ${{ runner.os }}-
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | pip install .
31 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
32 | - name: Run pytest, generate coverage
33 | run: |
34 | pytest --doctest-modules -v -s \
35 | --hypothesis-profile dev \
36 | --cov-config setup.cfg \
37 | --cov-report=xml \
38 | --cov caliban \
39 | caliban tests
40 | - name: Upload coverage to Codecov
41 | uses: codecov/codecov-action@v1
42 | with:
43 | fail_ci_if_error: true
44 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: pre-commit
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches: [main]
7 |
8 | jobs:
9 | pre-commit:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 |
14 | - uses: actions/setup-python@v4
15 | with:
16 | python-version: 3.11
17 |
18 | - uses: pre-commit/action@v3.0.0
19 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release to PyPi
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - name: Set up Python 3.11
13 | uses: actions/setup-python@v2
14 | with:
15 | python-version: 3.11
16 | - name: Install dependencies
17 | run: |
18 | python -m pip install --upgrade pip
19 | pip install setuptools wheel twine
20 | - name: Build and publish
21 | env:
22 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
23 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
24 | run: |
25 | python setup.py sdist bdist_wheel
26 | twine upload dist/*
27 |
--------------------------------------------------------------------------------
/.github/workflows/workflow.yml:
--------------------------------------------------------------------------------
1 | name: build
2 |
3 | on:
4 | pull_request:
5 | push:
6 | branches: [main]
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | strategy:
12 | matrix:
13 | python-version: ["3.9", "3.10", "3.11"]
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Cache pip
22 | uses: actions/cache@v2
23 | with:
24 | # This path is specific to Ubuntu
25 | path: ~/.cache/pip
26 | # Look to see if there is a cache hit for the corresponding requirements file
27 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }}
28 | restore-keys: |
29 | ${{ runner.os }}-pip-
30 | ${{ runner.os }}-
31 | - name: Install dependencies
32 | run: |
33 | python -m pip install --upgrade pip
34 | pip install .
35 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
36 | - name: Run pytest
37 | run: |
38 | pytest --doctest-modules -v -s \
39 | --hypothesis-profile dev \
40 | caliban tests
41 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | output-dist
106 | keras_tensorboard
107 | keras_export
108 |
109 | # emacs backup files
110 | *~
111 | auto
112 | mlruns
113 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | repos:
16 | - repo: https://github.com/pre-commit/pre-commit-hooks
17 | rev: v2.3.0
18 | hooks:
19 | - id: check-yaml
20 | - id: end-of-file-fixer
21 | - id: trailing-whitespace
22 |
23 | - repo: https://github.com/astral-sh/ruff-pre-commit
24 | rev: v0.1.11
25 | hooks:
26 | - id: ruff
27 | types_or: [ python, pyi, jupyter ]
28 |
29 | - id: ruff-format
30 | types_or: [ python, pyi, jupyter ]
31 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Build documentation in the docs/ directory with Sphinx
9 | sphinx:
10 | configuration: docs/conf.py
11 |
12 | # Optionally set the version of Python and requirements required to build your docs
13 | python:
14 | version: 3.7
15 | install:
16 | - requirements: docs/requirements.txt
17 |
--------------------------------------------------------------------------------
/COMMITTERS.md:
--------------------------------------------------------------------------------
1 | # Committers
2 |
3 | These are the folks who can +1 a pull request and approve it for merge.
4 |
5 | ## Active
6 |
7 | | Name | Handle |
8 | |-----------------|------------------------------------------------------|
9 | | Sam Ritchie | [@sritchie](https://github.com/sritchie) |
10 | | Ambrose Slone | [@ajslone](https://github.com/ajslone) |
11 | | Guy Gur-Ari | [@guygurari](https://github.com/guygurari) |
12 | | Vinay Ramasesh | [@ramasesh](https://github.com/ramasesh) |
13 |
14 |
15 | ## Emeritus
16 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | So you want to add some code to Caliban. Excellent!
4 |
5 | Pull requests and bug reports are always welcome! This guide collects
6 | everything you need to know in order to get started contributing to
7 | Caliban.
8 |
9 | The TL;DR is:
10 |
11 | - send us a pull request,
12 | - iterate on the feedback + discussion, and
13 | - get a +1 from a [Committer](COMMITTERS.md)
14 |
15 | in order to get your PR accepted.
16 |
17 | Issues should be reported on the [GitHub issue
18 | tracker](https://github.com/google/caliban/issues).
19 |
20 | If you want to discuss an idea for a new feature or ask us a question,
21 | discussion occurs primarily in the body of [GitHub
22 | Issues](https://github.com/google/caliban/issues), though the project is growing
23 | large enough that we may start a Gitter channel soon.
24 |
25 | The current list of active committers (who can +1 a pull request) can be found
26 | here: [COMMITTERS.md](COMMITTERS.md)
27 |
28 | A list of contributors to the project can be found at the project's
29 | [Contributors](https://github.com/google/caliban/graphs/contributors) page.
30 |
31 | ## Contributor License Agreement
32 |
33 | Contributions to this project must be accompanied by a Contributor License
34 | Agreement. You (or your employer) retain the copyright to your contribution;
35 | this simply gives us permission to use and redistribute your contributions as
36 | part of the project. Head over to <https://cla.developers.google.com/> to see
37 | your current agreements on file or to sign a new one.
38 |
39 | You generally only need to submit a CLA once, so if you've already submitted one
40 | (even if it was for a different project), you probably don't need to do it
41 | again.
42 |
43 | ## Developing in Caliban
44 |
45 | We use [pre-commit](https://pre-commit.com/) to manage a series of git
46 | pre-commit hooks for the project; for example, each time you commit code, the
47 | hooks will make sure that your Python is formatted properly. If it isn't, the
48 | hook will reformat it for you, so when you commit a second time you'll get
49 | past the hook.
50 |
51 | All hooks are defined in `.pre-commit-config.yaml`. To install these hooks,
52 | install `pre-commit` if you don't yet have it. I prefer using
53 | [pipx](https://github.com/pipxproject/pipx) so that `pre-commit` stays globally
54 | available.
55 |
56 | ```bash
57 | pipx install pre-commit
58 | ```
59 |
60 | Then install the hooks with this command:
61 |
62 | ```bash
63 | pre-commit install
64 | ```
65 |
66 | Now they'll run on every commit. If you want to run them manually, you can run either of these commands:
67 |
68 | ```bash
69 | pre-commit run --all-files
70 |
71 | # or this, if you've previously run `make build`:
72 | make lint
73 | ```
74 |
75 | ## Documentation
76 |
77 | We use Sphinx to generate docs. If you want to live-preview your changes to the
78 | documentation as you are editing, you can use
79 | [sphinx-reload](https://pypi.org/project/sphinx-reload/). To get this working:
80 |
81 | ```bash
82 | pipx install sphinx-reload
83 | ```
84 |
85 | Then, inside the caliban folder:
86 |
87 | ```bash
88 | make build
89 | sphinx-reload docs
90 | ```
91 |
92 | If all goes well, `sphinx-reload` will tell you it is serving the documentation
93 | on a port, which you can then open in your browser.
94 |
95 | ## Publishing Caliban
96 |
97 | - First, run `make build` to get your virtual environment set up.
98 | - Make sure that you're on the master branch!
99 | - Add a new tag with `git tag 0.2.3` or the equivalent.
100 | - Run `make release` to push the latest code and tags to all relevant
101 | repositories.
102 |
--------------------------------------------------------------------------------
/LICENSE_SHORT:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # https://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include caliban/_version.py
3 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ##
2 | # Variables
3 | ##
4 |
5 | ENV_NAME = env
6 | ENV_ACT = . env/bin/activate;
7 | PIP = $(ENV_NAME)/bin/pip
8 | PY = $(ENV_NAME)/bin/python
9 | PYTEST_ARGS = --doctest-modules -v -s --hypothesis-profile dev
10 | PYTEST_TARGET = caliban tests
11 | COVERAGE_ARGS = --cov-config setup.cfg --cov-report term-missing --cov
12 | COVERAGE_TARGET = caliban
13 |
14 | ##
15 | # Targets
16 | ##
17 |
18 | .PHONY: build
19 | build: clean install
20 |
21 | .PHONY: clean
22 | clean: clean-env clean-files
23 |
24 | .PHONY: clean-env
25 | clean-env:
26 | rm -rf $(ENV_NAME)
27 |
28 | .PHONY: clean-files
29 | clean-files:
30 | rm -rf .tox
31 | rm -rf .coverage
32 | find . -name \*.pyc -type f -delete
33 | find . -name \*.test.db -type f -delete
34 | find . -depth -name __pycache__ -type d -exec rm -rf {} \;
35 | rm -rf dist *.egg* build
36 |
37 | .PHONY: install
38 | install:
39 | rm -rf $(ENV_NAME)
40 | virtualenv -p python3 $(ENV_NAME)
41 | $(PIP) install -r requirements-dev.txt
42 | $(PIP) install -r docs/requirements.txt
43 | $(PIP) install -e .
44 |
45 | .PHONY: test
46 | test: lint pytest
47 |
48 | .PHONY: pytest
49 | pytest:
50 | $(ENV_ACT) pytest $(PYTEST_ARGS) $(COVERAGE_ARGS) $(COVERAGE_TARGET) $(PYTEST_TARGET)
51 |
52 | .PHONY: test-full
53 | test-full: lint test-setuppy clean-files
54 |
55 | .PHONY: test-setuppy
56 | test-setuppy:
57 | $(PY) setup.py test
58 |
59 | .PHONY: lint
60 | lint: pre-commit
61 |
62 | .PHONY: pre-commit
63 | pre-commit:
64 | 	$(ENV_ACT) pre-commit run --all-files
65 | .PHONY: push
66 | push:
67 | git push origin master
68 | git push --tags
69 |
70 | .PHONY: release-egg
71 | release-egg:
72 | $(ENV_ACT) python setup.py sdist bdist_wheel
73 | $(ENV_ACT) twine upload -r pypi dist/*
74 | rm -rf dist *.egg* build
75 |
76 | .PHONY: release
77 | release: push release-egg
78 |
--------------------------------------------------------------------------------
/caliban/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from ._version import get_versions
18 |
19 | __version__ = get_versions()["version"]
20 | del get_versions
21 |
--------------------------------------------------------------------------------
/caliban/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from .main import main
18 |
19 | if __name__ == "__main__":
20 | main()
21 |
--------------------------------------------------------------------------------
/caliban/docker/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/docker/push.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Functions required to interact with Docker to build and run images, shells
17 | and notebooks in a Docker environment.
18 |
19 | """
20 |
21 | import json
22 | import subprocess
23 |
24 | from absl import logging
25 |
26 |
27 | def _image_tag_for_project(
28 | project_id: str, image_id: str, include_tag: bool = True
29 | ) -> str:
30 | """Generate the GCR Docker image tag for the supplied pair of project_id and
31 | image_id.
32 |
33 | This function properly handles "domain scoped projects", where the project ID
34 | contains a domain name and project ID separated by :
35 | https://cloud.google.com/container-registry/docs/overview#domain-scoped_projects.
36 |
37 | """
38 | project_s = project_id.replace(":", "/")
39 | base = f"gcr.io/{project_s}/{image_id}"
40 | return f"{base}:latest" if include_tag else base
41 |
42 |
43 | def _gcr_list_tags(project_id: str, image_id: str):
44 | """Returns a sequence of metadata for all tags of the supplied image_id in the
45 | supplied project.
46 |
47 | """
48 | image_tag = _image_tag_for_project(project_id, image_id, include_tag=False)
49 | cmd = [
50 | "gcloud",
51 | "container",
52 | "images",
53 | "list-tags",
54 | f"--project={project_id}",
55 | "--format=json",
56 | image_tag,
57 | ]
58 | return json.loads(subprocess.check_output(cmd))
59 |
60 |
61 | def gcr_image_pushed(project_id: str, image_id: str) -> bool:
62 | """Returns true if the supplied image has been pushed to the container registry
63 | for the supplied project, false otherwise.
64 |
65 | """
66 | return len(_gcr_list_tags(project_id, image_id)) > 0
67 |
68 |
69 | def push_uuid_tag(project_id: str, image_id: str, force: bool = False) -> str:
70 | """Takes a base image and tags it for upload, then pushes it to a remote Google
71 | Container Registry.
72 |
73 | Returns the tag on a successful push.
74 | """
75 | image_tag = _image_tag_for_project(project_id, image_id)
76 |
77 | def missing_remotely():
78 | missing = not gcr_image_pushed(project_id, image_id)
79 | if not missing:
80 | logging.info(f"Skipping docker push, as {image_tag} already exists remotely.")
81 | return missing
82 |
83 | if force or missing_remotely():
84 | subprocess.run(["docker", "tag", image_id, image_tag], check=True)
85 | subprocess.run(["docker", "push", image_tag], check=True)
86 |
87 | return image_tag
88 |
--------------------------------------------------------------------------------
/caliban/expansion.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Entry point for Caliban's experiment config expansion."""
17 |
18 | from __future__ import absolute_import, division, print_function
19 |
20 | import json
21 | import logging as ll
22 | from typing import List
23 |
24 | from absl import app, logging
25 | from absl.flags import argparse_flags
26 |
27 | import caliban.config.experiment as ce
28 | from caliban import __version__
29 |
30 | ll.getLogger("caliban.expansion").setLevel(logging.ERROR)
31 |
32 |
33 | def expansion_parser():
34 | """Creates and returns the argparse instance for the experiment config
35 | expansion app.
36 |
37 | """
38 |
39 | parser = argparse_flags.ArgumentParser(
40 | description="Experiment config expander. For documentation, visit https://github.com/google/caliban",
41 | prog="expansion",
42 | )
43 | parser.add_argument(
44 | "--version", action="version", version="%(prog)s {}".format(__version__)
45 | )
46 | parser.add_argument(
47 | "--pprint", action="store_true", help="Pretty-print the config to stdout."
48 | )
49 | parser.add_argument(
50 | "--print_flags",
51 | action="store_true",
52 | help="Print the actual flags generated by each experiment in the expansion, \
53 | one per line.",
54 | )
55 | parser.add_argument(
56 | "experiment_config",
57 | type=ce.load_experiment_config,
58 | help="Path to an experiment config, or 'stdin' to read from stdin.",
59 | )
60 |
61 | return parser
62 |
63 |
64 | def parse_flags(argv):
65 | """Function required by absl.app.run. Internally generates a parser and returns
66 | the results of parsing caliban arguments.
67 |
68 | """
69 | args = argv[1:]
70 | return expansion_parser().parse_args(args)
71 |
72 |
73 | def _print_flags(expanded: List[ce.Experiment]) -> None:
74 | """Print the flags associated with each experiment in the supplied expansion
75 | list.
76 |
77 | """
78 | for m in expanded:
79 | flags = ce.experiment_to_args(m)
80 | print(" ".join(flags))
81 |
82 |
83 | def _print_json(expanded: List[ce.Experiment], pprint: bool = False) -> None:
84 | """Print the list of expanded experiments to stdout; if pprint is true,
85 | pretty-prints each JSON dict using an indent of 2, else prints the list with
86 | no newlines.
87 |
88 | """
89 | indent = 2 if pprint else None
90 | print(json.dumps(expanded, indent=indent))
91 |
92 |
93 | def run_app(args):
94 | """Main function to run the Caliban app. Accepts a Namespace-type output of an
95 | argparse argument parser.
96 |
97 | """
98 | conf = args.experiment_config
99 | expanded = ce.expand_experiment_config(conf)
100 |
101 | if args.print_flags:
102 | _print_flags(expanded)
103 | else:
104 | _print_json(expanded, pprint=args.pprint)
105 |
106 |
107 | def main():
108 | app.run(run_app, flags_parser=parse_flags)
109 |
110 |
111 | if __name__ == "__main__":
112 | main()
113 |
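114 | # Usage sketch from a shell, assuming this module is exposed under the
115 | # `expansion` prog name declared above (experiment.json is a hypothetical
116 | # experiment config):
117 | #
118 | #   expansion --pprint experiment.json
119 | #   expansion --print_flags experiment.json
120 | #   cat experiment.json | expansion stdin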
--------------------------------------------------------------------------------
/caliban/history/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/history/submit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """caliban utilities for job re-submission"""
17 |
18 | from typing import List, Optional
19 |
20 | import caliban.platform.cloud.core as cloud
21 | import caliban.platform.gke.cli as gke_cli
22 | import caliban.platform.run as r
23 | from caliban.history.types import JobSpec, Platform
24 |
25 |
26 | # ----------------------------------------------------------------------------
27 | def submit_job_specs(
28 | specs: List[JobSpec],
29 | platform: Platform,
30 | project_id: Optional[str] = None,
31 | credentials_path: Optional[str] = None,
32 | ) -> None:
33 | """submits a job spec"""
34 |
35 | if len(specs) == 0:
36 | return
37 |
38 | if platform == Platform.LOCAL:
39 | return r.execute_jobs(job_specs=specs)
40 |
41 | if platform == Platform.CAIP:
42 | return cloud.submit_job_specs(
43 | specs=specs,
44 | project_id=project_id,
45 | credentials_path=credentials_path,
46 | num_specs=len(specs),
47 | )
48 |
49 | if platform == Platform.GKE:
50 | return gke_cli.submit_job_specs(
51 | args={
52 | "cloud_key": credentials_path,
53 | "project_id": project_id,
54 | "specs": specs,
55 | },
56 | )
57 |
58 | return None
59 |
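60 | # Usage sketch (hedged): `specs` is a hypothetical list of JobSpec objects,
61 | # e.g. recovered from caliban's job history:
62 | #
63 | #   submit_job_specs(specs, Platform.LOCAL)
64 | #   submit_job_specs(specs, Platform.CAIP, project_id="my-project")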
--------------------------------------------------------------------------------
/caliban/platform/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/platform/cloud/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/platform/cloud/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Utilities relevant to AI Platform.
18 | """
19 | import re
20 | from typing import Dict, List, Optional, Tuple, Union
21 |
22 | import caliban.util as u
23 | import caliban.util.argparse as ua
24 |
25 | # key and value for labels can be at most this-many-characters long.
26 | AI_PLATFORM_MAX_LABEL_LENGTH = 63
27 |
28 |
29 | def _truncate(s: str, max_length: int) -> str:
30 | """Returns the input string s truncated to be at most max_length characters
31 | long.
32 |
33 | """
34 | return s if len(s) <= max_length else s[0:max_length]
35 |
36 |
37 | def _clean_label(s: Optional[str], is_key: bool) -> str:
38 | """Processes the string into the sanitized format required by AI platform
39 | labels.
40 |
41 | https://cloud.google.com/ml-engine/docs/resource-labels
42 |
43 | """
44 | if s is None:
45 | return ""
46 |
47 | # periods are not allowed by AI Platform labels, but often occur in,
48 | # e.g., learning rates
49 | DECIMAL_REPLACEMENT = "_"
50 | s = s.replace(".", DECIMAL_REPLACEMENT)
51 |
52 | # lowercase, letters, - and _ are valid, so strip the leading dashes, make
53 | # everything lowercase and then kill any remaining unallowed characters.
54 | cleaned = re.sub(r"[^a-z0-9_-]", "", s.lower()).lstrip("-")
55 |
56 | # Keys must start with a letter. If is_key is set and the cleaned version
57 | # starts with something else, append `k`.
58 | if is_key and cleaned != "" and not cleaned[0].isalpha():
59 | cleaned = "k" + cleaned
60 |
61 | return _truncate(cleaned, AI_PLATFORM_MAX_LABEL_LENGTH)
62 |
63 |
64 | def key_label(k: Optional[str]) -> str:
65 | """converts the argument into a valid label, suitable for submission as a label
66 | key to Cloud.
67 |
68 | """
69 | return _clean_label(k, True)
70 |
71 |
72 | def value_label(v: Optional[str]) -> str:
73 | """converts the argument into a valid label, suitable for submission as a label
74 | value to Cloud.
75 |
76 | """
77 | return _clean_label(v, False)
78 |
79 |
80 | def script_args_to_labels(script_args: Optional[List[str]]) -> Dict[str, str]:
81 | """Converts the arguments supplied to our scripts into a dictionary usable as
82 | labels valid for Cloud submission.
83 |
84 | """
85 | ret = {}
86 |
87 | def process_pair(k, v):
88 | if ua.is_key(k):
89 | clean_k = key_label(k)
90 | if clean_k != "":
91 | ret[clean_k] = "" if ua.is_key(v) else value_label(v)
92 |
93 | if script_args is None or len(script_args) == 0:
94 | return ret
95 |
96 | elif len(script_args) == 1:
97 | process_pair(script_args[0], None)
98 |
99 | # Handle the case where the final argument in the list is a boolean flag.
100 | # This won't get picked up by partition.
101 | elif len(script_args) > 1:
102 | for k, v in u.partition(script_args, 2):
103 | process_pair(k, v)
104 |
105 | process_pair(script_args[-1], None)
106 |
107 | return ret
108 |
109 |
110 | def sanitize_labels(
111 | pairs: Union[Dict[str, str], List[Tuple[str, str]]],
112 | ) -> Dict[str, str]:
113 | """Turns a dict, or a list of unsanitized key-value pairs (each represented by
114 | a tuple) into a dictionary suitable to submit to Cloud as a label dict.
115 |
116 | """
117 | if isinstance(pairs, dict):
118 | return sanitize_labels(pairs.items())
119 |
120 | return {key_label(k): value_label(v) for (k, v) in pairs if key_label(k)}
121 |
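122 | # Worked examples of the sanitization rules implemented above:
123 | #
124 | #   key_label("--learning.rate")     => "learning_rate"
125 | #   key_label("0.5")                 => "k0_5"  (keys must start with a letter)
126 | #   value_label("0.5")               => "0_5"
127 | #   sanitize_labels({"LR": "0.01"})  => {"lr": "0_01"}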
--------------------------------------------------------------------------------
/caliban/platform/gke/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/platform/gke/constants.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """constants for gke"""
17 |
18 | import re
19 |
20 | from caliban.config import DEFAULT_MACHINE_TYPE, JobMode
21 | from caliban.platform.cloud.types import GPU, GPUSpec
22 | from caliban.platform.gke.types import ReleaseChannel
23 |
24 | COMPUTE_SCOPE_URL = "https://www.googleapis.com/auth/compute"
25 | COMPUTE_READONLY_SCOPE_URL = "https://www.googleapis.com/auth/compute.readonly"
26 | CLOUD_PLATFORM_SCOPE_URL = "https://www.googleapis.com/auth/cloud-platform"
27 | KUBE_SYSTEM_NAMESPACE = "kube-system"
28 | DEFAULT_NAMESPACE = "default"
29 | BATCH_V1_VERSION = "batch/v1"
30 | NODE_SELECTOR_GKE_ACCELERATOR = "cloud.google.com/gke-accelerator"
31 | NODE_SELECTOR_INSTANCE_TYPE = "beta.kubernetes.io/instance-type"
32 | NODE_SELECTOR_PREEMPTIBLE = "cloud.google.com/gke-preemptible"
33 | CONTAINER_RESOURCE_LIMIT_TPU = "cloud-tpus.google.com"
34 | CONTAINER_RESOURCE_LIMIT_GPU = "nvidia.com/gpu"
35 | CONTAINER_RESOURCE_REQUEST_CPU = "cpu"
36 | CONTAINER_RESOURCE_REQUEST_MEM = "memory"
37 | TEMPLATE_META_ANNOTATION_TPU_DRIVER = "tf-version.cloud-tpus.google.com"
38 | DEFAULT_TPU_DRIVER = "1.14"
39 | ZONE_DEFAULT = "-" # all zones
40 | DEFAULT_MACHINE_TYPE_CPU = DEFAULT_MACHINE_TYPE[JobMode.CPU].value
41 | DEFAULT_MACHINE_TYPE_GPU = DEFAULT_MACHINE_TYPE[JobMode.GPU].value
42 | DEFAULT_GPU_SPEC = GPUSpec(GPU.P100, 1)
43 | DASHBOARD_JOB_URL = "https://console.cloud.google.com/kubernetes/job"
44 | DASHBOARD_CLUSTER_URL = "https://console.cloud.google.com/kubernetes/clusters/details"
45 | MAX_GB_PER_CPU = 64
46 | DEFAULT_CLUSTER_NAME = "blueshift"
47 | VALID_JOB_FILE_EXT = (".yaml", ".json")
48 | DEFAULT_RELEASE_CHANNEL = ReleaseChannel.REGULAR
49 | CLUSTER_API_VERSION = "v1beta1"
50 |
51 | # default min_cpu for gpu/tpu -accelerated jobs (in milli-cpu)
52 | DEFAULT_MIN_CPU_ACCEL = 1500
53 | # default min_cpu for cpu-only jobs (in milli-cpu)
54 | DEFAULT_MIN_CPU_CPU = 31000
55 |
56 | # default min_mem for gpu/tpu jobs (in MB)
57 | DEFAULT_MIN_MEM_ACCEL = 7000
58 | # default min_mem for cpu-only jobs (in MB)
59 | DEFAULT_MIN_MEM_CPU = 25000
60 |
61 | # ----------------------------------------------------------------------------
62 | # The following urls specify kubernetes daemonsets that apply the appropriate
63 | # nvidia drivers to auto-created gpu instances. If this is not running, then your
64 | # gpu jobs will mysteriously fail to schedule, and you will be sad.
65 | # see https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers
66 |
67 | # daemonset for COS instances
68 | NVIDIA_DRIVER_COS_DAEMONSET_URL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml"
69 |
70 | # daemonset for Ubuntu instances
71 | NVIDIA_DRIVER_UBUNTU_DAEMONSET_URL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/ubuntu/daemonset-preloaded.yaml"
72 |
73 | # ----------------------------------------------------------------------------
74 | DNS_1123_RE = re.compile(r"\A[a-z0-9]([a-z0-9\-\.]*[a-z0-9])?\Z")
75 |
--------------------------------------------------------------------------------
/caliban/platform/gke/types.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """types relevant to gke"""
17 |
18 | from enum import Enum
19 | from typing import NamedTuple, Optional
20 |
21 | from google.auth.credentials import Credentials
22 | from kubernetes.client import V1Job
23 |
24 | # ----------------------------------------------------------------------------
25 | # Node image types
26 | # see https://cloud.google.com/kubernetes-engine/docs/concepts/node-images
27 | NodeImage = Enum(
28 | "NODE_IMAGE",
29 | {
30 | "COS": "cos",
31 | "UBUNTU": "ubuntu",
32 | "COS_CONTAINERD": "cos_containerd",
33 | "UBUNTU_CONTAINERD": "ubuntu_containerd",
34 | },
35 | )
36 |
37 | # ----------------------------------------------------------------------------
38 | # GKE operation status, see:
39 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1/projects.locations.operations
40 | OpStatus = Enum(
41 | "OP_STATUS",
42 | {
43 | "STATUS_UNSPECIFIED": "STATUS_UNSPECIFIED",
44 | "PENDING": "PENDING",
45 | "RUNNING": "RUNNING",
46 | "DONE": "DONE",
47 | "ABORTING": "ABORTING",
48 | },
49 | )
50 |
51 | # ----------------------------------------------------------------------------
52 | # Credentials data (credentials, project id)
53 | CredentialsData = NamedTuple(
54 | "CredentialsData",
55 | [("credentials", Optional[Credentials]), ("project_id", Optional[str])],
56 | )
57 |
58 | # ----------------------------------------------------------------------------
59 | # GKE release channel, see:
60 | # https://cloud.google.com/kubernetes-engine/docs/concepts/release-channels
61 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#Cluster.ReleaseChannel
62 | # https://cloud.google.com/kubernetes-engine/docs/reference/rest/v1beta1/projects.locations.clusters#channel
63 | ReleaseChannel = Enum(
64 | "RELEASE_CHANNEL",
65 | {
66 | "UNSPECIFIED": "UNSPECIFIED",
67 | "RAPID": "RAPID",
68 | "REGULAR": "REGULAR",
69 | "STABLE": "STABLE",
70 | },
71 | )
72 |
73 |
74 | # ----------------------------------------------------------------------------
75 | class JobStatus(Enum):
76 | """gke job status"""
77 |
78 | STATE_UNSPECIFIED = 0
79 | PENDING = 1
80 | RUNNING = 2
81 | FAILED = 3
82 | SUCCEEDED = 4
83 | UNAVAILABLE = 5
84 |
85 | def is_terminal(self) -> bool:
86 | return self.name in ["FAILED", "SUCCEEDED", "UNAVAILABLE"]
87 |
88 | @classmethod
89 | def from_job_info(cls, job_info: V1Job) -> "JobStatus":
90 | if job_info is None:
91 | return JobStatus.STATE_UNSPECIFIED
92 |
93 | if job_info.status is None:
94 | return JobStatus.STATE_UNSPECIFIED
95 |
96 | # completed
97 | if job_info.status.completion_time is not None:
98 | if job_info.status.succeeded is not None:
99 | if job_info.status.succeeded > 0:
100 | return JobStatus.SUCCEEDED
101 | else:
102 | return JobStatus.FAILED
103 |
104 | # active/pending
105 | if job_info.status.active is not None:
106 | if job_info.status.active > 0:
107 | return JobStatus.RUNNING
108 | else:
109 | return JobStatus.PENDING
110 |
111 | # unknown
112 | return JobStatus.STATE_UNSPECIFIED
113 |
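114 | # Sketch of the mapping implemented by from_job_info:
115 | #
116 | #   job_info is None, or has no status      => STATE_UNSPECIFIED
117 | #   completion_time set, succeeded > 0      => SUCCEEDED
118 | #   completion_time set, succeeded == 0     => FAILED
119 | #   no completion time, active > 0          => RUNNING
120 | #   no completion time, active == 0         => PENDING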
--------------------------------------------------------------------------------
/caliban/platform/notebook.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Functions required to interact with Docker to build and run images, shells
17 | and notebooks in a Docker environment.
18 |
19 | """
20 |
21 | from typing import List, Optional
22 |
23 | from blessings import Terminal
24 |
25 | import caliban.config as c
26 | import caliban.docker.build as b
27 | import caliban.platform.shell as ps
28 | import caliban.util.fs as ufs
29 |
30 | t = Terminal()
31 |
32 |
33 | def run_notebook(
34 | job_mode: c.JobMode,
35 | port: Optional[int] = None,
36 | lab: Optional[bool] = None,
37 | version: Optional[bool] = None,
38 | run_args: Optional[List[str]] = None,
39 | **run_interactive_kwargs,
40 | ) -> None:
41 | """Start a notebook in the current working directory; the process will run
42 | inside of a Docker container that's identical to the environment available to
43 | Cloud jobs that are submitted by `caliban cloud`, or local jobs run with
44 | `caliban run`.
45 |
46 | If you pass mount_home=True, your Jupyter settings will persist across calls.
47 |
48 | Keyword args:
49 |
50 | - port: the port to pass to Jupyter when it boots, useful if you have
51 | multiple instances running on one machine.
52 | - lab: if True, starts jupyter lab, else jupyter notebook.
53 | - version: explicit Jupyter version to install.
54 |
55 | run_interactive_kwargs are all extra arguments taken by run_interactive.
56 |
57 | """
58 |
59 | if port is None:
60 | port = ufs.next_free_port(8888)
61 |
62 | if lab is None:
63 | lab = False
64 |
65 | if run_args is None:
66 | run_args = []
67 |
68 | inject_arg = b.NotebookInstall.lab if lab else b.NotebookInstall.jupyter
69 | jupyter_cmd = "lab" if lab else "notebook"
70 | jupyter_args = [
71 | "-m",
72 | "jupyter",
73 | jupyter_cmd,
74 | "--ip=0.0.0.0",
75 | "--port={}".format(port),
76 | "--no-browser",
77 | ]
78 | docker_args = ["-p", "{}:{}".format(port, port)] + run_args
79 |
80 | ps.run_interactive(
81 | job_mode,
82 | entrypoint="python",
83 | entrypoint_args=jupyter_args,
84 | run_args=docker_args,
85 | inject_notebook=inject_arg,
86 | jupyter_version=version,
87 | **run_interactive_kwargs,
88 | )
89 |
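90 | # Usage sketch (hedged): launch jupyter lab in CPU mode on the first free port
91 | # >= 8888, without persisting settings across calls:
92 | #
93 | #   run_notebook(c.JobMode.CPU, lab=True, mount_home=False)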
--------------------------------------------------------------------------------
/caliban/platform/shell.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Functions required to interact with Docker to build and run images, shells
17 | and notebooks in a Docker environment.
18 |
19 | """
20 |
21 | import os
22 | from pathlib import Path
23 | from typing import List, Optional
24 |
25 | import caliban.config as c
26 | import caliban.docker.build as b
27 | import caliban.platform.run as r
28 |
29 |
30 | def _home_mount_cmds(enable_home_mount: bool) -> List[str]:
31 | """Returns the argument needed by Docker to mount a user's local home directory
32 | into the home directory location inside their container.
33 |
34 | If enable_home_mount is false returns an empty list.
35 |
36 | """
37 | ret = []
38 | if enable_home_mount:
39 | ret = ["-v", "{}:{}".format(Path.home(), b.container_home())]
40 | return ret
41 |
42 |
43 | def _interactive_opts(workdir: str) -> List[str]:
44 | """Returns the basic arguments we want to run a docker process locally."""
45 | return [
46 | "-w",
47 | workdir,
48 | "-u",
49 | "{}:{}".format(os.getuid(), os.getgid()),
50 | "-v",
51 | "{}:{}".format(os.getcwd(), workdir),
52 | ]
53 |
54 |
55 | def run_interactive(
56 | job_mode: c.JobMode,
57 | workdir: Optional[str] = None,
58 | image_id: Optional[str] = None,
59 | run_args: Optional[List[str]] = None,
60 | mount_home: Optional[bool] = None,
61 | shell: Optional[b.Shell] = None,
62 | entrypoint: Optional[str] = None,
63 | entrypoint_args: Optional[List[str]] = None,
64 | **build_image_kwargs,
65 | ) -> None:
66 | """Start a live shell in the terminal, with all dependencies installed and the
67 | current working directory (and optionally the user's home directory) mounted.
68 |
69 | Keyword args:
70 |
71 | - job_mode: c.JobMode.
72 | - image_id: ID of the image to run. Supplying this will skip an image build.
73 | - run_args: extra arguments to supply to `docker run`.
74 | - mount_home: if True, mounts the user's $HOME directory into the container
75 | at `/home/$USERNAME`. If False, no home directory is mounted.
76 | - shell: name of the shell to install into the container. Also configures the
77 | entrypoint if that's not supplied.
78 | - entrypoint: command to run. Defaults to the executable command for the
79 | supplied shell.
80 | - entrypoint_args: extra arguments to supply to the entrypoint.
81 |
82 | any extra kwargs supplied are passed through to build_image.
83 |
84 | """
85 | if workdir is None:
86 | workdir = b.DEFAULT_WORKDIR
87 |
88 | if run_args is None:
89 | run_args = []
90 |
91 | if entrypoint_args is None:
92 | entrypoint_args = []
93 |
94 | if mount_home is None:
95 | mount_home = True
96 |
97 | if shell is None:
98 | # Only set a default shell if we're also mounting the home volume.
99 | # Otherwise a custom shell won't have access to the user's profile.
100 | shell = b.default_shell() if mount_home else b.Shell.bash
101 |
102 | if entrypoint is None:
103 | entrypoint = b.SHELL_DICT[shell].executable
104 |
105 | interactive_run_args = (
106 | _interactive_opts(workdir)
107 | + ["-it", "--entrypoint", entrypoint]
108 | + _home_mount_cmds(mount_home)
109 | + run_args
110 | )
111 |
112 | r.run(
113 | job_mode=job_mode,
114 | run_args=interactive_run_args,
115 | script_args=entrypoint_args,
116 | image_id=image_id,
117 | shell=shell,
118 | workdir=workdir,
119 | **build_image_kwargs,
120 | )
121 |
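122 | # Usage sketch (hedged): drop into a bash shell in CPU mode with the current
123 | # directory mounted, but without mounting the user's home directory:
124 | #
125 | #   run_interactive(c.JobMode.CPU, mount_home=False)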
--------------------------------------------------------------------------------
/caliban/resources/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/caliban/resources/cloud_sql_proxy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Python wrapper around Google's cloud_sql_proxy tool that accepts
17 | configuration via a JSON dictionary of the form:
18 |
19 | {
20 | "proxy": "path to cloud_sql_proxy",
21 | "path": "cloud_sql socket path",
22 | "project": "gcp_project",
23 | "region": "gcp_region",
24 | "db": "database_name",
25 | "creds": "path_to_credentials (optional)"
26 | }
27 |
28 | This script lives in a dotfile
29 | """
30 |
31 | import argparse
32 | import copy
33 | import json
34 | import logging
35 | import os
36 | import subprocess
37 | import sys
38 |
39 | logging.basicConfig(level=logging.INFO)
40 |
41 |
42 | # ----------------------------------------------------------------------------
43 | def _parser():
44 | parser = argparse.ArgumentParser(
45 | description="cloud_sql_proxy wrapper that allows JSON configuration.",
46 | prog="cloud_sql_proxy",
47 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
48 | )
49 |
50 | parser.add_argument("config", type=json.loads)
51 | return parser
52 |
53 |
54 | # ----------------------------------------------------------------------------
55 | def _parse_flags(argv):
56 | return _parser().parse_args(argv[1:])
57 |
58 |
59 | # ----------------------------------------------------------------------------
60 | def main(proxy="", path="", project="", region="", db="", creds=None, debug=False):
61 | cmd = [
62 | proxy,
63 | "-dir",
64 | path,
65 | "-instances",
66 | f"{project}:{region}:{db}",
67 | ]
68 |
69 | if not debug:
70 | cmd.append("-quiet")
71 |
72 | env = copy.copy(dict(os.environ))
73 |
74 | if creds is not None:
75 | env["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(os.path.expanduser(creds))
76 |
77 | subprocess.check_call(cmd, env=env)
78 |
79 |
80 | # ----------------------------------------------------------------------------
81 | if __name__ == "__main__":
82 | m = _parse_flags(sys.argv)
83 | main(**m.config)
84 |
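85 | # Usage sketch (all paths and GCP identifiers below are hypothetical):
86 | #
87 | #   python cloud_sql_proxy.py '{"proxy": "/usr/local/bin/cloud_sql_proxy",
88 | #     "path": "/tmp/cloudsql", "project": "my-project",
89 | #     "region": "us-central1", "db": "my-db"}'
90 | #
91 | # which runs: /usr/local/bin/cloud_sql_proxy -dir /tmp/cloudsql \
92 | #   -instances my-project:us-central1:my-db -quiet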
--------------------------------------------------------------------------------
/caliban/util/argparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Utilities for our job runner.
18 | """
19 | import argparse
20 | import itertools as it
21 | import os
22 | from typing import Dict, List, Optional, Tuple
23 |
24 | from blessings import Terminal
25 |
26 | import caliban.util as u
27 | import caliban.util.fs as ufs
28 | import schema as s
29 |
30 | t = Terminal()
31 |
32 |
33 | def expand_args(items: Dict[str, Optional[str]]) -> List[str]:
34 | """Converts the input map into a sequence of k, v pair strings. A None value is
35 | interpreted to mean that the key is a solo flag; it's omitted from the
36 | output.
37 |
38 | """
39 | pairs = [[k, v] if v is not None else [k] for k, v in items.items()]
40 | return list(it.chain.from_iterable(pairs))
41 |
42 |
43 | def argparse_schema(schema):
44 | """Wrapper that performs validation and converts SchemaErrors into
45 | ArgumentTypeErrors for better argument error reporting.
46 |
47 | """
48 |
49 | def check(x):
50 | try:
51 | return schema.validate(x)
52 | except s.SchemaError as e:
53 | raise argparse.ArgumentTypeError(e.code) from None
54 |
55 | return check
56 |
57 |
58 | # TODO: Now that we use schema, validated_package and parse_kv_pair should be
59 | # converted to schema instances.
60 |
61 |
62 | def validated_package(path: str) -> u.Package:
63 | """similar to generate_package but runs argparse validation on packages that
64 | don't actually exist in the filesystem.
65 |
66 | """
67 | p = ufs.generate_package(path)
68 |
69 | if not os.path.isdir(p.package_path):
70 | raise argparse.ArgumentTypeError(
71 | """Directory '{}' doesn't exist in directory. Code must be
72 | nested in a folder that exists in the current directory.""".format(p.package_path)
73 | )
74 |
75 | filename = p.script_path
76 | if not ufs.file_exists_in_cwd(filename):
77 | raise argparse.ArgumentTypeError(
78 | """File '{}' doesn't exist locally as a script or python module; code
79 | must live inside the current directory.""".format(filename)
80 | )
81 |
82 | return p
83 |
84 |
85 | def parse_kv_pair(s: str) -> Tuple[str, str]:
86 | """
87 | Parse a key, value pair, separated by '='
88 |
89 | On the command line (argparse) a declaration will typically look like:
90 | foo=hello
91 | or
92 | foo="hello world"
93 | """
94 | items = s.split("=")
95 | k = items[0].strip() # Remove whitespace around keys
96 |
97 | if len(items) <= 1:
98 | raise argparse.ArgumentTypeError(
99 | "Couldn't parse label '{}' into k=v format.".format(s)
100 | )
101 |
102 | v = "=".join(items[1:])
103 | return (k, v)
104 |
105 |
106 | def is_key(k: Optional[str]) -> bool:
107 | """Returns True if the argument is a valid argparse optional arg input, False
108 | otherwise.
109 |
110 | Strings that start with - or -- are considered valid for now.
111 |
112 | """
113 | return k is not None and len(k) > 0 and k[0] == "-"
114 |
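115 | # Usage sketches for the helpers above (all values are illustrative):
116 | #
117 | #   expand_args({"--lr": "0.01", "--verbose": None})
118 | #   # => ["--lr", "0.01", "--verbose"]
119 | #
120 | #   parse_kv_pair("foo=hello world")  # => ("foo", "hello world")
121 | #
122 | #   is_key("--foo"), is_key("foo")  # => (True, False)
123 | #
124 | #   parser.add_argument("--package", type=argparse_schema(some_schema))
125 | #   # reports schema violations as clean argparse errors at parse time.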
--------------------------------------------------------------------------------
/caliban/util/auth.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Utilities for interacting with the filesystem and packages.
17 |
18 | """
19 |
20 | from subprocess import CalledProcessError, check_output
21 | from typing import Optional
22 |
23 | from google.oauth2 import service_account
24 | from google.oauth2.credentials import Credentials
25 |
26 |
27 | def auth_access_token() -> Optional[str]:
28 | """Attempts to fetch the local Oauth2 access token from the user's environment.
29 | Returns the token if it exists, or None if not
30 |
31 | """
32 | try:
33 | ret = check_output(
34 | ["gcloud", "auth", "print-access-token"], encoding="utf8"
35 | ).rstrip()
36 | return ret if len(ret) > 0 else None
37 | except CalledProcessError:
38 | return None
39 |
40 |
41 | def gcloud_auth_credentials() -> Optional[Credentials]:
42 | """Attempt to generate credentials from the oauth2 workflow triggered by
43 | `gcloud auth login`. Returns the credentials if an access token is
44 | available, None otherwise.
45 | """
46 | token = auth_access_token()
47 | if token:
48 | return Credentials(token)
49 |
50 |
51 | def gcloud_credentials(credentials_path: Optional[str] = None) -> Optional[Credentials]:
52 | credentials = None
53 |
54 | if credentials_path is not None:
55 | credentials = service_account.Credentials.from_service_account_file(
56 | credentials_path
57 | )
58 | else:
59 | # attempt to fetch credentials acquired via `gcloud auth login`. If this
60 | # fails, the following API object will attempt to use application default
61 | # credentials.
62 | credentials = gcloud_auth_credentials()
63 |
64 | return credentials
65 |
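66 | # Usage sketch (the key path below is hypothetical):
67 | #
68 | #   creds = gcloud_credentials()  # token from `gcloud auth login`, if any
69 | #   creds = gcloud_credentials("/path/to/service_account_key.json")
70 | #
71 | # A None result means downstream API clients will fall back to application
72 | # default credentials.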
--------------------------------------------------------------------------------
/caliban/util/schema.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Useful shared schemas.
18 | """
19 | import os
20 | import sys
21 | from contextlib import contextmanager
22 | from typing import Optional
23 |
24 | import commentjson
25 |
26 | import caliban.util as u
27 | import schema as s
28 |
29 |
30 | class FatalSchemaError(Exception):
31 | """Wrapper for an exception that can bubble itself up to the top level of the
32 | program."""
33 |
34 | def __init__(self, message, context):
35 | self.message = message
36 | self.context = context
37 | super().__init__(self.message)
38 |
39 |
40 | @contextmanager
41 | def error_schema(context: Optional[str] = None):
42 | """Wrap functions that check schemas in this context manager to throw an
43 | appropriate error with a nice message.
44 |
45 | """
46 | prefix = ""
47 | if context is not None:
48 | prefix = f"\nValidation error while parsing {context}:\n"
49 |
50 | try:
51 | yield
52 | except s.SchemaError as e:
53 | raise FatalSchemaError(e.code, prefix)
54 |
55 |
56 | @contextmanager
57 | def fatal_errors():
58 | """Context manager meant to wrap an entire program and present schema errors in
59 | an easy-to-read way.
60 |
61 | """
62 | try:
63 | yield
64 | except FatalSchemaError as e:
65 | u.err(f"{e.context}\n{e.message}\n\n")
66 | sys.exit(1)
67 | except s.SchemaError as e:
68 | u.err(f"\n{e.code}\n\n")
69 | sys.exit(1)
70 |
71 |
72 | def load_json(path):
73 | with open(path) as f:
74 | return commentjson.load(f)
75 |
76 |
77 | Directory = s.Schema(
78 | os.path.isdir,
79 | error="""Directory '{}' doesn't exist on your system. Check yourself!""",
80 | )
81 |
82 | File = s.Schema(
83 | lambda path: os.path.isfile(os.path.expanduser(path)),
84 | error="""File '{}' isn't a valid file on your system. Try again!""",
85 | )
86 |
87 | Json = s.And(
88 | File,
89 | s.Use(
90 | load_json, error="""File '{}' doesn't seem to contain valid JSON. Try again!"""
91 | ),
92 | )
93 |
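94 | # Usage sketch (the config filename is hypothetical):
95 | #
96 | #   with fatal_errors():
97 | #     with error_schema("the caliban config"):
98 | #       config = Json.validate("caliban_config.json")
99 | #
100 | # A missing or malformed file exits with a readable message rather than a
101 | # raw SchemaError traceback.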
--------------------------------------------------------------------------------
/caliban/util/tqdm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 | Progress bar utilities.
18 | """
19 |
20 | import contextlib
21 | import sys
22 |
23 | from absl import logging
24 | from blessings import Terminal
25 |
26 | import tqdm
27 | from tqdm.utils import _term_move_up
28 |
29 | t = Terminal()
30 |
31 |
32 | class TqdmFile(object):
33 | """Dummy file-like that will write to tqdm"""
34 |
35 | file = None
36 | prefix = _term_move_up() + "\r"
37 |
38 | def __init__(self, file):
39 | self.file = file
40 | self._carriage_pending = False
41 |
42 | def write(self, line):
43 | if self._carriage_pending:
44 | line = self.prefix + line
45 | self._carriage_pending = False
46 |
47 | if line.endswith("\r"):
48 | self._carriage_pending = True
49 | line = line[:-1] + "\n"
50 |
51 | tqdm.tqdm.write(line, file=self.file, end="")
52 |
53 | def flush(self):
54 | return getattr(self.file, "flush", lambda: None)()
55 |
56 | def isatty(self):
57 | return getattr(self.file, "isatty", lambda: False)()
58 |
59 | def close(self):
60 | return getattr(self.file, "close", lambda: None)()
61 |
62 |
63 | def config_logging():
64 | """Overrides logging to go through TQDM.
65 |
66 | TODO use this call to kill then restore:
67 | https://github.com/tqdm/tqdm#redirecting-writing
68 |
69 | """
70 | h = logging.get_absl_handler()
71 | _old = h.python_handler
72 | h._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr))
73 | logging.use_python_logging()
74 |
75 |
76 | @contextlib.contextmanager
77 | def tqdm_logging():
78 | """Overrides logging to go through TQDM.
79 |
80 | https://github.com/tqdm/tqdm#redirecting-writing
81 |
82 | """
83 | handler = logging.get_absl_handler()
84 | orig = handler.python_handler
85 |
86 | try:
87 | handler._python_handler = logging.PythonHandler(stream=TqdmFile(sys.stderr))
88 |
89 | # The changes won't take effect if this hasn't been called. Defensively
90 | # call it again here.
91 | logging.use_python_logging()
92 | yield orig.stream
93 | except Exception:
94 | raise
95 | finally:
96 | handler._python_handler = orig
97 |
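98 | # Usage sketch:
99 | #
100 | #   with tqdm_logging() as stream:
101 | #     for _ in tqdm.tqdm(range(100), file=stream):
102 | #       logging.info("logs route through tqdm.write; the bar isn't clobbered")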
--------------------------------------------------------------------------------
/codemeta.json:
--------------------------------------------------------------------------------
1 | {
2 | "@context": "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld",
3 | "@type": "Code",
4 | "author": [
5 | {
6 | "@id": "http://orcid.org/0000-0002-0545-6360",
7 | "@type": "Person",
8 | "email": "samritchie@google.com",
9 | "name": "Sam Ritchie",
10 | "affiliation": "Google"
11 | },
12 | {
13 | "@id": "",
14 | "@type": "Person",
15 | "email": "aslone@google.com",
16 | "name": "Ambrose Slone",
17 | "affiliation": "Google"
18 | },
19 | {
20 | "@id": "http://orcid.org/0000-0003-0625-3327",
21 | "@type": "Person",
22 | "email": "ramasesh@google.com",
23 | "name": "Vinay Ramasesh",
24 | "affiliation": "Google"
25 | }
26 | ],
27 | "identifier": "",
28 | "maintainer": "http://orcid.org/0000-0002-0545-6360",
29 | "codeRepository": "https://github.com/google/caliban",
30 | "issueTracker": "https://github.com/google/caliban/issues",
31 | "datePublished": "2020-06-22",
32 | "dateModified": "2020-06-22",
33 | "dateCreated": "2020-06-22",
34 | "description": "Docker-based job manager for reproducible workflows",
35 | "keywords": "python, docker, machine learning, reproducibility",
36 | "license": "Apache 2.0",
37 | "title": "Caliban",
38 | "version": "0.2.5"
39 | }
40 |
--------------------------------------------------------------------------------
/dockerfiles/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This builds the base images that we can use for development at Blueshift.
16 | # Tensorflow 2.1 by default, but we can override the image when we call docker.
17 | #
18 | # docker build -t gcr.io/blueshift-playground/blueshift:cpu -f- .
32 |
33 | ARG GCLOUD_LOC=/usr/local/gcloud
34 | ARG PYTHON_VERSION=3.7
35 |
36 | # miniconda release archive is here: https://repo.anaconda.com/miniconda
37 | # see the docs here for managing python versions with conda:
38 | # https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-python.html
39 | ARG MINICONDA_VERSION=py37_4.8.2
40 |
41 | LABEL maintainer="samritchie@google.com"
42 |
43 | # See http://bugs.python.org/issue19846
44 | ENV LANG C.UTF-8
45 |
46 | # Install git so that users can declare git dependencies, and python3 plus
47 | # python3-virtualenv so we can generate an isolated Python environment inside
48 | # the container.
49 | RUN apt-get update && apt-get install -y --no-install-recommends \
50 | git \
51 | python3 \
52 | python3-virtualenv \
53 | wget && \
54 | apt-get clean && \
55 | rm -rf /var/lib/apt/lists/*
56 |
57 | # Some tools expect a "python" binary.
58 | RUN ln -s $(which python3) /usr/local/bin/python
59 |
60 | # install the google cloud SDK.
61 | RUN wget -nv \
62 | https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \
63 | mkdir -m 777 ${GCLOUD_LOC} && \
64 | tar xvzf google-cloud-sdk.tar.gz -C ${GCLOUD_LOC} && \
65 | rm google-cloud-sdk.tar.gz && \
66 | ${GCLOUD_LOC}/google-cloud-sdk/install.sh --usage-reporting=false \
67 | --path-update=false --bash-completion=false \
68 | --disable-installation-options && \
69 | rm -rf /root/.config/* && \
70 | ln -s /root/.config /config && \
71 | # Remove the backup directory that gcloud creates
72 | rm -rf ${GCLOUD_LOC}/google-cloud-sdk/.install/.backup
73 |
74 | # Path configuration
75 | ENV PATH $PATH:${GCLOUD_LOC}/google-cloud-sdk/bin
76 |
77 | COPY scripts/bashrc /etc/bash.bashrc
78 |
79 | # Install Miniconda and prep the system to activate our custom environment.
80 | RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${MINICONDA_VERSION}-Linux-x86_64.sh -O ~/miniconda.sh && \
81 | /bin/bash ~/miniconda.sh -b -p /opt/conda && \
82 | rm ~/miniconda.sh && \
83 | /opt/conda/bin/conda clean -tipsy && \
84 | ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
85 | echo ". /opt/conda/etc/profile.d/conda.sh" >> /etc/bash.bashrc && \
86 | echo "conda activate caliban" >> /etc/bash.bashrc
87 |
88 | RUN yes | /opt/conda/bin/conda create --name caliban python=${PYTHON_VERSION} && /opt/conda/bin/conda clean --all
89 |
90 | # This allows a user to install packages in the conda environment once it
91 | # launches.
92 | RUN chmod -R 757 /opt/conda/envs/caliban && mkdir /.cache && chmod -R 757 /.cache
93 |
94 | # This is equivalent to activating the env.
95 | ENV PATH /opt/conda/envs/caliban/bin:$PATH
96 |
97 | # This makes pip recognize our conda environment
98 | # as a virtual environment, so it installs editables properly
99 | # See https://github.com/conda/conda/issues/5861 for details
100 | ENV PIP_SRC /opt/conda/envs/caliban/pipsrc
101 |
--------------------------------------------------------------------------------
/dockerfiles/Dockerfile.gpu:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Google LLC
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ARG UBUNTU_VERSION=18.04
16 | ARG CUDA=10.1
17 |
18 | FROM nvidia/cuda${ARCH:+-$ARCH}:${CUDA}-base-ubuntu${UBUNTU_VERSION} as base
19 |
20 | # ARCH and CUDA are specified again because the FROM directive resets ARGs
21 | # (but their default value is retained if set previously)
22 | ARG ARCH
23 | ARG CUDA
24 | ARG CUDNN=7.6.4.38-1
25 | ARG CUDNN_MAJOR_VERSION=7
26 | ARG LIB_DIR_PREFIX=x86_64
27 | ARG LIBNVINFER=6.0.1-1
28 | ARG LIBNVINFER_MAJOR_VERSION=6
29 |
30 | # Needed for string substitution
31 | SHELL ["/bin/bash", "-c"]
32 |
33 | # These dependencies come from the list at the official Tensorflow GPU base
34 | # image:
35 | # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
36 |
37 | RUN apt-get update && apt-get install -y --no-install-recommends \
38 | build-essential \
39 | cuda-command-line-tools-${CUDA/./-} \
40 | # There appears to be a regression in libcublas10=10.2.2.89-1 which
41 | # prevents cublas from initializing in TF. See
42 | # https://github.com/tensorflow/tensorflow/issues/9489#issuecomment-562394257
43 | libcublas10=10.2.1.243-1 \
44 | cuda-nvrtc-${CUDA/./-} \
45 | cuda-cufft-${CUDA/./-} \
46 | cuda-curand-${CUDA/./-} \
47 | cuda-cusolver-${CUDA/./-} \
48 | cuda-cusparse-${CUDA/./-} \
49 | curl \
50 | libcudnn7=${CUDNN}+cuda${CUDA} \
51 | libfreetype6-dev \
52 | libhdf5-serial-dev \
53 | libzmq3-dev \
54 | pkg-config \
55 | software-properties-common \
56 | unzip \
57 | libnvinfer${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \
58 | libnvinfer-plugin${LIBNVINFER_MAJOR_VERSION}=${LIBNVINFER}+cuda${CUDA} \
59 | && apt-get clean \
60 | && rm -rf /var/lib/apt/lists/*
61 |
62 | # For CUDA profiling, TensorFlow requires CUPTI.
63 | ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
64 |
65 | # Link the libcuda stub to the location where tensorflow is searching for it and reconfigure
66 | # dynamic linker run-time bindings
67 | RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \
68 | && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \
69 | && ldconfig
70 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | # You can set these variables from the command line, and also
4 | # from the environment for the first two.
5 | SPHINXOPTS ?=
6 | SPHINXBUILD ?= ../env/bin/sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
20 |
--------------------------------------------------------------------------------
/docs/_static/img/cloud/activate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/activate.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/create_new_key.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/create_new_key.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/create_service_account.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/create_service_account.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/new_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/new_project.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/project_id.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/project_id.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/select_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/select_project.png
--------------------------------------------------------------------------------
/docs/_static/img/cloud/service_acct_permissions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/cloud/service_acct_permissions.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/cleanup_job.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cleanup_job.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/cluster_create_progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cluster_create_progress.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/cluster_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/cluster_dashboard.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/job_logs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/job_logs.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/node_pool_autoprovision.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/node_pool_autoprovision.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/pod_events.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pod_events.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/pre_job_details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pre_job_details.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/pre_job_submission.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/pre_job_submission.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/stackdriver_logs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/stackdriver_logs.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/unschedulable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/unschedulable.png
--------------------------------------------------------------------------------
/docs/_static/img/gke/unschedulable_details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/docs/_static/img/gke/unschedulable_details.png
--------------------------------------------------------------------------------
/docs/cli/caliban_build.rst:
--------------------------------------------------------------------------------
1 | caliban build
2 | ^^^^^^^^^^^^^
3 |
4 | This command builds the Docker image used in :doc:`caliban_run`,
5 | :doc:`caliban_cloud` and friends, without actually executing the container or
6 | submitting it remotely.
7 |
8 | ``caliban build`` supports the following arguments:
9 |
10 | .. code-block:: text
11 |
12 | usage: caliban build [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY]
13 | [--extras EXTRAS] [--no_cache] [-d DIR]
14 | module
15 |
16 | positional arguments:
17 | module Code to execute, in either 'trainer.train' or
18 | 'trainer/train.py' format. Accepts python scripts,
19 | modules or a path to an arbitrary script.
20 |
21 | optional arguments:
22 | -h, --help show this help message and exit
23 | --helpfull show full help message and exit
24 | --nogpu Disable GPU mode and force CPU-only.
25 | --cloud_key CLOUD_KEY
26 | Path to GCloud service account key. (Defaults to
27 | $GOOGLE_APPLICATION_CREDENTIALS.)
28 | --extras EXTRAS setup.py dependency keys.
29 | --no_cache Disable Docker's caching mechanism and force a
30 | rebuild of the container from scratch.
31 | -d DIR, --dir DIR Extra directories to include. List these from large to
32 | small to take full advantage of Docker's build cache.
33 |
--------------------------------------------------------------------------------
/docs/cli/caliban_notebook.rst:
--------------------------------------------------------------------------------
1 | caliban notebook
2 | ^^^^^^^^^^^^^^^^
3 |
4 | This command generates the same isolated environment as the other commands, but
5 | instead of running your code or dropping you into a shell, runs a local instance
6 | of Jupyter based in the folder where you execute the command.
7 |
8 | ``caliban notebook`` supports the following arguments:
9 |
10 | .. code-block:: text
11 |
12 | usage: caliban notebook [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY]
13 | [--extras EXTRAS] [--docker_run_args DOCKER_RUN_ARGS]
14 | [-p PORT] [-jv JUPYTER_VERSION] [--lab] [--bare]
15 |
16 | optional arguments:
17 | -h, --help show this help message and exit
18 | --helpfull show full help message and exit
19 | --nogpu Disable GPU mode and force CPU-only.
20 | --cloud_key CLOUD_KEY
21 | Path to GCloud service account key. (Defaults to
22 | $GOOGLE_APPLICATION_CREDENTIALS.)
23 | --extras EXTRAS setup.py dependency keys.
24 | --docker_run_args DOCKER_RUN_ARGS
25 | String of args to add to Docker.
26 | -p PORT, --port PORT Port to use for Jupyter, inside container and locally.
27 | -jv JUPYTER_VERSION, --jupyter_version JUPYTER_VERSION
28 | Jupyterlab version to install via pip.
29 | --lab run 'jupyter lab', vs the default 'jupyter notebook'.
30 | --bare Skip mounting the $HOME directory; run an isolated
31 | Jupyter lab.
32 |
33 | By default ``caliban notebook`` runs ``jupyter notebook`` inside the container. To
34 | run Jupyterlab, pass the ``--lab`` flag:
35 |
36 | .. code-block:: bash
37 |
38 | caliban notebook --lab
39 |
40 | As with the other commands, the only python dependencies available in the
41 | container will be dependencies that you declare explicitly in either:
42 |
43 |
44 | * a ``requirements.txt`` file
45 | * a ``setup.py`` file.
46 |
47 | Your setup file can declare groups of dependencies using the setuptools
48 | ``extras_require`` feature.
49 |
50 | (See the :doc:`../explore/declaring_requirements` docs for more detail
51 | on how to use ``extras_require`` to create separate environments for GPU and
52 | CPU.)
53 |
54 | Mounted Home Directory
55 | ~~~~~~~~~~~~~~~~~~~~~~
56 |
57 | ``caliban notebook`` mounts your ``$HOME`` directory into the container, which
58 | allows your Jupyter settings to persist across sessions. If you don't want this
59 | for some reason, run the command with the ``--bare`` flag.
60 |
61 | Custom Jupyter Port
62 | ~~~~~~~~~~~~~~~~~~~
63 |
64 | If you'd like to run ``notebook`` using a different port, use the ``--port`` option:
65 |
66 | .. code-block:: bash
67 |
68 | caliban notebook --lab --port 8889
69 |
70 | On the Mac you'll have to pass ``--nogpu`` to ``notebook``\ , as the NVIDIA runtime
71 | isn't supported on non-Linux machines.
72 |
--------------------------------------------------------------------------------
/docs/cli/caliban_resubmit.rst:
--------------------------------------------------------------------------------
1 | caliban resubmit
2 | ^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Often one needs to re-run an experiment after making code changes, or to run the
5 | same code with a different random seed. Caliban supports this with its
6 | ``resubmit`` command.
7 |
8 | This command allows you to resubmit jobs in an experiment group without having
9 | to remember or re-enter all of the parameters for your experiments. For example,
10 | suppose you run a set of experiments in an experiment group on CAIP:
11 |
12 | .. code-block::
13 |
14 | caliban cloud --xgroup resubmit_test --nogpu --experiment_config experiment.json cpu.py -- --foo 3
15 |
16 | You then realize that you made a coding error, causing some of your jobs to
17 | fail:
18 |
19 | .. code-block::
20 |
21 | $ caliban status --xgroup resubmit_test
22 | xgroup resubmit_test:
23 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None
24 | experiment id 37: cpu.py --foo 3 --sleep 2
25 | job 69 SUCCEEDED CAIP 2020-05-29 10:53:41 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105340_2
26 | experiment id 38: cpu.py --foo 3 --sleep 1
27 | job 68 FAILED CAIP 2020-05-29 10:53:40 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105338_1
28 |
29 | You then go and modify your code, and now you can use the ``resubmit`` command to
30 | run the jobs that failed:
31 |
32 | .. code-block::
33 |
34 | $ caliban resubmit --xgroup resubmit_test
35 | the following jobs would be resubmitted:
36 | cpu.py --foo 3 --sleep 1
37 | job 68 FAILED CAIP 2020-05-29 10:53:40 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105338_1
38 |
39 | do you wish to resubmit these 1 jobs? [yN]: y
40 | rebuilding containers...
41 | ...
42 | Submitting request!
43 | ...
44 |
45 | Checking back in with ``caliban status`` shows that the code change worked, and
46 | now all of the experiments in the group have succeeded, and you can see that the
47 | container hash has changed for the previously failed jobs, reflecting your code
48 | change:
49 |
50 | .. code-block::
51 |
52 | $ caliban status --xgroup resubmit_test
53 | xgroup resubmit_test:
54 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None
55 | experiment id 37: cpu.py --foo 3 --sleep 2
56 | job 69 SUCCEEDED CAIP 2020-05-29 10:53:41 container: gcr.io/totoro-project/cffd1475aaca:latest name: caliban_totoro_20200529_105340_2
57 | experiment id 38: cpu.py --foo 3 --sleep 1
58 | job 70 SUCCEEDED CAIP 2020-05-29 11:03:01 container: gcr.io/totoro-project/81b2087b5026:latest name: caliban_totoro_20200529_110259_1
59 |
60 | The ``resubmit`` command supports the following arguments:
61 |
62 | .. code-block::
63 |
64 | $ caliban resubmit --help
65 | usage: caliban resubmit [-h] [--helpfull] [--xgroup XGROUP] [--dry_run] [--all_jobs] [--project_id PROJECT_ID] [--cloud_key CLOUD_KEY]
66 |
67 | optional arguments:
68 | -h, --help show this help message and exit
69 | --helpfull show full help message and exit
70 | --xgroup XGROUP experiment group
71 | --dry_run Don't actually submit; log everything that's going to happen.
72 | --all_jobs resubmit all jobs regardless of current state, otherwise only jobs that are in FAILED or STOPPED state will be resubmitted
73 | --project_id PROJECT_ID
74 | ID of the GCloud AI Platform/GKE project to use for Cloud job submission and image persistence. (Defaults to $PROJECT_ID; errors if both the argument and $PROJECT_ID are empty.)
75 | --cloud_key CLOUD_KEY
76 | Path to GCloud service account key. (Defaults to $GOOGLE_APPLICATION_CREDENTIALS.)
77 |
--------------------------------------------------------------------------------
/docs/cli/caliban_shell.rst:
--------------------------------------------------------------------------------
1 | caliban shell
2 | ^^^^^^^^^^^^^
3 |
4 | This command is designed for fast, iterative workflows on scripts in an
5 | environment that's guaranteed to match the environment available to your code on
6 | Cloud.
7 |
8 | ``caliban shell`` supports the following arguments:
9 |
10 | .. code-block:: text
11 |
12 | usage: caliban shell [-h] [--helpfull] [--nogpu] [--cloud_key CLOUD_KEY]
13 | [--extras EXTRAS] [--image_id IMAGE_ID]
14 | [--docker_run_args DOCKER_RUN_ARGS] [--shell {bash,zsh}]
15 | [--bare]
16 |
17 | optional arguments:
18 | -h, --help show this help message and exit
19 | --helpfull show full help message and exit
20 | --nogpu Disable GPU mode and force CPU-only.
21 | --cloud_key CLOUD_KEY
22 | Path to GCloud service account key. (Defaults to
23 | $GOOGLE_APPLICATION_CREDENTIALS.)
24 | --extras EXTRAS setup.py dependency keys.
25 | --image_id IMAGE_ID Docker image ID accessible in the local Docker
26 | registry. If supplied, Caliban will skip the 'docker
27 | build' step and use this image.
28 | --docker_run_args DOCKER_RUN_ARGS
29 | String of args to add to Docker.
30 | --shell {bash,zsh} This argument sets the shell used inside the container
31 | to one of Caliban's supported shells. Defaults to the
32 | shell specified by the $SHELL environment variable, or
33 | 'bash' if your shell isn't supported.
34 | --bare Skip mounting the $HOME directory; load a bare shell.
35 |
36 | Running ``caliban shell`` in any directory will generate a Docker image
37 | containing the minimal environment necessary to execute Python ML workflows and
38 | drop you into an interactive shell inside of that image.
39 |
40 | Caliban will copy in your Cloud credentials and set the required
41 | ``$GOOGLE_APPLICATION_CREDENTIALS`` env variable, so all Cloud interaction from
42 | Python should Just Work. (See the :doc:`guide on gcloud authentication
43 | <../explore/gcloud>` for more detail.)
44 |
45 | The base Caliban images also have ``gcloud`` installed; all ``gcloud`` and ``gsutil``
46 | commands will work with the same permissions granted to the key found at
47 | ``$GOOGLE_APPLICATION_CREDENTIALS``.
48 |
49 | .. NOTE:: If you run ``caliban shell --bare``\ , your gcloud and gsutil will
50 | have the same permissions that they'll have in the cloud - the permissions
51 | granted by your JSON key file. If you just run ``caliban shell``\ , which
52 | mounts your home directory, ``gcloud`` and ``gsutil`` will preferentially
53 | load the config you have on your local machine.
54 |
55 | The only python dependencies available in the container will be dependencies
56 | that you declare explicitly in either:
57 |
58 |
59 | * a ``requirements.txt`` file
60 | * a ``setup.py`` file.
61 |
62 | Your setup file can declare groups of dependencies using the setuptools
63 | ``extras_require`` feature.
64 |
65 | (See the :doc:`../explore/declaring_requirements` docs for more detail
66 | on how to use ``extras_require`` to create separate environments for GPU and
67 | CPU.)
68 |
69 | By default your home directory will mount into the container, along with the
70 | folder you're in when you run ``caliban shell``. This means that:
71 |
72 |
73 | * your default ``bash`` (or ``zsh``\ ) environment will be available to you at the
74 | ``caliban shell``.
75 | * Any changes you make to files in the mounted directory will be immediately
76 | available to you to run with, say, ``python -m trainer.train`` or some similar
77 | command.
78 |
79 | On the Mac you'll have to pass ``--nogpu`` to ``shell``\ , as the NVIDIA runtime isn't
80 | supported on non-Linux machines. If you forget, ``caliban`` will remind you and
81 | prevent you from getting too far.
82 |
83 | .. NOTE:: Caliban currently supports ``bash`` and ``zsh`` shells. The command
84 | will use your ``$SHELL`` environment variable to pick a default; to override
85 | the default, you can always pass the ``--shell`` argument, like this:
86 | ``caliban shell --shell bash``.
87 |
88 | .. WARNING:: Because your home directory mounts into the container, some
89 | binaries from your ``$HOME`` directory might leak into the container. For
90 | example, we have seen a case in which, in trying to run a CUDA binary to
91 | communicate with the GPU, ``caliban shell`` called a binary from the home
92 | directory rather than the one the container should have used. You can
93 | mitigate this issue simply by using the ``--bare`` option, which will not
94 | mount the ``$HOME`` directory inside the container.
95 |
--------------------------------------------------------------------------------
/docs/cli/caliban_status.rst:
--------------------------------------------------------------------------------
1 | caliban status
2 | ^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | The ``caliban status`` command allows you to check on the status of jobs submitted
5 | via caliban. There are two primary modes for this command. The first returns
6 | your most recent job submissions across all experiment groups:
7 |
8 | .. code-block::
9 |
10 | $ caliban status --max_jobs 5
11 | most recent 5 jobs for user totoro:
12 |
13 | xgroup totoro-xgroup-2020-05-28-11-33-35:
14 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None
15 | experiment id 28: cpu.py --foo 3 --sleep 2
16 | job 56 STOPPED GKE 2020-05-28 11:33:35 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-rssqq
17 | experiment id 29: cpu.py --foo 3 --sleep 600
18 | job 57 STOPPED GKE 2020-05-28 11:33:36 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-c5x6v
19 |
20 | xgroup totoro-xgroup-2020-05-28-11-40-52:
21 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None
22 | experiment id 30: cpu.py --foo 3 --sleep -1
23 | job 58 STOPPED CAIP 2020-05-28 11:40:54 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114052_1
24 | experiment id 31: cpu.py --foo 3 --sleep 2
25 | job 59 STOPPED CAIP 2020-05-28 11:40:55 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114054_2
26 | experiment id 32: cpu.py --foo 3 --sleep 600
27 | job 60 RUNNING CAIP 2020-05-28 11:40:56 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: caliban_totoro_20200528_114055_3
28 |
29 | Here we can see five jobs that we recently submitted, in two experiment groups.
30 | The first experiment group has jobs submitted to GKE, while the second has jobs
31 | submitted to CAIP. You can specify the maximum number of jobs to return using
32 | the ``--max_jobs`` flag.
33 |
34 | The second mode for the ``caliban status`` command returns jobs in a given
35 | experiment group, using the ``--xgroup`` flag:
36 |
37 | .. code-block::
38 |
39 | $ caliban status --xgroup xg2 --max_jobs 2
40 | xgroup xg2:
41 | docker config 1: job_mode: CPU, build url: ~/sw/cluster/caliban/tmp/cpu, extra dirs: None
42 | experiment id 1: cpu.py --foo 3 --sleep -1
43 | job 34 FAILED CAIP 2020-05-08 18:26:56 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_1_20200508_182654
44 | job 37 FAILED CAIP 2020-05-08 19:01:08 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_1_20200508_190107
45 | experiment id 2: cpu.py --foo 3 --sleep 2
46 | job 30 SUCCEEDED LOCAL 2020-05-08 09:59:04 container: e2a0b8fca1dc
47 | job 35 SUCCEEDED CAIP 2020-05-08 18:26:57 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_2_20200508_182656
48 | experiment id 5: cpu.py --foo 3 --sleep 600
49 | job 36 STOPPED CAIP 2020-05-08 18:26:58 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_3_20200508_182657
50 | job 38 SUCCEEDED CAIP 2020-05-08 19:01:09 container: gcr.io/totoro-project/e2a0b8fca1dc:latest name: caliban_totoro_3_20200508_190108
51 |
52 | Here we can see the jobs that have been submitted as part of the ``xg2``
53 | experiment group. By specifying ``--max_jobs 2`` in the call, we can see the two
54 | most recent job submissions for each experiment in the group. In this case, we
55 | can see that experiment 2 was submitted both locally and to CAIP at different
56 | times. We can also see that experiment 1 failed (due to an invalid parameter),
57 | and that the first submission of experiment 5 to CAIP was stopped by the user.
58 |
59 | Another interesting thing to note here is that the container hash is the same
60 | for each of these job submissions, so we can tell that the underlying code did
61 | not change between submissions.
62 |
63 | This command supports the following arguments:
64 |
65 | .. code-block::
66 |
67 | $ caliban status --help
68 | usage: caliban status [-h] [--helpfull] [--xgroup XGROUP]
69 | [--max_jobs MAX_JOBS]
70 |
71 | optional arguments:
72 | -h, --help show this help message and exit
73 | --helpfull show full help message and exit
74 | --xgroup XGROUP experiment group
75 | --max_jobs MAX_JOBS Maximum number of jobs to view. If you specify an
76 | experiment group, then this specifies the maximum
77 | number of jobs per experiment to view. If you do not
78 | specify an experiment group, then this specifies the
79 | total number of jobs to return, ordered by creation
80 | date, or all jobs if max_jobs==0.
81 |
--------------------------------------------------------------------------------
/docs/cli/caliban_stop.rst:
--------------------------------------------------------------------------------
1 | caliban stop
2 | ^^^^^^^^^^^^^^^^^^^^
3 |
4 | This command allows you to stop running jobs submitted using caliban.
5 |
6 | For example, suppose you submit a group of experiments to GKE using an
7 | experiment config file like the following:
8 |
9 | .. code-block::
10 |
11 | $ caliban cluster job submit --xgroup my-xgroup ... --experiment_config exp.json cpu.py --
12 |
13 | After a bit, you realize that you made a coding error, so you'd like to stop
14 | these jobs so that you can fix your error without wasting cloud resources (and
15 | money). The ``caliban stop`` command makes this relatively simple:
16 |
17 | .. code-block::
18 |
19 | $ caliban stop --xgroup my-xgroup
20 | the following jobs would be stopped:
21 | cpu.py --foo 3 --sleep -1
22 | job 61 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-57pr9
23 | cpu.py --foo 3 --sleep 2
24 | job 62 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-s67jt
25 | cpu.py --foo 3 --sleep 600
26 | job 63 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-gg9zm
27 |
28 | do you wish to stop these 3 jobs? [yN]: y
29 |
30 | stopping job: 61 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-57pr9
31 | stopping job: 62 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-s67jt
32 | stopping job: 63 RUNNING GKE 2020-05-28 11:55:04 container: gcr.io/totoro-project/0f6d8a3ddbee:latest name: job-stop-test-gg9zm
33 |
34 | requested job cancellation, please be patient as it may take a short while for this status change to be reflected in the gcp dashboard or from the `caliban status` command.
35 |
36 | This command stops all jobs that are in a ``RUNNING`` or ``SUBMITTED`` state,
37 | and checks with you to make sure this is what you *really* intend, as
38 | accidentally stopping a job that has been running for days is a particularly
39 | painful experience if your checkpointing is less than perfect. As with other
40 | caliban commands, you can use the ``--dry_run`` flag to just print what jobs
41 | would be stopped.
42 |
43 | This command supports the following arguments:
44 |
45 | .. code-block::
46 |
47 | $ caliban stop --help
48 | usage: caliban stop [-h] [--helpfull] [--xgroup XGROUP] [--dry_run]
49 |
50 | optional arguments:
51 | -h, --help show this help message and exit
52 | --helpfull show full help message and exit
53 | --xgroup XGROUP experiment group
54 | --dry_run Don't actually submit; log everything that's going to
55 | happen.
56 |
--------------------------------------------------------------------------------
/docs/cloud/adc.rst:
--------------------------------------------------------------------------------
1 | Application Default Credentials
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Instead of a service account key, you might also generate "Application Default
5 | Credentials" on your machine.
6 |
7 | To install these on your workstation, run
8 |
9 | .. code-block:: bash
10 |
11 | gcloud auth application-default login
12 |
13 | at your terminal, as described in `these gcloud docs
14 | <https://cloud.google.com/sdk/gcloud/reference/auth/application-default/login>`_.
15 | That's it!
16 |
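17 | Once these credentials are installed, Google client libraries discover them
18 | automatically. Here's a minimal sketch to confirm they're visible from
19 | Python (assumes the ``google-auth`` package is installed):
20 |
21 | .. code-block:: python
22 |
23 |    import google.auth
24 |
25 |    # Loads Application Default Credentials from the well-known location.
26 |    credentials, project = google.auth.default()
27 |    print(project)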
--------------------------------------------------------------------------------
/docs/cloud/ai_platform_tpu.rst:
--------------------------------------------------------------------------------
1 | TPUs on AI Platform
2 | ^^^^^^^^^^^^^^^^^^^
3 |
4 | .. NOTE:: This documentation is currently quite sparse; expect a tutorial soon.
5 |
6 | .. IMPORTANT:: Unlike TPUs used elsewhere on Cloud, TPUs on AI Platform only
7 | support (as of Dec 2019) Tensorflow versions 1.13 and 1.14. No JAX, no Pytorch.
8 |
9 | Caliban has Tensorflow version 2.1 hardcoded internally. Once the range of
10 | possible values expands we'll make this customizable.
11 |
12 | See AI Platform's runtime version list
13 | in the Cloud documentation for more
14 | detail.
15 |
16 |
17 | If you supply the ``--tpu_spec NUM_TPUSxTPU_TYPE`` argument to your ``caliban
18 | cloud`` job, AI Platform will configure a worker node with that number of TPUs
19 | and attach it to the master node where your code runs.
20 |
21 | ``--tpu_spec`` is compatible with ``--gpu_spec``\ ; the latter configures the master
22 | node where your code lives, while the former sets up a separate worker instance.
23 |
24 | CPU mode by Default
25 | ~~~~~~~~~~~~~~~~~~~
26 |
27 | Normally, all jobs default to GPU mode unless you supply ``--nogpu`` explicitly.
28 | This default flips when you supply a ``--tpu_spec`` and no explicit ``--gpu_spec``.
29 | In that case, ``caliban cloud`` will NOT attach a default GPU to your master
30 | instance. You have to ask for it explicitly.
31 |
32 | A CPU mode default also means that by default Caliban will try to install the
33 | ``'cpu'`` extra dependency set in your ``setup.py``\ , as described in the
34 | :doc:`../explore/declaring_requirements` guide.
35 |
36 | Authorizing TPU Access
37 | ~~~~~~~~~~~~~~~~~~~~~~
38 |
39 | Before you can pass ``--tpu_spec`` to a job you'll need to authorize your Cloud
40 | TPU to access your service account. Check out the AI Platform TPU tutorial
41 | in the Cloud documentation
42 | for detailed steps on how to achieve this.
43 |
44 | Example Workflows
45 | ~~~~~~~~~~~~~~~~~
46 |
47 | Next you'll need to get the repository of TPU examples on your machine.
48 |
49 | .. code-block:: bash
50 |
51 | mkdir tpu-demos && cd tpu-demos
52 | curl https://codeload.github.com/tensorflow/tpu/tar.gz/r1.14 -o r1.14.tar.gz
53 | tar -xzvf r1.14.tar.gz && rm r1.14.tar.gz
54 |
55 | Check out the
56 | AI Platform TPU tutorial in the Cloud documentation
57 | for the next steps, and check back for more detail about how to use that
58 | tutorial with Caliban.
59 |
--------------------------------------------------------------------------------
/docs/cloud/bucket.rst:
--------------------------------------------------------------------------------
1 | Creating a Bucket
2 | ^^^^^^^^^^^^^^^^^
3 |
4 | If you need to store data that you generate during a :doc:`../cli/caliban_cloud`
5 | run, storing data in a Cloud bucket is the easiest choice.
6 |
7 | Your bucket is a reserved "folder" on the Cloud filesystem; you'll use this to
8 | save models and measurements, and as a staging ground for model workflows you're
9 | submitting to Cloud.
10 |
11 | To create your bucket, add the following lines to your ``~/.bashrc`` file:
12 |
13 | .. code-block:: bash
14 |
15 | export BUCKET_NAME="totoro_bucket"
16 | export REGION="us-central1"
17 |
18 | Run ``source ~/.bashrc`` to pick up the changes, then run the following command
19 | to create your new bucket:
20 |
21 | .. code-block:: bash
22 |
23 | gsutil mb -l $REGION gs://$BUCKET_NAME
24 |
25 | That's it.
26 |
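27 | To confirm from Python that the bucket is writable, here's a minimal sketch
28 | (assumes the ``google-cloud-storage`` package; the blob name is arbitrary):
29 |
30 | .. code-block:: python
31 |
32 |    import os
33 |    from google.cloud import storage
34 |
35 |    # Upload a tiny object to verify write access to the new bucket.
36 |    bucket = storage.Client().bucket(os.environ["BUCKET_NAME"])
37 |    bucket.blob("smoke-test.txt").upload_from_string("hello from caliban")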
--------------------------------------------------------------------------------
/docs/cloud/gpu_specs.rst:
--------------------------------------------------------------------------------
1 | Customizing Machines and GPUs
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | This section discusses the default configurations for accelerators and machine
5 | types that Caliban requests when it submits jobs to Cloud. You'll also find
6 | instructions on how to request different GPUs or machine types for your job.
7 |
8 | Default GPU and Machine Types
9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 |
11 | By default, if you don't supply ``--gpu_spec`` or ``--machine_type`` (both discussed
12 | below), Caliban will configure your jobs on the following hardware for each
13 | mode:
14 |
15 |
16 | * GPU mode (default): a single P100 GPU on an ``n1-standard-8`` machine
17 | * CPU mode: an ``n1-highcpu-32`` machine with no GPU attached
18 |
19 | You can read more about the various machine types available on AI Platform
20 | in the Cloud documentation on machine types, or scan the
21 | following sections.
22 |
23 |
24 | Custom GPU Specs
25 | ~~~~~~~~~~~~~~~~
26 |
27 | The optional ``--gpu_spec`` argument allows you to attach a custom number and type
28 | of GPU to the Cloud node that will run your containerized job on AI Platform.
29 | The required format is ``GPU_COUNTxGPU_TYPE``\ , as in this example:
30 |
31 | .. code-block:: bash
32 |
33 | caliban cloud --gpu_spec 2xV100 trainer.train
34 |
35 | This will submit your job to a machine configured with 2 V100 GPUs, in the
36 | region you specify via:
37 |
38 |
39 | * your ``$REGION`` environment variable,
40 | * the ``--region`` CLI argument
41 | * or, in the absence of either of those, the safe default of ``us-central1``.
42 |
43 | When you run any ``caliban cloud`` command, the program will immediately validate
44 | that the combination of GPU count, region, GPU type and machine type are
45 | compatible and error quickly if they're not. If you make the impossible request
46 | for 3 V100 GPUs:
47 |
48 | .. code-block:: bash
49 |
50 | caliban cloud --gpu_spec 3xV100 trainer.train
51 |
52 | you'll see this error message:
53 |
54 | .. code-block::
55 |
56 | caliban cloud: error: argument --gpu_spec: 3 GPUs of type V100 aren't available
57 | for any machine type. Try one of the following counts: {1, 2, 4, 8}
58 |
59 | For more help, consult this page for valid combinations of GPU count, GPU type
60 | and machine type: https://cloud.google.com/ml-engine/docs/using-gpus
61 |
62 | If you ask for a valid count, but a count that's not possible on the machine
63 | type you specified - 2 V100s on an ``n1-standard-96`` machine, for example:
64 |
65 | .. code-block:: bash
66 |
67 | caliban cloud --gpu_spec 2xV100 --machine_type n1-standard-96 trainer.train
68 |
69 | You'll see this error:
70 |
71 | .. code-block::
72 |
73 | 'n1-standard-96' isn't a valid machine type for 2 V100 GPUs.
74 |
75 | Try one of these: ['n1-highcpu-16', 'n1-highmem-16', 'n1-highmem-2',
76 | 'n1-highmem-4', 'n1-highmem-8', 'n1-standard-16', 'n1-standard-4', 'n1-standard-8']
77 |
78 | For more help, consult this page for valid combinations of GPU count, GPU type
79 | and machine type: https://cloud.google.com/ml-engine/docs/using-gpus
80 |
81 | If you know that your combination is correct, but Caliban's internal
82 | compatibility table hasn't been updated to support some new combination, you can
83 | skip all of these validations by providing ``--force`` as an option.
84 |
85 | Custom Machine Types
86 | ~~~~~~~~~~~~~~~~~~~~
87 |
88 | The ``--machine_type`` option allows you to specify a custom node type for the
89 | master node where your containerized job will run. ``caliban cloud --help`` will
90 | show you all available choices. You can also read about the various machine
91 | types available on AI Platform
92 | in the Cloud documentation on machine types.
93 |
94 | As an example, the following command will configure your job to run on an
95 | ``n1-highcpu-96`` instance with 8 V100 GPUs attached:
96 |
97 | .. code-block:: bash
98 |
99 | caliban cloud --gpu_spec 8xV100 --machine_type n1-highcpu-96 trainer.train
100 |
101 | As described above in :ref:`Custom GPU Specs`, ``--machine_type`` works with
102 | ``--gpu_spec`` to validate that the combination of GPU count, GPU type and
103 | machine type is valid, returning an error immediately if the combination
104 | is invalid.
105 |
--------------------------------------------------------------------------------
/docs/cloud/labels.rst:
--------------------------------------------------------------------------------
1 | Job Labels
2 | ^^^^^^^^^^
3 |
4 | AI Platform provides you with the ability to label your jobs with key-value
5 | pairs. Any arguments you provide using either :doc:`custom script arguments
6 | <../explore/custom_script_args>` or an :doc:`experiment broadcast
7 | <../explore/experiment_broadcasting>` will be added to your job as labels.
8 |
9 | In addition to your arguments, Caliban will add these labels to each job:
11 |
12 |
13 | * **job_name**: ``caliban_totoro`` by default, or the argument you pass
14 | using ``caliban cloud --name custom_name``
15 | * **gpu_enabled**\ : ``true`` by default, or ``false`` if you ran your job with
16 | ``--nogpu``
17 |
18 | Cloud has fairly strict requirements on the format of each label's key and
19 | value; Caliban will transform your arguments into labels with the proper
20 | formatting, so you don't have to think about these.
21 |
22 | Additional Custom Labels
23 | ~~~~~~~~~~~~~~~~~~~~~~~~
24 |
25 | You can also pass extra custom labels using ``-l`` or ``--label``\ :
26 |
27 | .. code-block:: bash
28 |
29 | caliban cloud -l key:value --label another_k:my_value ...
30 |
31 | These labels will be applied to every job if you're running an :doc:`experiment
32 | broadcast <../explore/experiment_broadcasting>`, or to the single job you're
33 | submitting otherwise.
34 |
35 | If you provide a label that conflicts with a user argument or experiment flag,
36 | your label will be overridden by the argument's value.
37 |
38 | .. NOTE:: periods aren't allowed in labels, but are often quite meaningful;
39 | because of this caliban replaces periods with underscores before stripping
40 | out any restricted characters.
41 |
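As a sketch of that transformation (assuming only the period rule from the note
above; the full set of restricted-character rules lives in Caliban's label
formatting code):

.. code-block:: bash

   caliban cloud -l version:1.2.3 trainer.train
   # => the job carries the label: version = 1_2_3
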
--------------------------------------------------------------------------------
/docs/cloud/rate_limit.rst:
--------------------------------------------------------------------------------
1 | Rate Limiting
2 | ^^^^^^^^^^^^^
3 |
4 | ``caliban cloud`` relies on AI Platform for rate limiting, so you can submit many,
5 | many jobs using an ``--experiment_config`` (up to ~1500 total, I believe?) and AI
6 | Platform will throttle submissions to the default limit of 60 submissions per
7 | minute. If your project's been granted higher quotas, you won't be throttled
8 | until you hit your project's rate limit.
9 |
10 | Job submission on Cloud presents a nice progress bar, with terminal colors and
11 | more. The log commands, URLs, jobIds and custom arguments are highlighted so
12 | it's clear which jobs are going through. On a failure the error message prints
13 | in red.
14 |
--------------------------------------------------------------------------------
/docs/cloud/service_account.rst:
--------------------------------------------------------------------------------
1 | Creating a Service Account Key
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | This page describes how to generate and install a `Service Account Key
5 | <https://cloud.google.com/iam/docs/creating-managing-service-account-keys>`_.
6 | A service account key is a sort of "passport" that your code can use to
7 | authenticate itself during communication with Google's Cloud services.
8 |
9 | You can also provide Caliban with a service account key via the ``--cloud_key``
10 | flag. If you do, Caliban will use this service account to authenticate itself
11 | with AI Platform when submitting jobs. (You would do this if you wanted to
12 | submit to some project you didn't own, for example.)
13 |
14 | To create a service account key, visit the `Service Accounts page
15 | <https://console.cloud.google.com/iam-admin/serviceaccounts>`_
16 | and select the project you created earlier.
17 |
18 | Click "Create Service Account" at the top of the page:
19 |
20 | .. image:: /_static/img/cloud/activate.png
21 | :width: 600
22 | :align: center
23 | :alt: Activate Billing
24 |
25 | At the next form, under **"Service Account Name"**, type something like
26 | **totoro_key** and click **"Create"**.
27 |
28 | This will bring up a page titled **"Service Account Permissions"**. Select
29 | **Project > Owner** from the list:
30 |
31 | .. image:: /_static/img/cloud/service_acct_permissions.png
32 | :width: 600
33 | :align: center
34 | :alt: Service Account Permissions
35 |
36 | Then click **"Continue"** and **"Done"**. You now have a service account. You'll
37 | need to download it to your machine for Caliban to use it.
38 |
39 | Downloading the Service Account Key
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 |
42 | Click on the hyperlinked name of the key - something like
43 | ``totoro-key@totoro-lives.iam.gserviceaccount.com`` - in the service accounts
44 | list.
45 |
46 | Near the bottom of the page, click "Add Key" > "Create New Key":
47 |
48 | .. image:: /_static/img/cloud/create_new_key.png
49 | :width: 600
50 | :align: center
51 | :alt: Create New Key
52 |
53 | Select **"JSON"** for key type and click **"Create"**. This will download a file
54 | with a name like ``totoro-lives-3df07b8c97a0.json`` to your machine.
55 |
56 | Find the file in your terminal (probably in your Downloads folder) and run the
57 | following command to move it to a more permanent, easy-to-find location:
58 |
59 | .. code-block:: bash
60 |
61 | mv [NEW_FILENAME].json ~/.config/service_key.json
62 |
63 | To make this key accessible to Caliban, you'll need to set a variable called
64 | ``GOOGLE_APPLICATION_CREDENTIALS`` in your shell to the path of your new service
65 | account key. Add the following line to your ``~/.bashrc``:
66 |
67 | .. code-block:: bash
68 |
69 | export GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/service_key.json
70 |
71 | If Caliban sees this environment variable set, it will go ahead and bake these
72 | credentials into your container, making them accessible to your code even inside
73 | the Docker environment.
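
To confirm the variable is visible in a fresh shell (a quick sanity check; the
path shown assumes the location used above):

.. code-block:: bash

   source ~/.bashrc
   echo $GOOGLE_APPLICATION_CREDENTIALS
   # => /home/totoro/.config/service_key.json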
74 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 | # -- Project information -----------------------------------------------------
18 |
19 | project = "Caliban"
20 | copyright = "2020, Google LLC"
21 | author = "The Caliban authors"
22 |
23 | # The short X.Y version
24 | version = ""
25 | # The full version, including alpha/beta/rc tags
26 | release = ""
27 |
28 | # -- General configuration ---------------------------------------------------
29 |
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 | "sphinx.ext.autodoc",
35 | "sphinx.ext.autosectionlabel",
36 | "sphinx.ext.autosummary",
37 | "sphinx.ext.intersphinx",
38 | "sphinx.ext.mathjax",
39 | "sphinx.ext.napoleon",
40 | "sphinx.ext.viewcode",
41 | ]
42 |
43 | intersphinx_mapping = {"python": ("https://docs.python.org/3/", None)}
44 |
45 | source_suffix = {".rst": "restructuredtext", ".txt": "restructuredtext"}
46 |
47 | # Add any paths that contain templates here, relative to this directory.
48 | templates_path = ["_templates"]
49 |
50 | # List of patterns, relative to source directory, that match files and
51 | # directories to ignore when looking for source files.
52 | # This pattern also affects html_static_path and html_extra_path.
53 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "requirements.txt"]
54 |
55 | # The name of the Pygments (syntax highlighting) style to use.
56 | pygments_style = None
57 | autosummary_generate = True
58 | napoleon_use_rtype = False
59 |
60 | mathjax_config = {
61 | "TeX": {"equationNumbers": {"autoNumber": "AMS", "useLabelIds": True}},
62 | }
63 |
64 | # -- Options for HTML output -------------------------------------------------
65 |
66 | # The theme to use for HTML and HTML Help pages. See the documentation for
67 | # a list of builtin themes.
68 | #
69 | html_theme = "sphinx_rtd_theme"
70 |
71 | # Theme options are theme-specific and customize the look and feel of a theme
72 | # further. For a list of options available for each theme, see the
73 | # documentation.
74 | html_theme_options = {
75 | "logo_only": True,
76 | }
77 |
78 | # Add any paths that contain custom static files (such as style sheets) here,
79 | # relative to this directory. They are copied after the builtin static files,
80 | # so a file named "default.css" will overwrite the builtin "default.css".
81 | html_static_path = ["_static"]
82 |
83 | htmlhelp_basename = "Calibandoc"
84 | epub_title = project
85 | epub_exclude_files = ["search.html"]
86 |
--------------------------------------------------------------------------------
/docs/explore/calibanconfig.rst:
--------------------------------------------------------------------------------
1 | calibanconfig
2 | ^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Caliban supports customization through a file called ``.calibanconfig.json``
5 | that lives in your project's directory. Features are limited for now, but stay
6 | tuned for more.
7 |
8 | Custom Apt Packages
9 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 |
11 | Caliban provides support for custom aptitude packages inside your container. To
12 | require custom apt packages, create a file called ``.calibanconfig.json`` inside
13 | your project's directory.
14 |
15 | The ``.calibanconfig.json`` file should contain a single JSON dictionary with an
16 | ``"apt_packages"`` key. The value under this key can be either a list, or a
17 | dictionary with ``"gpu"`` and ``"cpu"`` keys. For example, any of the following are
18 | valid:
19 |
20 | .. code-block::
21 |
22 | # This is a list by itself. Comments are fine, by the way.
23 | {
24 | "apt_packages": ["libsm6", "libxext6", "libxrender-dev"]
25 | }
26 |
27 | This works too:
28 |
29 | .. code-block::
30 |
31 | # You can also include a dictionary with different deps
32 | # for gpu and cpu modes. It's fine to leave either of these blank,
33 | # or not include it.
34 | {
35 | "apt_packages": {
36 | "gpu": ["libsm6", "libxext6", "libxrender-dev"],
37 | "cpu": ["some_other_package"]
38 | }
39 | }
40 |
41 | These values will do what you expect and run ``apt-get install <package>``
42 | for each package. Packages are alphabetized, so changing the order won't
43 | invalidate Docker's build cache.
44 |
45 | Custom Base Images
46 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
47 | For details on Caliban's base images, see :ref:`What's the Base Docker Image?`.
48 |
49 | You can specify a custom base image for Caliban to use in your ``.calibanconfig.json`` file
50 | by adding an entry with the ``base_image`` key as follows:
51 |
52 | .. code-block:: json
53 |
54 | {
55 | "base_image": "gcr.io/blueshift-playground/blueshift:gpu-ubuntu1804-py38-cuda101"
56 | }
57 |
58 | You can also specify different base images for ``cpu`` and ``gpu`` modes as follows:
59 |
60 | .. code-block:: json
61 |
62 | {
63 | "base_image": {
64 | "cpu": "gcr.io/blueshift-playground/blueshift:cpu-ubuntu1804-py38",
65 | "gpu": "gcr.io/blueshift-playground/blueshift:gpu-ubuntu1804-py38-cuda101"
66 | }
67 | }
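
Putting the two features together, a single ``.calibanconfig.json`` might look
like this (a sketch combining the examples above):

.. code-block::

   # Comments are fine here too.
   {
       "apt_packages": ["libsm6", "libxext6", "libxrender-dev"],
       "base_image": "gcr.io/blueshift-playground/blueshift:gpu-ubuntu1804-py38-cuda101"
   }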
68 |
--------------------------------------------------------------------------------
/docs/explore/custom_docker_run.rst:
--------------------------------------------------------------------------------
1 | Custom Docker Run Arguments
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | ``caliban {shell, notebook, run}`` all perform some combination of ``docker build``
5 | and ``docker run`` to provide their functionality. Each provides various sane
6 | defaults that should be fine for most use cases; sometimes, however, you might
7 | need to break through the ``caliban`` abstraction layer and pass arguments to
8 | ``docker run`` directly.
9 |
10 | One example would be if you need to set environment variables inside the
11 | container, or limit which GPUs are mounted into the container.
12 |
13 | To pass custom options to ``docker run``\ , use ``--docker_run_args``\ , like this:
14 |
15 | .. code-block:: bash
16 |
17 | caliban run --docker_run_args "--env MY_VARIABLE" trainer.train
18 |
19 | This particular command will set ``MY_VARIABLE`` inside the container to its
20 | current value in the shell where you run the above command, as described in the
21 | `docker run reference <https://docs.docker.com/engine/reference/run/>`_\ , which
22 | documents all of the possible options.
25 |
26 | This argument is available in ``caliban run``\ , ``caliban shell`` and ``caliban
27 | notebook``.
28 |
29 | You may see an error if you pass some flag or argument that ``caliban`` already
30 | supplies. Caliban prints the ``docker run`` command it executes on each
31 | invocation, so if you need full control you can always use ``docker run``
32 | directly.
33 |
--------------------------------------------------------------------------------
/docs/explore/custom_script_args.rst:
--------------------------------------------------------------------------------
1 | Custom Script Arguments
2 | ^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | In ``caliban run`` or ``caliban cloud`` modes, if you pass ``--`` to the CLI, Caliban
5 | will stop parsing commands and pass everything after ``--`` through to your
6 | script, untouched. If you run:
7 |
8 | .. code-block:: bash
9 |
10 | caliban cloud trainer.train -- --epochs 2 --job_dir my_directory
11 |
12 | Your script will execute inside the container environment with the following
13 | command:
14 |
15 | .. code-block:: bash
16 |
17 | python -m trainer.train --epochs 2 --job_dir my_directory
18 |
19 | This feature is compatible with :doc:`experiment_broadcasting` in ``cloud``,
20 | ``run`` or ``cluster`` mode; your static arguments are prepended to the argument
21 | list generated for each experiment in your experiment config.
22 |
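As a sketch of that combination, assuming a hypothetical ``experiment.json``
containing ``{"lr": [0.1, 0.01]}``\ :

.. code-block:: bash

   # submits two jobs; per the prepending rule above, each runs roughly
   #   python -m trainer.train --verbose --lr 0.1
   #   python -m trainer.train --verbose --lr 0.01
   caliban cloud --experiment_config experiment.json trainer.train -- --verbose
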
--------------------------------------------------------------------------------
/docs/explore/declaring_requirements.rst:
--------------------------------------------------------------------------------
1 | Declaring Requirements
2 | ^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | To use a Python library in your Caliban-based workflow you'll need to declare it
5 | in one of the following:
6 |
7 |
8 | * a ``requirements.txt`` file in the directory,
9 | * a ``setup.py`` file, or
10 | * both of these together.
11 |
12 | If you run any of the Caliban commands in a directory without these, your image
13 | will have access to bare Python alone with no dependencies.
14 |
15 | A ``requirements.txt`` file is the simplest way to get started. See the
16 | `pip docs <https://pip.pypa.io/en/stable/user_guide/#requirements-files>`_ for more
17 | information on the structure here. You've got ``git`` inside the container, so
18 | ``git`` dependencies will work fine.
19 |
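A minimal sketch of such a file (the package names here are only examples,
including a ``git`` dependency of the kind mentioned above):

.. code-block:: text

   absl-py
   numpy>=1.18
   git+https://github.com/google/caliban.git
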
20 | Setup.py and Extra Dependency Sets
21 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
22 |
23 | Declaring your dependencies in a ``setup.py`` file gives you the ability to
24 | declare different sets of dependencies for the different Caliban modes (CPU vs
25 | GPU), in addition to your own custom dependency sets.
26 |
27 | This solves the problem of depending on, say, ``tensorflow-gpu`` for a GPU job,
28 | and ``tensorflow`` for normal, CPU-only jobs, without having to modify your
29 | dependency file.
30 |
31 | Here's an example ``setup.py`` file:
32 |
33 | .. code-block:: python
34 |
35 | from setuptools import find_packages
36 | from setuptools import setup
37 |
38 | setup(
39 | name='hello-tensorflow',
40 | version='0.1',
41 | install_requires=['absl-py', 'google-cloud-storage'],
42 | extras_require={
43 | 'cpu': ['tensorflow==2.0.*'],
44 | 'gpu': ['tensorflow-gpu==2.0.*'],
45 | },
46 | packages=find_packages(),
47 | description='Hello Tensorflow setup file.')
48 |
49 | This project has two normal dependencies - ``'absl-py'`` for flags, and
50 | ``'google-cloud-storage'`` to interact with Cloud buckets.
51 |
52 | The ``setup.py`` file declares its Tensorflow dependencies in a dictionary under
53 | the ``extras_require`` key. If you're using pip, you would install dependencies
54 | from just ``install_requires`` by running
55 |
56 | .. code-block:: bash
57 |
58 | pip install .
59 |
60 | If you instead ran
61 |
62 | .. code-block:: bash
63 |
64 | pip install .[gpu]
65 |
66 | ``pip`` would install
67 |
68 |
69 | * the entries under ``install_requires``\ ,
70 | * AND, additionally, the entries under the ``'gpu'`` key of the ``extras_require``
71 | dictionary.
72 |
73 | By default, if you have a ``setup.py`` file in your directory, caliban will do the
74 | latter and attempt to install a ``'gpu'`` set of extras, like
75 |
76 | .. code-block::
77 |
78 | pip install .[gpu]
79 |
80 | If you pass ``--nogpu`` to any of the commands, Caliban will similarly attempt to
81 | run
82 |
83 | .. code-block::
84 |
85 | pip install .[cpu]
86 |
87 | If you don't declare these keys, don't worry. You'll see a warning that the
88 | extras dependencies didn't exist, and everything will proceed, no problem.
89 |
90 | If you have some other set of dependencies you want to install, you can pass
91 | ``--extras my_deps``\ , or ``-e my_deps``\ , to any of the caliban modes to install
92 | those in addition to the ``cpu`` or ``gpu`` dependency set.
93 |
94 | You can provide many sets, like this:
95 |
96 | .. code-block:: bash
97 |
98 | caliban cloud -e my_deps -e logging_extras trainer.train
99 |
100 | And Caliban will install the dependencies from all declared sets inside of the
101 | containerized environment.
102 |
--------------------------------------------------------------------------------
/docs/explore/exp_stdin.rst:
--------------------------------------------------------------------------------
1 | Experiment Config via stdin, pipes
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | In addition to passing an explicit JSON file to ``caliban cloud
5 | --experiment_config``\ , if you pass the string ``stdin`` as the flag's value
6 | ``caliban cloud`` will attempt to read the experiment config in off of ``stdin``.
7 |
8 | As an example, this command pipes in a config and also passes ``--dry_run`` to
9 | show the series of jobs that WILL be submitted when the ``--dry_run`` flag is
10 | removed:
11 |
12 | .. code-block:: bash
13 |
14 | cat experiment.json | caliban cloud --experiment_config stdin --dry_run trainer.train
15 |
16 | Because ``experiment.json`` is a file on disk, the above command is not that
17 | interesting; it's equivalent to running:
18 |
19 | .. code-block:: bash
20 |
21 | caliban cloud --experiment_config experiment.json --dry_run trainer.train
22 |
23 | Things get more interesting when you need to dynamically generate an experiment
24 | config.
25 |
26 | Imagine you've written some python script ``generate_config.py`` that builds up a
27 | list of complex, interdependent experiments. If you modify that script to print
28 | a ``json`` list of ``json`` dicts when executed, you can pipe the results of the
29 | script directly into ``caliban cloud``\ :
30 |
31 | .. code-block:: bash
32 |
33 | python generate_config.py --turing_award 'winning' | \
34 | caliban cloud --experiment_config stdin --dry_run trainer.train
35 |
36 | And see immediately (thanks to ``--dry_run``\ ) the list of jobs that would be
37 | executed on AI Platform with a real run.
38 |
39 |
40 | Experiment File Expansion and Pipes
41 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
42 |
43 | The :doc:`../cli/expansion` command described :doc:`above <../cli/expansion>`
44 | allows you to expand an experiment config into its component JSON objects.
45 | Because these are printed to ``stdout``\ , you can pipe them directly in to
46 | Caliban's commands, like this:
47 |
48 | .. code-block:: bash
49 |
50 | expansion experiment.json | caliban cloud --experiment_config stdin trainer.train
51 |
52 | You can also insert your own script into the middle of this pipeline. Imagine a
53 | script called ``my_script.py`` that:
54 |
55 |
56 | * reads a JSON list of experiments in via ``stdin``
57 | * modifies each entry by inserting a new key whose value is a function of one
58 | or more existing entries
59 | * prints the resulting JSON list back out to ``stdout``
60 |
61 | You could sequence these steps together like so:
62 |
63 | .. code-block:: bash
64 |
65 | expansion experiment.json | \
66 | my_script.py | \
67 | caliban cloud --experiment_config stdin --dry_run trainer.train
69 |
70 | If you supply ``--dry_run`` to caliban, as in the example above, caliban will
71 | print out all of the jobs that this particular command will kick off when you
72 | remove ``--dry_run``. This is a great way to generate complex experiments and test
73 | everything out before submitting your jobs.
74 |
--------------------------------------------------------------------------------
/docs/explore/experiment_groups.rst:
--------------------------------------------------------------------------------
1 | Experiment Groups
2 | ^^^^^^^^^^^^^^^^^
3 |
4 | Caliban supports grouping experiments into a collection called an *experiment
5 | group*. This allows you to do things like monitor all of the jobs in a given
6 | group, stop all running jobs in a group, or re-run all of the jobs in a group.
7 |
8 | Each of the caliban compute backends supports specifying an experiment group via
9 | the ``--xgroup`` flag:
10 |
11 | .. code-block::
12 |
13 | $ caliban run --xgroup my-xgroup ...
14 | $ caliban cloud --xgroup my-xgroup ...
15 | $ caliban cluster job submit --xgroup my-xgroup ...
16 |
17 | If you don't specify an experiment group when submitting jobs via caliban, a new
18 | experiment group will be generated for you, so you don't need to use groups if
19 | you don't want to; the auto-generated group stays entirely out of your way.
20 |
21 | You can add new jobs to an existing experiment group simply by specifying the
22 | same group on different caliban job submission calls:
23 |
24 | .. code-block::
25 |
26 | caliban cloud --xgroup my-xgroup ... foo.py --
27 | ...
28 | (some time later...)
29 | caliban cloud --xgroup my-xgroup ... bar.py --
30 |
31 | The experiment group ``my-xgroup`` will contain the experiments generated by both
32 | of the caliban calls, and you can then perform different operations on these as
33 | described in the sections below.
34 |
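As a sketch of those operations, using the ``caliban status``, ``caliban stop``
and ``caliban resubmit`` commands (see their CLI pages for the full flag sets):

.. code-block:: bash

   caliban status --xgroup my-xgroup    # summarize every job in the group
   caliban stop --xgroup my-xgroup      # stop any still-running jobs
   caliban resubmit --xgroup my-xgroup  # re-run the group's experiments
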
--------------------------------------------------------------------------------
/docs/explore/mac.rst:
--------------------------------------------------------------------------------
1 | Caliban on a Mac
2 | ^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | If you're developing on your MacBook, you'll be able to build GPU containers,
5 | but you won't be able to run them locally. You can still submit GPU jobs to AI
6 | Platform!
7 |
8 | To use Caliban's ``shell``\ , ``notebook`` and ``run`` modes on a Mac, you'll
9 | have to pass the ``--nogpu`` flag. If you don't, you'll see the following
10 | error:
11 |
12 | .. code-block:: text
13 |
14 | [totoro@totoro-macbookpro hello-tensorflow (master)]$ caliban run trainer.train
15 |
16 | 'caliban run' doesn't support GPU usage on Macs! Please pass --nogpu to use this command.
17 |
18 | (GPU mode is fine for 'caliban cloud' from a Mac; just nothing that runs locally.)
19 |
20 | The :doc:`../getting_started/prerequisites` page covers MacBook installation of
21 | Docker and other dependencies.
22 |
--------------------------------------------------------------------------------
/docs/explore/script_vs_module.rst:
--------------------------------------------------------------------------------
1 | What can Caliban Execute?
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Caliban's commands can run python files as modules or scripts. If you need more
5 | customization, you can run arbitrary shell scripts with Caliban.
6 |
7 | Script vs Module
8 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 |
10 | Inside the containerized environment, your Python script will run as a module or
11 | a script, depending on the format of the argument you supply to caliban. If you
12 | explicitly pass a python module, with components separated by dots:
13 |
14 | .. code-block:: bash
15 |
16 | caliban cloud trainer.train -- --epochs 2 --job_dir my_directory
17 |
18 | Your script will execute inside the container environment with the following
19 | command:
20 |
21 | .. code-block:: bash
22 |
23 | python -m trainer.train --epochs 2 --job_dir my_directory
24 |
25 | If instead you supply a relative path to the python file, like this:
26 |
27 | .. code-block:: bash
28 |
29 | caliban cloud trainer/train.py -- --epochs 2 --job_dir my_directory
30 |
31 | Caliban will execute your code as a python *script* by passing it directly to
32 | python without the ``-m`` flag, like this:
33 |
34 | .. code-block:: bash
35 |
36 | python trainer/train.py --epochs 2 --job_dir my_directory
37 |
38 | What does this mean for you? Concretely it means that if you execute your code
39 | as a module, all imports inside of your script have to be declared relative to
40 | the root directory, i.e., the directory where you run the caliban command. If you
41 | have other files inside of the ``trainer`` directory, you'll have to import them
42 | from ``trainer/train.py`` like this:
43 |
44 | .. code-block:: python
45 |
46 | import trainer.util
47 | from trainer.cloud import load_bucket
48 |
49 | We do this because it enforces a common structure for all code. The reproducible
50 | unit is the directory that holds all of the code. The script doesn't live in
51 | isolation; it's part of a project, and depends on the other files in the code
52 | tree as well as the dependencies declared in the root directory.
53 |
54 | If you run your code as a script, imports will only work if they're relative to
55 | the file itself, not to the running code.
56 |
57 | I highly recommend running code as a module!
58 |
59 | Using Caliban with Shell Scripts
60 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
61 |
62 | Caliban can build containers for you that will execute arbitrary shell scripts,
63 | in addition to python code.
64 |
65 | If you pass a relative path that points to any file other than:
66 |
67 |
68 | * a python module, or
69 | * an explicit path to a python file ending with ``.py``\ ,
70 |
71 | to ``caliban cloud``\ , ``caliban run`` or one of the other modes that accepts
72 | modules, caliban will execute the code as a bash script.
73 |
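As a quick sketch, a hypothetical ``run.sh``\ :

.. code-block:: bash

   #!/bin/bash
   # caliban passes any flags straight through to the script.
   echo "launching with: $@"
   python -m trainer.train "$@"

submitted via ``caliban cloud run.sh -- --epochs 2`` would run inside the
container just like a module would.
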
74 | This feature is compatible with :doc:`custom script arguments
75 | <custom_script_args>` or an :doc:`experiment broadcast
76 | <experiment_broadcasting>`\ ; your shell script will receive the same flags that
77 | any python module would receive.
78 |
--------------------------------------------------------------------------------
/docs/explore/why_caliban.rst:
--------------------------------------------------------------------------------
1 | Why Caliban and Docker?
2 | ^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Caliban uses Docker to build isolated environments for your research code. What
5 | does this mean, and why would you want to do this?
6 |
7 | One major source of friction in machine learning research is the potential
8 | mismatch between the environment where your code runs during local development
9 | and the environment in AI Platform or Cloud. Here's a typical situation:
10 |
11 |
12 | * You run your code locally against some set of dependencies you installed
13 | months ago in the virtual environment you use for all your code.
14 | * You get everything working and submit it to Cloud. Minutes later you see a
15 | failure - your specified Tensorflow version is wrong. You submit again,
16 | specifying the beta of TF 2.0 that you've been using... and the job fails.
17 | That version's not available in Cloud.
18 | * Finally the submission works, but the job fails again. The ``gsutil`` command
19 | you've been shelling out to to save your models locally isn't available on
20 | AI Platform.
21 | * You sigh and look at the clock. It's 4pm. Should I have another cup of
22 | coffee? What am I even doing? Is this what my life has become?
23 |
24 | Each of these issues is small, but they stack up and turn you into a broken,
25 | cautious person, afraid to flex the wings you've forgotten are attached to your
26 | back.
27 |
28 | Docker is the answer to this problem. `Docker <https://www.docker.com/>`_ is a
29 | piece of software that allows you to build and run "containers"; you can think
30 | of a container as a tiny Linux machine that you can run on your Mac or
31 | workstation, or ship off to execute on AI platform. The container gets access to
32 | the resources of the machine where it's running, but can't affect that machine
33 | in any other way.
34 |
35 | If you design your Python code to run inside of a container, you can move that
36 | container between different environments and know that the code's behavior won't
37 | change.
38 |
39 | The Trouble with Bare Docker
40 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41 |
42 | To build a Docker container for your code you need to write a ``Dockerfile``. If
43 | you try this you'll realize that you actually need many ``Dockerfile`` copies...
44 | one for GPU mode. One for CPU mode locally. Slight tweaks show up every time you
45 | want to add some environment variable; locally, you don't want to copy your code
46 | into the container, since you can live-mount the directory using ``docker run``\ ,
47 | but on AI Platform you DO need a copy.
48 |
49 | Soon your ``Dockerfile`` is infested with comments and instructions to a future,
50 | less patient version of yourself, even less capable of remembering all of this
51 | than you are now.
52 |
53 | Caliban + Docker = <3
54 | ~~~~~~~~~~~~~~~~~~~~~
55 |
56 | If you've felt this pain, you now understand the motivation for Caliban. Caliban
57 | is a tool that dynamically builds docker images (by dynamically generating
58 | ``Dockerfile`` instances) for the various modes you rely on for machine learning
59 | research:
60 |
61 |
62 | * Jupyter notebook development
63 | * Local, interactive development at the shell
64 | * Local execution on your workstation on GPU
65 | * AI platform execution of 100s of jobs for some experiment
66 |
67 | By developing your research workflows inside of Docker containers (made easy by
68 | Caliban) you're much closer to that noble goal of reproducible research.
69 |
70 | Theoretically, you could publish the container that Caliban builds along with
71 | the range of experiment parameters you used to produce your data.
72 |
--------------------------------------------------------------------------------
/docs/getting_started/getting_caliban.rst:
--------------------------------------------------------------------------------
1 | Getting Caliban
2 | ---------------
3 |
4 | .. warning:: If you're currently in a ``virtualenv``\ , please run ``deactivate``
5 | to disable it before proceeding.
6 |
7 | We recommend installing ``caliban`` using `pipx
8 | <https://github.com/pypa/pipx>`_. ``pipx`` is
9 | a tool that lets you install command line utilities written in Python into their
10 | own virtual environments, completely isolated from your system python packages.
11 |
12 | You don't HAVE to do this - you can install caliban in your global environment,
13 | or in a virtualenv - but ``pipx`` is the sanest way we've found to install
14 | Python CLI command tools.
15 |
16 | .. NOTE:: Before you install Caliban, you'll need to visit the
17 | :doc:`prerequisites` page and make sure you have Docker installed and
18 | the correct version of Python 3.
19 |
20 | Install ``pipx`` into your global python environment like this:
21 |
22 | .. code-block:: bash
23 |
24 | python3 -m pip install --user pipx
25 | python3 -m pipx ensurepath
26 |
27 | Once ``pipx`` is installed, use it to install ``caliban``:
28 |
29 | .. code-block:: bash
30 |
31 | pipx install caliban
32 |
33 | If you don't want to use ``pipx``\ , install Caliban via pip:
34 |
35 | .. code-block:: bash
36 |
37 | pip install -U caliban
38 |
39 | Upgrading Caliban
40 | ^^^^^^^^^^^^^^^^^
41 |
42 | With ``pipx``\ , upgrading Caliban is simple. The following command will do it:
43 |
44 | .. code-block:: bash
45 |
46 | pipx upgrade caliban
47 |
48 | If you've installed Caliban with pip:
49 |
50 | .. code-block:: bash
51 |
52 | pip install --upgrade caliban
53 |
54 | Check your Installation
55 | ^^^^^^^^^^^^^^^^^^^^^^^
56 |
57 | To check if all is well, run
58 |
59 | .. code-block:: bash
60 |
61 | caliban --help
62 |
63 | To take Caliban through its paces, visit the `"Getting Started with Caliban"
64 | <https://github.com/google/caliban#getting-started>`_ tutorial on
65 | the main page of `Caliban's github repository
66 | <https://github.com/google/caliban>`_.
67 |
--------------------------------------------------------------------------------
/docs/getting_started/prerequisites.rst:
--------------------------------------------------------------------------------
1 | Prerequisites
2 | -------------
3 |
4 | Before you can use Caliban, you'll need to install Docker and make sure your
5 | Python 3 is up to date. Follow these steps to get set up.
6 |
7 | Python 3
8 | ^^^^^^^^
9 |
10 | Caliban requires Python >= 3.6. Check your current version at the terminal:
11 |
12 | .. code-block:: bash
13 |
14 | $ python3 --version
15 | Python 3.6.9 # Or something above 3.6.0
16 |
17 | If you need to upgrade:
18 |
19 | - On MacOS, download `the latest Python from python.org
20 |   <https://www.python.org/downloads/>`_.
21 | - On Linux, make sure your ``python3`` is up to date by running the following
22 | command at your terminal:
23 |
24 | .. code-block:: bash
25 |
26 | sudo apt-get install python3 python3-venv python3-pip
27 |
28 | Once that's all set, run ``python3 --version`` again to verify that you're running
29 | python 3.6 or above.
30 |
31 | Docker
32 | ^^^^^^
33 |
34 | To use Caliban, you'll need a working Docker installation. If you have a GPU and
35 | want to run jobs that use it, you'll have to install ``nvidia-docker2``, as
36 | described below in :ref:`GPU Support on Linux Machines`.
37 |
38 | - On MacOS, install `Docker Desktop for Mac
39 |   <https://docs.docker.com/docker-for-mac/install/>`_. You'll
40 | only be able to run in CPU mode, as MacOS doesn't support Docker's nvidia
41 | runtime. You will, however, be able to build GPU containers and submit them to
42 | Google Cloud.
43 | - On Linux, install Docker with `these instructions
44 |   <https://docs.docker.com/engine/install/>`_.
45 |
46 | Add your username to the docker group so that you can run Docker without using
47 | ``sudo``:
48 |
49 | .. code-block:: bash
50 |
51 | sudo usermod -a -G docker ${USER}
52 |
53 | GPU Support on Linux Machines
54 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
55 |
56 | On Linux, Caliban can run jobs locally that take advantage of a GPU you may have installed.
57 |
58 | To use this feature, install the ``nvidia-docker2`` runtime by following the
59 | instructions at the `nvidia-docker2
60 | <https://github.com/NVIDIA/nvidia-docker>`_
61 | page.
62 |
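Once installed, you can verify the runtime outside of Caliban (a sketch using
NVIDIA's test image from their installation docs; image tags change over time):

.. code-block:: bash

   docker run --runtime=nvidia --rm nvidia/cuda:11.0-base nvidia-smi
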
63 | .. NOTE:: It's important that you install ``nvidia-docker2``, not
64 | ``nvidia-docker``! The `nvidia-docker2
65 | <https://github.com/NVIDIA/nvidia-docker>`_
66 | instructions discuss how to upgrade if you accidentally install
67 | ``nvidia-docker``.
68 |
69 | .. NOTE:: The most recent versions of docker don't need the ``nvidia-docker2``
70 | dependency. In a future version of Caliban we'll remove this
71 | dependency and upgrade the documentation.
72 |
--------------------------------------------------------------------------------
/docs/gke/cluster_management.rst:
--------------------------------------------------------------------------------
1 | Cluster Management
2 | ^^^^^^^^^^^^^^^^^^
3 |
4 | This section describes how to create and delete clusters. We'll add
5 | documentation on other relevant cluster lifecycle tasks as we go.
6 |
7 | Cluster Creation
8 | ~~~~~~~~~~~~~~~~
9 |
10 | As described in the ``create`` section of :doc:`../cli/caliban_cluster`, you
11 | will typically create a cluster once for a given project and leave it running.
12 |
13 | You can create a cluster for your project as follows:
14 |
15 | .. code-block:: bash
16 |
17 | totoro@totoro:$ caliban cluster create --cluster_name cluster_name --zone us-central1-a
18 | I0204 09:24:08.710866 139910209476416 cli.py:165] creating cluster cluster_name in project totoro-project in us-central1-a...
19 | I0204 09:24:08.711183 139910209476416 cli.py:166] please be patient, this may take several minutes
20 | I0204 09:24:08.711309 139910209476416 cli.py:167] visit https://console.cloud.google.com/kubernetes/clusters/details/us-central1-a/cluster_name?project=totoro-project to monitor cluster creation progress
21 | I0204 09:28:05.274621 139910209476416 cluster.py:1091] created cluster cluster_name successfully
22 | I0204 09:28:05.274888 139910209476416 cluster.py:1092] applying nvidia driver daemonset...
23 |
24 | The command will typically take several minutes to complete, and will provide
25 | you with a URL you can follow to monitor the creation process. The page
26 | will look something like the following:
27 |
28 | .. image:: /_static/img/gke/cluster_create_progress.png
29 | :width: 600
30 | :align: center
31 | :alt: Cluster creation progress
32 |
33 | Once your cluster is created and running, you can view and inspect it in the
34 | cloud dashboard via the ``Kubernetes Engine > Clusters`` menu option:
35 |
36 | .. image:: /_static/img/gke/cluster_dashboard.png
37 | :width: 600
38 | :align: center
39 | :alt: Cluster dashboard
40 |
41 | Cluster Deletion
42 | ~~~~~~~~~~~~~~~~
43 |
44 | In most cases you will bring up your cluster and leave it running. The cluster
45 | master does consume resources, however, so if you know that you are not going to
46 | be submitting jobs to your cluster for some length of time, you may want to
47 | delete your cluster to save money. Before doing this, please make sure that all
48 | of your jobs are complete, as deleting the cluster will also kill any running
49 | jobs. Deleting the cluster is straightforward: simply use the
50 | :doc:`../cli/caliban_cluster` ``delete`` command.
51 |
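A sketch of the invocation, mirroring the flags used for ``create`` above (see
the CLI page for the exact set of supported options):

.. code-block:: bash

   caliban cluster delete --cluster_name cluster_name --zone us-central1-a
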
--------------------------------------------------------------------------------
/docs/gke/concepts.rst:
--------------------------------------------------------------------------------
1 | GKE Concepts
2 | ^^^^^^^^^^^^
3 |
4 | Caliban makes it easy to create your own GKE Cluster - similar to your own
5 | personal copy of AI Platform - in your Cloud project, and submit jobs to that
6 | cluster. The advantage over AI Platform currently is that you can get more
7 | quota, often 10x what you have available in AI Platform, and many features are
8 | supported in GKE much earlier than they are in AI Platform.
9 |
10 | The quota disparity is particularly notable with TPUs. AI Platform currently
11 | only allows 8 TPUs, while a GKE cluster lets you specify 32, 64, etc. TPUs for a
12 | given job.
13 |
14 | A good collection of GKE documentation can be found in the
15 | `GKE docs <https://cloud.google.com/kubernetes-engine/docs>`_.
16 |
17 | Cluster
18 | ~~~~~~~
19 |
20 | A
21 | `cluster <https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-architecture>`_
22 | is a collection of cloud machines, combining a set of *nodes* that run your
23 | processing jobs and a *control plane* (also referred to as a *cluster master*\ )
24 | that manages these worker nodes and handles scheduling your jobs and creating
25 | worker nodes to run them.
26 |
27 | Cluster Master
28 | ~~~~~~~~~~~~~~
29 |
30 | A
31 | `cluster master <https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-architecture>`_
32 | is the controller for the cluster and all its resources. It handles creating and
33 | deleting worker nodes, and scheduling jobs submitted by users.
34 |
35 | Nodes
36 | ~~~~~
37 |
38 | A
39 | `node <https://kubernetes.io/docs/concepts/architecture/nodes/>`_
40 | is a worker machine (a cloud compute engine instance) that actually performs the
41 | work your job requires. The cluster control plane creates and manages these
42 | instances.
43 |
44 | Node Pool
45 | ~~~~~~~~~
46 |
47 | A
48 | `node pool <https://cloud.google.com/kubernetes-engine/docs/concepts/node-pools>`_
49 | is a collection of identical nodes (cpu, memory, gpu, tpu).
50 |
51 | Job
52 | ~~~
53 |
54 | A
55 | `job <https://kubernetes.io/docs/concepts/workloads/controllers/job/>`_
56 | is a task that is to be run to completion using cluster resources. The cluster
57 | control plane manages the resources the job needs and handles restarting the job
58 | in case of failure or preemption. A job probably matches the concept you have in
59 | mind when you think of a job you submit to AI platform. A job is a top-level
60 | task, which may run across multiple machines/containers; in GKE these are
61 | referred to as *pods*\ , described below.
62 |
63 | Pod
64 | ~~~
65 |
66 | A `pod <https://kubernetes.io/docs/concepts/workloads/pods/>`_ is a
67 | single, ephemeral, running execution of your container. A job may run on several
68 | pods.
69 |
--------------------------------------------------------------------------------
/docs/gke/prereq.rst:
--------------------------------------------------------------------------------
1 | GKE Prerequisites
2 | ^^^^^^^^^^^^^^^^^
3 |
4 | There are a few prerequisites for creating and submitting jobs to a GKE cluster.
5 |
6 | Required Permissions
7 | ~~~~~~~~~~~~~~~~~~~~
8 |
9 | To create and use a GKE cluster, you'll need to modify your service account key
10 | to give it Account Owner permissions. Those instructions live at the
11 | :doc:`/cloud/service_account` docs page. Note that this only applies if you are
12 | using a service account key.
13 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/recipes/dockerignore.rst:
--------------------------------------------------------------------------------
1 | dockerignore speeds up builds
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Many of Caliban's commands begin their work by triggering a ``docker build``
5 | command; this command has a side effect of bundling up the entire directory
6 | where you run the command into a "build context", which is zipped up and sent
7 | off to the Docker build process on your machine.
8 |
9 | In a directory containing machine learning code, it's not unusual that you might
10 | also have subdirectories that contain, for example:
11 |
12 |
13 | * large datasets that you've cached locally
14 | * tensorboard output from local runs
15 | * metrics
16 |
17 | If you don't want to include any of these things in the Docker container that
18 | caliban builds for you, you can significantly speed up your builds by creating a
19 | file called ``.dockerignore`` in the directory of your project.
20 |
21 | Here's an example ``.dockerignore`` file, with comments explaining each line:
22 |
23 | .. code-block::
24 |
25 | # ignore the git repository info and the pip installation cache
26 | .git
27 | .cache
28 |
29 | # this is huge - ignore the virtualenv we've created inside the folder!
30 | env
31 |
32 | # tests don't need to ship inside the container image.
33 | tests
34 |
35 | # no need to package info about the packaged-up code in egg form.
36 | *.egg-info
37 |
38 | # These files are here for local development, but have nothing
39 | # to do with the code itself, and don't belong on the docker image.
40 | Makefile
41 | pylintrc
42 | setup.cfg
43 | __pycache__
44 | .coverage
45 | .pytest_cache
46 |
47 | As a starting point, you might take your project's ``.gitignore`` file, copy
48 | everything over to ``.dockerignore`` and then delete any entries that you
49 | actually DO need inside your Docker container. An example might be some data you
50 | don't control with ``git``\ , but that you do want to include in the container using
51 | Caliban's ``-d`` flag.
52 |
--------------------------------------------------------------------------------
/docs/recipes/flagfile.rst:
--------------------------------------------------------------------------------
1 | Passing Flags via --flagfile
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | If you find yourself passing lots of flags in to some caliban subcommand, you
5 | might consider Abseil's ``--flagfile`` feature.
6 |
7 | .. NOTE:: `Abseil <https://abseil.io/>`_ is a Google library that we
8 | use to generate Caliban's CLI. You can see the options Abseil
9 | provides on top of Caliban's arguments by
10 | passing ``--helpfull`` to any command; ``caliban cloud --helpfull``\ , for
11 | example.
12 |
13 | ``--flagfile`` allows you to put any number of flags or arguments to caliban into
14 | a file, one pair per line. Given some file like ``my_args.txt`` with the following
15 | contents:
16 |
17 | .. code-block::
18 |
19 | --docker_run_args "--env CUDA_VISIBLE_DEVICES=0"
20 | --experiment_config experiment_one.json
21 | --cloud_key my_key.json
22 | --extras extra_deps
23 |
24 | You could run the following command:
25 |
26 | .. code-block:: bash
27 |
28 | caliban run --flagfile my_args.txt trainer.train
29 |
30 | All arguments expand in-line, so the above command would be equivalent to
31 | running:
32 |
33 | .. code-block:: bash
34 |
35 | caliban run --docker_run_args "--env CUDA_VISIBLE_DEVICES=0" \
36 | --experiment_config experiment_one.json \
37 | --cloud_key my_key.json \
38 | --extras extra_deps \
39 | trainer.train
40 |
41 | One major benefit is that you can share groups of arguments between various
42 | subcommand invocations, like ``caliban run`` and ``caliban cloud``\ , without having
43 | to store large duplicated strings of arguments.
44 |
45 | Nested Flagfiles
46 | ~~~~~~~~~~~~~~~~
47 |
48 | You can supply ``--flagfile some_file`` arguments inside flag files! This allows
49 | you to build up trees of arguments in a fine grained way. Imagine some flagfile
50 | called ``v100_project.flags``\ :
51 |
52 | .. code-block:: text
53 |
54 | # Definition for big iron GPUs.
55 | --gpu_spec 8xV100
56 | --machine_type n1-highcpu-64
57 | --cloud_key my_key.json
58 |
59 | And then some further file called ``tpu_plus_gpu.flags``\ :
60 |
61 | .. code-block:: text
62 |
63 | --flagfile v100_project.flags
64 | --tpu_spec 8xV3
65 | --region us-central1
66 |
67 | The command:
68 |
69 | .. code-block:: bash
70 |
71 | caliban cloud --flagfile tpu_plus_gpu.flags trainer.train
72 |
73 | Would expand out **both** sets of flags, as expected. (I don't know what would
74 | happen if each file referenced the other... feel free to try!)
75 |
76 | For more information, check out the
77 | `Abseil flags documentation <https://abseil.io/docs/python/guides/flags>`_.
78 |
--------------------------------------------------------------------------------
/docs/recipes/local_dir.rst:
--------------------------------------------------------------------------------
1 | Mounting a Local Directory for Data Persistence
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | Let's say you're using ``caliban run`` with an experiment configuration to run
5 | many experiments locally. Because ``caliban run`` attempts to look just like the
6 | environment you'll see in the Cloud, the command doesn't mount any local
7 | directories by default; the container is completely isolated, and you (usually)
8 | have to persist data by writing it to a Cloud bucket.
9 |
10 | It's possible to avoid this, however, and use Caliban to mount a local directory
11 | into the Docker container. If you do this, you can take advantage of local
12 | experiment broadcasting to loop through many experimental runs on your
13 | workstation, and still persist all results and models to your local machine.
14 |
15 | The answer comes from the :doc:`../explore/custom_docker_run` feature. If you
16 | pass
17 |
18 | .. code-block:: bash
19 |
20 | --docker_run_args "--volume workstation_dir:/foo"
21 |
22 | to ``caliban run``\ , Caliban will mount the directory at ``workstation_dir`` into
23 | your container at ``/foo``. (You can use any name or directory you choose instead
24 | of ``/foo``\ , of course.)
25 |
26 | Let's look at an example. The following command will mount a folder called
27 | ``data`` in your workstation's home directory into your container.
28 |
29 | .. code-block:: bash
30 |
31 | caliban run \
32 | --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo" \
33 | --experiment_config exp_config.json \
34 | trainer.train
35 |
36 | When you look at ``/foo`` inside the container, you'll see all of the files on
37 | your workstation at ``/usr/local/google/home/totoro/data``. If you create or
38 | edit any files, those changes will happen to the files on your workstation as
39 | well.
40 |
41 | .. WARNING:: For some reason I don't understand, if you pass ``-v`` instead of
42 | ``--volume``\ , as in ``--docker_run_args "-v mydir:containerdir"``\ , the
43 | argument parser in Caliban will break. Use ``--volume`` and you'll be set!
44 |
45 | If you want to play around with volume mounting, you can pass the same argument
46 | to ``caliban shell`` to get an interactive view of the filesystem your container
47 | will have access to when you run the above command:
48 |
49 | .. code-block:: bash
50 |
51 | # "--bare" prevents your home directory from mounting.
52 | caliban shell --bare \
53 | --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo"
54 |
55 | In the shell that launches you'll see the directory mirrored:
56 |
57 | .. code-block::
58 |
59 | $ caliban shell --docker_run_args "--volume /usr/local/google/home/totoro/data:/foo" --nogpu --bare
60 | I0122 14:30:24.923780 4445842880 docker.py:438] Running command: docker build --rm -f- /Users/totoro/code/python/tutorials/hello-tensorflow
61 | Sending build context to Docker daemon 36.56MB
62 | <....lots of Docker output....>
63 | Successfully built f2ba6fb7b628
64 | I0122 14:30:33.125234 4445842880 docker.py:666] Running command: docker run --ipc host -w /usr/app -u 735994:89939 -v /Users/totoro/code/python/tutorials/hello-tensorflow:/usr/app -it --entrypoint /bin/bash --volume /usr/local/google/home/totoro/data:/foo f2ba6fb7b628
65 | _________ __ ________ ___ _ __ __ __
66 | / ____/ | / / / _/ __ )/ | / | / / \ \ \ \
67 | / / / /| | / / / // __ / /| | / |/ / \ \ \ \
68 | / /___/ ___ |/ /____/ // /_/ / ___ |/ /| / / / / /
69 | \____/_/ |_/_____/___/_____/_/ |_/_/ |_/ /_/ /_/
70 |
71 | You are running caliban shell as user with ID 735994 and group 89939,
72 | which should map to the ID and group for your user on the Docker host. Great!
73 |
74 | caliban-shell /usr/app > ls -al /foo
75 | total 9788
76 | drwx------ 21 totoro 89939 672 Jan 22 20:35 .
77 | drwxr-xr-x 1 root root 4096 Jan 22 21:30 ..
78 | -rw-r--r-- 1 totoro 89939 41689 Jan 20 21:48 sets.png
79 | -rw-r--r-- 1 totoro 89939 82811 Jan 20 21:48 tree.png
80 | caliban-shell /usr/app >
81 |
--------------------------------------------------------------------------------
/docs/recipes/single_gpu.rst:
--------------------------------------------------------------------------------
1 | Using a Single GPU
2 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3 |
4 | By default, ``docker run`` will make all GPUs on your workstation available
5 | inside of the container. This means that in ``caliban shell``\ , ``caliban
6 | notebook`` or ``caliban run``\ , any jobs executed on your workstation will
7 | attempt to use:
8 |
9 |
10 | * your huge GPU, custom-built and installed for ML Supremacy
11 | * the dinky GPU that exists solely to power your monitor, NOT to help train
12 | models
13 |
14 | The second GPU will slow down everything.
15 |
16 | To stop this from happening you need to set the ``CUDA_VISIBLE_DEVICES``
17 | environment variable equal to ``0``\ , as described on this
18 | `nvidia blog <https://developer.nvidia.com/blog/cuda-pro-tip-control-gpu-visibility-cuda_visible_devices/>`_
19 | about the issue.
20 |
21 | You can set the environment variable inside your container by passing
22 | ``--docker_run_args`` to caliban, like this:
23 |
24 | .. code-block:: bash
25 |
26 | caliban run --docker_run_args "--env CUDA_VISIBLE_DEVICES=0" trainer.train
27 |
28 | .. NOTE:: you may have noticed that this problem doesn't happen when you run a
29 | job inside ``caliban shell``. Your local environment may have
30 | ``CUDA_VISIBLE_DEVICES`` set. ``caliban shell`` and ``caliban notebook``
31 | mount your home directory by default, which loads all of your local
32 | environment variables into the container and, if you've set this environment
33 | variable, modifies this setting inside your container. This doesn't happen
34 | with ``caliban run`` or ``caliban cloud``. You will always need to use this
35 | trick with those modes.
36 |
37 | There are two other ways to solve this problem using the custom ``docker run``
38 | arguments detailed in :doc:`../explore/custom_docker_run`.
39 | You can directly limit the GPUs that mount into the container using the ``--gpus``
40 | argument:
41 |
42 | .. code-block:: bash
43 |
44 | caliban run --docker_run_args "--gpus device=0" trainer.train
45 |
46 | If you run ``nvidia-smi`` in the container after passing this argument you won't
47 | see more than 1 GPU. This is useful if you know that some library you're using
48 | doesn't respect the ``CUDA_VISIBLE_DEVICES`` environment variable for any reason.
49 |
50 | You could also pass this and other environment variables using an env file.
51 | Given some file, say, ``myvars.env``\ , whose contents look like this:
52 |
53 | .. code-block:: text
54 |
55 | CUDA_VISIBLE_DEVICES=0
56 | IS_THIS_A_VARIABLE=yes
57 |
58 | The ``--env-file`` argument will load all of the referenced variables into the
59 | docker environment:
60 |
61 | .. code-block:: bash
62 |
63 | caliban run --docker_run_args "--env-file myvars.env" trainer.train
64 |
65 | Check out :doc:`../explore/custom_docker_run` for more information.
66 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx==3.0.4
2 | sphinx_rtd_theme
3 |
--------------------------------------------------------------------------------
/paper/10.21105.joss.02403.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/paper/10.21105.joss.02403.pdf
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @article{merkel2014docker,
2 | author = {Merkel, Dirk},
3 | title = {Docker: {L}ightweight {L}inux {C}ontainers for {C}onsistent {D}evelopment and {D}eployment},
4 | year = {2014},
5 | issue_date = {March 2014},
6 | publisher = {Belltown Media},
7 | address = {Houston, TX},
8 | volume = {2014},
9 | number = {239},
10 | issn = {1075-3583},
11 | abstract = {Docker promises the ability to package applications and their dependencies into lightweight containers that move easily between different distros, start up quickly and are isolated from each other.},
12 | journal = {Linux J.},
13 | month = mar,
14 | articleno = {2},
15 | numpages = {1}
16 | }
17 |
18 | @inproceedings{cito2016,
19 | author = {Cito, J\"{u}rgen and Gall, Harald C.},
20 | title = {Using {D}ocker {C}ontainers to {I}mprove {R}eproducibility in {S}oftware {E}ngineering {R}esearch},
21 | year = {2016},
22 | isbn = {9781450342056},
23 | publisher = {Association for Computing Machinery},
24 | address = {New York, NY, USA},
25 | url = {https://doi.org/10.1145/2889160.2891057},
26 | doi = {10.1145/2889160.2891057},
27 | booktitle = {Proceedings of the 38th International Conference on Software Engineering Companion},
28 | pages = {906–907},
29 | numpages = {2},
30 | keywords = {containers, reproducibility, cloud},
31 | location = {Austin, Texas},
32 | series = {ICSE ’16}
33 | }
34 |
35 | @inproceedings{deng2009imagenet,
36 | title={Imagenet: A large-scale hierarchical image database},
37 | author={Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
38 | booktitle={2009 {IEEE} conference on computer vision and pattern recognition},
39 | pages={248--255},
40 | url={https://doi.org/10.1109/cvpr.2009.5206848},
41 | doi={10.1109/cvpr.2009.5206848},
42 | year={2009},
43 | organization={IEEE}
44 | }
45 |
46 | @article{zaharia2018accelerating,
47 | title={Accelerating the {M}achine {L}earning {L}ifecycle with {MLflow}},
48 | author={M. Zaharia and Andrew Chen and A. Davidson and A. Ghodsi and S. Hong and A. Konwinski and Siddharth Murching and Tomas Nykodym and P. Ogilvie and Mani Parkhe and F. Xie and Corey Zumar},
49 | journal={{IEEE} Data Eng. Bull.},
50 | year={2018},
51 | volume={41},
52 | pages={39-45}
53 | }
54 |
55 | @inproceedings{Forde2018ReproducingML,
56 | title={Reproducing {M}achine {L}earning {R}esearch on {B}inder},
57 | author={Forde, Jessica and Bussonnier, Matthias and Fortin, F{\'e}lix-Antoine and Granger, Brian and Head, Tim and Holdgraf, Chris and Ivanov, Paul and Kelley, Kyle and Pacer, M and Panda, Yuvi and others},
58 | booktitle={{NIPS} {W}orkshop on {M}achine {L}earning {O}pen {S}ource {S}oftware},
59 | year={2018}
60 | }
61 |
62 | @article{DBLP:journals/corr/JonasVSR17,
63 | author = {Eric Jonas and
64 | Shivaram Venkataraman and
65 | Ion Stoica and
66 | Benjamin Recht},
67 | title = {Occupy the {C}loud: {D}istributed {C}omputing for the 99{\%}},
68 | journal = {CoRR},
69 | volume = {abs/1702.04024},
70 | year = {2017},
71 | url = {http://arxiv.org/abs/1702.04024},
72 | archivePrefix = {arXiv},
73 | eprint = {1702.04024},
74 | timestamp = {Mon, 13 Aug 2018 16:49:06 +0200},
75 | biburl = {https://dblp.org/rec/journals/corr/JonasVSR17.bib},
76 | bibsource = {dblp computer science bibliography, https://dblp.org}
77 | }
78 |
79 | @inproceedings{adam_richie-halford-proc-scipy-2018,
80 | author = { {A}dam {R}ichie-{H}alford and {A}riel {R}okem },
81 | title = { {C}loudknot: {A} {P}ython {L}ibrary to {R}un your {E}xisting {C}ode on {A}{W}{S} {B}atch },
82 | booktitle = { {P}roceedings of the 17th {P}ython in {S}cience {C}onference },
83 | pages = { 8 - 14 },
84 | year = { 2018 },
85 | editor = { {F}atih {A}kici and {D}avid {L}ippa and {D}illon {N}iederhut and {M} {P}acer },
86 | doi = { 10.25080/Majora-4af1f417-001 }
87 | }
88 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff]
2 | indent-width = 2
3 |
4 | [tool.ruff.lint.pydocstyle]
5 | convention = "google"
6 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # Required for development, not publication.
2 | hypothesis
3 | ipython
4 | pre-commit
5 | pytest==7.3.2
6 | pytest-cov==4.1.0
7 | pytest-subprocess==1.5.0
8 | twine
9 |
--------------------------------------------------------------------------------
/scripts/bashrc:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Basic bash commands for Caliban's shell.
4 |
5 | export PS1="\[\e[31m\]caliban-shell\[\e[m\] \[\e[33m\]\w\[\e[m\] > "
6 | export TERM=xterm-256color
7 | alias grep="grep --color=auto"
8 | alias ls="$(which ls) --color=auto"
9 | alias ll="ls -al"
10 |
11 |
12 | printf "\e[1;34m"
13 | cat <<EOF
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
47 | "tqdm>=4.45.0",
48 | "kubernetes>=10.0.1",
49 | "google-auth>=1.19.0",
50 | "google-cloud-core>=1.0.3",
51 | "google-cloud-container>=0.3.0",
52 | "psycopg2-binary==2.9.6",
53 | "schema==0.7.5",
54 | "urllib3>=1.25.7",
55 | "yaspin>=0.16.0",
56 | "SQLAlchemy==1.3.11",
57 | "pg8000==1.16.1",
58 | ]
59 |
60 | setup(
61 | name="caliban",
62 | version=with_versioneer(lambda v: v.get_version()),
63 | cmdclass=with_versioneer(lambda v: v.get_cmdclass(), {}),
64 | description="Docker-based job runner for AI research.",
65 | long_description=readme(),
66 | long_description_content_type="text/markdown",
67 | python_requires=">=3.6.0",
68 | author="Caliban Team",
69 | author_email="samritchie@google.com",
70 | url="https://github.com/google/caliban",
71 | license="Apache-2.0",
72 | packages=find_packages(exclude=("tests", "docs")),
73 | install_requires=REQUIRED_PACKAGES,
74 | include_package_data=True,
75 | entry_points={
76 | "console_scripts": [
77 | "caliban = caliban.main:main",
78 | "expansion = caliban.expansion:main",
79 | ]
80 | },
81 | )
82 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/config/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/docker/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/docker/test_build.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import caliban.docker.build as b
18 |
19 |
20 | def test_shell_dict():
21 | """Tests that the shell dict has an entry for all possible Shell values."""
22 |
23 | assert set(b.Shell) == set(b.SHELL_DICT.keys())
24 |
25 |
26 | def test_copy_command():
27 | multiline = b.copy_command(
28 | 1, 1, "face", "cake", "This is an example\nof a multiline comment."
29 | )
30 |
31 | assert (
32 | multiline
33 | == """# This is an example
34 | # of a multiline comment.
35 | COPY --chown=1:1 face cake
36 | """
37 | )
38 |
39 | # single lines don't append comments.
40 | oneline = b.copy_command(1, 1, "face", "cake.py")
41 | assert (
42 | oneline
43 | == """COPY --chown=1:1 face cake.py
44 | """
45 | )
46 |
47 | # single comments work.
48 | oneline_comment = b.copy_command(1, 1, "face", "cake.py", comment="Comment!")
49 | assert (
50 | oneline_comment
51 | == """# Comment!
52 | COPY --chown=1:1 face cake.py
53 | """
54 | )
55 |
--------------------------------------------------------------------------------
/tests/caliban/docker/test_push.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import caliban.docker.push as p
18 |
19 |
20 | def register_list_tags(process, project_id, tag, **kwargs):
21 | process.register_subprocess(
22 | [
23 | "gcloud",
24 | "container",
25 | "images",
26 | "list-tags",
27 | f"--project={project_id}",
28 | "--format=json",
29 | tag,
30 | ],
31 | **kwargs,
32 | )
33 |
34 |
35 | def test_image_tag_for_project():
36 | """Tests that we generate a valid image tag for domain-scoped and modern
37 | project IDs.
38 |
39 | """
40 | assert p._image_tag_for_project("face", "imageid") == "gcr.io/face/imageid:latest"
41 |
42 | assert (
43 | p._image_tag_for_project("google.com:face", "imageid")
44 | == "gcr.io/google.com/face/imageid:latest"
45 | )
46 |
47 |
48 | def test_force_push_uuid_tag(fake_process):
49 | """Check that the push command actually attempts to tag and push."""
50 | project_id = "project"
51 | image_id = "imageid"
52 |
53 | tag = p._image_tag_for_project(project_id, image_id)
54 |
55 | fake_process.register_subprocess(["docker", "tag", image_id, tag])
56 | fake_process.register_subprocess(["docker", "push", tag])
57 |
58 | assert p.push_uuid_tag(project_id, image_id, force=True) == tag
59 |
60 |
61 | def test_already_pushed_uuid_tag(fake_process):
62 | """Check that push_uuid_tag does NOT attempt to push if the process already
63 | exists.."""
64 | project_id = "project"
65 | image_id = "imageid"
66 |
67 | base_tag = p._image_tag_for_project(project_id, image_id, include_tag=False)
68 | tag = p._image_tag_for_project(project_id, image_id)
69 |
70 | register_list_tags(fake_process, project_id, base_tag, stdout='[{"metadata": []}]')
71 |
72 | assert p.push_uuid_tag(project_id, image_id) == tag
73 |
74 |
75 | def test_push_uuid_tag_if_no_remote_image(fake_process):
76 | """Check that push_uuid_tag DOES attempt to push if the image doesn't exist in
77 | the remote container registry already.
78 |
79 | """
80 | project_id = "project"
81 | image_id = "imageid"
82 |
83 | base_tag = p._image_tag_for_project(project_id, image_id, include_tag=False)
84 | tag = p._image_tag_for_project(project_id, image_id)
85 |
86 | register_list_tags(fake_process, project_id, base_tag, stdout="[]")
87 |
88 | fake_process.register_subprocess(["docker", "tag", image_id, tag])
89 | fake_process.register_subprocess(["docker", "push", tag])
90 |
91 | assert p.push_uuid_tag(project_id, image_id) == tag
92 |
--------------------------------------------------------------------------------
/tests/caliban/history/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/platform/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/platform/cloud/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/platform/cloud/test_types.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import unittest
18 | from argparse import ArgumentTypeError
19 |
20 | import hypothesis.strategies as st
21 | from hypothesis import given
22 |
23 | import caliban.platform.cloud.types as ct
24 |
25 |
26 | class TypesTestSuite(unittest.TestCase):
27 | """Tests for caliban.platform.cloud.types."""
28 |
29 | @given(
30 | st.integers(min_value=0, max_value=40), st.sampled_from(list(ct.GPU) + list(ct.TPU))
31 | )
32 | def test_validate_accelerator_count(self, i, accel):
33 | valid_counts = ct.accelerator_counts(accel)
34 | if i in valid_counts:
35 | self.assertEqual(i, ct.validate_accelerator_count(accel, i))
36 | else:
37 | with self.assertRaises(ArgumentTypeError):
38 | ct.validate_accelerator_count(accel, i)
39 |
40 | def test_parse_machine_type(self):
41 | """Test that strings parse into machine types using the Google Cloud strings,
42 | NOT the name string for the enum.
43 |
44 | """
45 | self.assertEqual(ct.MachineType.standard_8, ct.parse_machine_type("n1-standard-8"))
46 |
47 | with self.assertRaises(ArgumentTypeError):
48 | ct.parse_machine_type("random-string")
49 |
50 | def test_gpuspec_parse_arg(self):
51 | with self.assertRaises(ArgumentTypeError):
52 | # invalid format string, no x separator.
53 | ct.GPUSpec.parse_arg("face")
54 |
55 | with self.assertRaises(ArgumentTypeError):
56 | # Invalid number.
57 | ct.GPUSpec.parse_arg("randomxV100")
58 |
59 | with self.assertRaises(ArgumentTypeError):
60 | # invalid GPU type.
61 | ct.GPUSpec.parse_arg("8xNONSTANDARD")
62 |
63 | with self.assertRaises(ArgumentTypeError):
64 | # Invalid number for the valid GPU type.
65 | ct.GPUSpec.parse_arg("15xV100")
66 |
67 | self.assertEqual(
68 | ct.GPUSpec(ct.GPU.V100, 7), ct.GPUSpec.parse_arg("7xV100", validate_count=False)
69 | )
70 |
71 | # Valid!
72 | self.assertEqual(ct.GPUSpec(ct.GPU.V100, 8), ct.GPUSpec.parse_arg("8xV100"))
73 |
74 | def test_tpuspec_parse_arg(self):
75 | with self.assertRaises(ArgumentTypeError):
76 | # invalid format string, no x separator.
77 | ct.TPUSpec.parse_arg("face")
78 |
79 | with self.assertRaises(ArgumentTypeError):
80 | # Invalid number.
81 | ct.TPUSpec.parse_arg("randomxV3")
82 |
83 | with self.assertRaises(ArgumentTypeError):
84 | # invalid TPU type.
85 | ct.TPUSpec.parse_arg("8xNONSTANDARD")
86 |
87 | with self.assertRaises(ArgumentTypeError):
88 | # Invalid number for the valid TPU type.
89 | ct.TPUSpec.parse_arg("15xV3")
90 |
91 | self.assertEqual(
92 | ct.TPUSpec(ct.TPU.V3, 7), ct.TPUSpec.parse_arg("7xV3", validate_count=False)
93 | )
94 |
95 | # Valid!
96 | self.assertEqual(ct.TPUSpec(ct.TPU.V3, 8), ct.TPUSpec.parse_arg("8xV3"))
97 |
--------------------------------------------------------------------------------
/tests/caliban/platform/gke/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/platform/gke/test_types.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """unit tests for gke utilities"""
17 | import unittest
18 |
19 | from datetime import datetime
20 | import hypothesis.strategies as st
21 | from hypothesis import given
22 | from kubernetes.client import V1Job, V1JobStatus
23 |
24 | from caliban.platform.gke.types import ReleaseChannel, JobStatus
25 |
26 |
27 | # ----------------------------------------------------------------------------
28 | class TypesTestSuite(unittest.TestCase):
29 | """tests for caliban.platform.gke.types"""
30 |
31 | # --------------------------------------------------------------------------
32 | @given(
33 | st.from_regex(r"\A(?!UNSPECIFIED\Z|RAPID\Z|REGULAR\Z|STABLE\Z).*\Z"),
34 | st.sampled_from(ReleaseChannel),
35 | )
36 | def test_release_channel(self, invalid: str, valid: ReleaseChannel):
37 | """test ReleaseChannel"""
38 |
39 | with self.assertRaises(ValueError):
40 | _x = ReleaseChannel(invalid)
41 |
42 | self.assertEqual(valid, ReleaseChannel(valid.value))
43 |
44 |
45 | # ----------------------------------------------------------------------------
46 | def test_job_status():
47 | for s in JobStatus:
48 | terminal = s.is_terminal()
49 | if s.name in ["FAILED", "SUCCEEDED", "UNAVAILABLE"]:
50 | assert terminal
51 | else:
52 | assert not terminal
53 |
54 | # completed jobs
55 | status = V1JobStatus(completion_time=datetime.now(), succeeded=1)
56 | job_info = V1Job(status=status)
57 | job_status = JobStatus.from_job_info(job_info)
58 | assert job_status == JobStatus.SUCCEEDED
59 |
60 | status = V1JobStatus(completion_time=datetime.now(), succeeded=0)
61 | job_info = V1Job(status=status)
62 | job_status = JobStatus.from_job_info(job_info)
63 | assert job_status == JobStatus.FAILED
64 |
65 | # active jobs
66 | status = V1JobStatus(completion_time=None, active=1)
67 | job_info = V1Job(status=status)
68 | job_status = JobStatus.from_job_info(job_info)
69 | assert job_status == JobStatus.RUNNING
70 |
71 | # pending jobs
72 | status = V1JobStatus(completion_time=None, active=0)
73 | job_info = V1Job(status=status)
74 | job_status = JobStatus.from_job_info(job_info)
75 | assert job_status == JobStatus.PENDING
76 |
77 | # unknown state
78 | status = V1JobStatus()
79 | job_info = V1Job(status=status)
80 | job_status = JobStatus.from_job_info(job_info)
81 | assert job_status == JobStatus.STATE_UNSPECIFIED
82 |
83 | job_info = V1Job()
84 | job_status = JobStatus.from_job_info(job_info)
85 | assert job_status == JobStatus.STATE_UNSPECIFIED
86 |
87 | job_status = JobStatus.from_job_info(None)
88 | assert job_status == JobStatus.STATE_UNSPECIFIED
89 |
--------------------------------------------------------------------------------
/tests/caliban/resources/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/tests/caliban/resources/__init__.py
--------------------------------------------------------------------------------
/tests/caliban/resources/test_caliban_launcher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import argparse
18 | import builtins
19 | from google.auth import credentials
20 | import json
21 | import os
22 | import pytest
23 | import tempfile
24 | from typing import Any
25 |
26 | from caliban.resources import caliban_launcher
27 |
28 |
29 | @pytest.mark.parametrize("obj", [["a", 2, 3], {"a": 1, "b": 2}])
30 | def test_parse_json(obj: Any):
31 | # valid json, type
32 | j = caliban_launcher._parse_json("foo", json.dumps(obj), type(obj))
33 | assert j == obj
34 |
35 | # valid json, invalid type
36 | with pytest.raises(argparse.ArgumentTypeError):
37 | j = caliban_launcher._parse_json("bar", json.dumps(None), int)
38 |
39 | # invalid json
40 | with pytest.raises(argparse.ArgumentTypeError):
41 | j = caliban_launcher._parse_json("baz", "[", int)
42 |
43 |
44 | def test_start_services():
45 | with tempfile.TemporaryDirectory() as tmpdir:
46 | outfile = os.path.join(tmpdir, "bar")
47 | svc = [["bash", "-c", "touch $FOO"]]
48 | env = {"FOO": outfile}
49 | caliban_launcher._start_services(svc, env, delay=1)
50 |
51 | assert os.path.exists(outfile)
52 |
53 |
54 | def test_execute_command():
55 | with tempfile.TemporaryDirectory() as tmpdir:
56 | outfile = os.path.join(tmpdir, "bar")
57 | cmd = ["bash", "-c"]
58 | args = ["touch $FOO"]
59 | env = {"FOO": outfile}
60 | caliban_launcher._execute_command(cmd, args, env)
61 |
62 | assert os.path.exists(outfile)
63 |
64 |
65 | def test_load_config_file(monkeypatch):
66 | monkeypatch.setattr(os.path, "exists", lambda x: False)
67 | assert caliban_launcher._load_config_file() == {}
68 |
69 | cfg = {"foo": 7}
70 |
71 | class MockFile:
72 | def __enter__(self):
73 | pass
74 |
75 | def __exit__(self, a, b, c):
76 | pass
77 |
78 | monkeypatch.setattr(os.path, "exists", lambda x: True)
79 | monkeypatch.setattr(builtins, "open", lambda x: MockFile())
80 | monkeypatch.setattr(json, "load", lambda x: cfg)
81 | assert caliban_launcher._load_config_file() == cfg
82 |
83 |
84 | def test_get_config(monkeypatch):
85 | cfg = {"foo": 3, "env": {"a": 0}, "services": ["ls"]}
86 |
87 | class MockArgs:
88 | def __init__(self):
89 | self.caliban_config = cfg
90 |
91 | class MockFile:
92 | def __enter__(self):
93 | pass
94 |
95 | def __exit__(self, a, b, c):
96 | pass
97 |
98 | monkeypatch.setattr(os.path, "exists", lambda x: True)
99 | monkeypatch.setattr(builtins, "open", lambda x: MockFile())
100 | monkeypatch.setattr(json, "load", lambda x: {"env": {}, "services": []})
101 | assert caliban_launcher._get_config(MockArgs()) == cfg
102 |
103 |
104 | def test_ensure_non_null_project(monkeypatch):
105 | # test case where GOOGLE_CLOUD_PROJECT is already set
106 | env = {"foo": "bar", "GOOGLE_CLOUD_PROJECT": "project"}
107 |
108 | new_env = caliban_launcher._ensure_non_null_project(env)
109 | assert env == new_env
110 |
111 | # GOOGLE_CLOUD_PROJECT not set, but valid project from default()
112 | def mock_default(scopes=None, request=None, quota_project_id=None):
113 | return (credentials.AnonymousCredentials(), "foo")
114 |
115 | monkeypatch.setattr("google.auth.default", mock_default)
116 | env = {"foo": "bar"}
117 | assert caliban_launcher._ensure_non_null_project(env) == env
118 |
119 | # GOOGLE_CLOUD_PROJECT not set, no valid project from default()
120 | def mock_default(scopes=None, request=None, quota_project_id=None):
121 | return (credentials.AnonymousCredentials(), None)
122 |
123 | monkeypatch.setattr("google.auth.default", mock_default)
124 | env = {"foo": "bar"}
125 | new_env = caliban_launcher._ensure_non_null_project(env)
126 | for k, v in env.items():
127 | assert new_env.get(k) == v
128 |
129 | assert new_env.get("GOOGLE_CLOUD_PROJECT") is not None
130 |
--------------------------------------------------------------------------------
/tests/caliban/test_cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import unittest
18 |
19 | import caliban.cli as c
20 | import caliban.platform.cloud.types as ct
21 | from caliban.config import JobMode
22 |
23 |
24 | class CLITestSuite(unittest.TestCase):
25 | """Tests for caliban.cli."""
26 |
27 | def test_job_mode(self):
28 | """Tests for all possible combinations of the three arguments to
29 | resolve_job_mode.
30 |
31 | """
32 | gpu_spec = ct.GPUSpec(ct.GPU.P100, 4)
33 | tpu_spec = ct.TPUSpec(ct.TPU.V2, 8)
34 |
35 | def assertMode(expected_mode, use_gpu, gpu_spec, tpu_spec):
36 | mode = c._job_mode(use_gpu, gpu_spec, tpu_spec)
37 | self.assertEqual(mode, expected_mode)
38 |
39 | # --nogpu and no override.
40 | assertMode(JobMode.CPU, False, None, None)
41 |
42 | # TPU doesn't need GPUs
43 | assertMode(JobMode.CPU, False, None, tpu_spec)
44 |
45 | # Default GPUSpec filled in.
46 | assertMode(JobMode.GPU, True, None, None)
47 |
48 | # Explicit GPU spec, so GPU gets attached.
49 | assertMode(JobMode.GPU, True, gpu_spec, None)
50 | assertMode(JobMode.GPU, True, gpu_spec, tpu_spec)
51 |
52 | # If NO explicit GPU is supplied but a TPU is supplied, execute in CPU
53 | # mode, ie, don't attach a GPU.
54 | assertMode(JobMode.CPU, True, None, tpu_spec)
55 |
56 | # explicit GPU spec is incompatible with --nogpu in both of the following
57 | # cases, irrespective of TPU spec.
58 | with self.assertRaises(AssertionError):
59 | c._job_mode(False, gpu_spec, None)
60 |
61 | with self.assertRaises(AssertionError):
62 | c._job_mode(False, gpu_spec, tpu_spec)
63 |
--------------------------------------------------------------------------------
/tests/caliban/util/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/tests/caliban/util/test_argparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from collections import OrderedDict
18 |
19 | import caliban.util.argparse as ua
20 |
21 |
22 | def test_expand_args():
23 | m = OrderedDict([("a", "item"), ("b", None), ("c", "d")])
24 | expanded = ua.expand_args(m)
25 |
26 | # None is excluded from the results.
27 | assert expanded == ["a", "item", "b", "c", "d"]
28 |
29 |
30 | def test_is_key():
31 | """A key is anything that starts with a dash; nothing else!"""
32 | assert ua.is_key("--face")
33 | assert ua.is_key("-f")
34 | assert not ua.is_key("")
35 | assert not ua.is_key("face")
36 | assert not ua.is_key("f")
37 |
38 | # this should never happen, but what the heck, why not test that it's a
39 | # fine thing, accepted yet strange.
40 | assert ua.is_key("-----face")
41 |
--------------------------------------------------------------------------------
/tests/caliban/util/test_auth.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from subprocess import CalledProcessError
18 |
19 | from google.oauth2.credentials import Credentials
20 |
21 | import caliban.util.auth as a
22 |
23 |
24 | def register_auth(process, **kwargs):
25 | process.register_subprocess(["gcloud", "auth", "print-access-token"], **kwargs)
26 |
27 |
28 | def fail_process(process):
29 | process.returncode = 1
30 | raise CalledProcessError(1, "cmd", "exception! Not logged in!")
31 |
32 |
33 | def test_auth_access_token(fake_process):
34 | """Check that if the user has logged in with `gcloud auth login`,
35 | `auth_access_token` returns the correct token.
36 |
37 | """
38 | token = "token"
39 | register_auth(fake_process, stdout=token)
40 | assert a.auth_access_token() == token
41 |
42 |
43 | def test_missing_auth_access_token(fake_process):
44 | """Check that if the user has NOT logged in with `gcloud auth login`,
45 | `auth_access_token` returns None.
46 |
47 | """
48 | register_auth(fake_process, callback=fail_process)
49 | assert a.auth_access_token() is None
50 |
51 |
52 | def test_gcloud_auth_credentials(fake_process):
53 | """Check that if the user has logged in with `gcloud auth login`,
54 | a proper instance of Credentials is returned.
55 |
56 | """
57 | token = "token"
58 | register_auth(fake_process, stdout=token)
59 | assert isinstance(a.gcloud_auth_credentials(), Credentials)
60 |
61 |
62 | def test_missing_gcloud_auth_credentials(fake_process):
63 | """Check that if the user has logged in with `gcloud auth login`,
64 | `auth_access_token` returns the correct token.
65 |
66 | """
67 | register_auth(fake_process, callback=fail_process)
68 | assert a.gcloud_auth_credentials() is None
69 |
--------------------------------------------------------------------------------
/tests/caliban/util/test_schema.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import tempfile
18 |
19 | import schema as s
20 |
21 | import caliban.util.schema as us
22 | import pytest
23 |
24 |
25 | def test_directory(tmpdir):
26 | # Proper directories pass validation.
27 | assert us.Directory.validate(tmpdir) == tmpdir
28 |
29 | # random dirs that I made up don't!
30 | with pytest.raises(s.SchemaError) as e:
31 | assert us.Directory.validate("random")
32 |
33 | # Check that the formatting string works.
34 | assert e.match("Directory 'random' doesn't exist")
35 |
36 |
37 | def test_file():
38 | with tempfile.NamedTemporaryFile() as tmp:
39 | # Existing files pass validation.
40 | assert us.File.validate(tmp.name) == tmp.name
41 |
42 | # random paths that I made up don't!
43 | with pytest.raises(s.SchemaError) as e:
44 | assert us.File.validate("random")
45 |
46 | # Check that the formatting string works.
47 | assert e.match("File 'random' isn't")
48 |
--------------------------------------------------------------------------------
/tests/caliban/util/test_tqdm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import io
18 |
19 | from tqdm.utils import _term_move_up
20 |
21 | import caliban.util.tqdm as ut
22 |
23 |
24 | def test_carriage_return():
25 | def through(xs):
26 | buf = io.StringIO()
27 | f = ut.TqdmFile(file=buf)
28 |
29 | for x in xs:
30 | f.write(x)
31 | f.flush()
32 |
33 | return buf.getvalue()
34 |
35 | # Strings pass through TqdmFile with no newline attached.
36 | assert through(["Yo!"]) == "Yo!"
37 |
38 | # Empty lines do nothing.
39 | assert through(["", "", ""]) == ""
40 |
41 | # A carriage return is converted to a newline, but the next line, if it's
42 | # written, will have the proper prefix to trigger a carriage return.
43 | assert through(["Yo!\r"]) == "Yo!\n"
44 |
45 | # Boom, triggered.
46 | assert through(["Yo!\r", "continue"]) == f"Yo!\n{_term_move_up()}\rcontinue"
47 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """Configuration for Hypothesis tests."""
17 |
18 | import os
19 |
20 | from hypothesis import Verbosity, settings
21 |
22 | settings.register_profile("ci", max_examples=1000)
23 | settings.register_profile("dev", max_examples=10)
24 | settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose)
25 | settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "default"))
26 |
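27 | # To select a profile at test time, set the environment variable read above,
28 | # e.g.: `HYPOTHESIS_PROFILE=dev pytest tests/` (invocation shown is illustrative).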
--------------------------------------------------------------------------------
/tests/context.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import os
18 | import sys
19 | 
20 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
21 | 
--------------------------------------------------------------------------------
/tutorials/README.md:
--------------------------------------------------------------------------------
1 | # Caliban Tutorials
2 |
3 | This directory contains a number of tutorials that show off various aspects of
4 | [Caliban](https://github.com/google/caliban).
5 |
6 | The `basic` directory contains the code for the ["Getting Started with
7 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban)
8 | tutorial on the main page of [Caliban's GitHub
9 | repository](https://github.com/google/caliban).
10 |
11 | More coming soon!
12 |
--------------------------------------------------------------------------------
/tutorials/basic/.calibanconfig.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/tutorials/basic/README.md:
--------------------------------------------------------------------------------
1 | # Basic Tutorial
2 |
3 | This directory contains the code for the ["Getting Started with
4 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban)
5 | tutorial on the main page of [Caliban's GitHub
6 | repository](https://github.com/google/caliban).
7 |
8 | Visit ["Getting Started with
9 | Caliban"](https://github.com/google/caliban#getting-started-with-caliban) for
10 | the full tutorial, and instructions on how to run the code in this folder.
11 |
--------------------------------------------------------------------------------
/tutorials/basic/experiment.json:
--------------------------------------------------------------------------------
1 | {"learning_rate": [0.01, 0.001, 0.0001]}
2 |
--------------------------------------------------------------------------------
/tutorials/basic/mnist.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """This tutorial comes from the Tensorflow MNIST quickstart at
17 | https://www.tensorflow.org/tutorials/quickstart/beginner.
18 |
19 | """
20 | import warnings
21 |
22 | import tensorflow as tf
23 | from absl import app, flags
24 |
25 | warnings.filterwarnings("ignore", category=DeprecationWarning)
26 |
27 | FLAGS = flags.FLAGS
28 |
29 | # Define a command-line argument using the Abseil library:
30 | # https://abseil.io/docs/python/guides/flags
31 | flags.DEFINE_float("learning_rate", 0.1, "Learning rate.")
32 | flags.DEFINE_integer("epochs", 3, "Epochs to train.")
33 |
34 |
35 | def get_keras_model(width=128, activation="relu"):
36 | """Returns an instance of a Keras Sequential model.
37 | https://www.tensorflow.org/api_docs/python/tf/keras/Sequential"""
38 | return tf.keras.models.Sequential(
39 | [
40 | tf.keras.layers.Flatten(input_shape=(28, 28)),
41 | tf.keras.layers.Dense(width, activation=activation),
42 | tf.keras.layers.Dense(width, activation=activation),
43 | tf.keras.layers.Dense(10, activation=None),
44 | ]
45 | )
46 |
47 |
48 | def main(_):
49 | """Train a model against the MNIST dataset and print performance metrics."""
50 | mnist = tf.keras.datasets.mnist
51 |
52 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
53 | x_train, x_test = x_train / 255.0, x_test / 255.0
54 |
55 | model = get_keras_model()
56 |
57 | loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
58 | optimizer = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate)
59 |
60 | model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])
61 |
62 | print(
63 | f"Training model with learning rate={FLAGS.learning_rate} for {FLAGS.epochs} epochs."
64 | )
65 | model.fit(x_train, y_train, epochs=FLAGS.epochs)
66 |
67 | print("Model performance: ")
68 | model.evaluate(x_test, y_test, verbose=2)
69 |
70 |
71 | if __name__ == "__main__":
72 | app.run(main)
73 |
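74 | # Example invocation from this directory, per the tutorial README (flags after
75 | # `--` are forwarded to this script; the invocation is illustrative):
76 | #   caliban run --nogpu mnist.py -- --learning_rate 0.01 --epochs 3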
--------------------------------------------------------------------------------
/tutorials/basic/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow-cpu
2 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/.calibanconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "apt_packages" : ["openssh-client", "curl"],
3 | "mlflow_config" : {"project": "blueshift-research",
4 | "region": "us-central1",
5 | "db": "mlflow",
6 | "user": "postgres",
7 | "password": "mlflow",
8 | "artifact_root": "gs://blueshift-research/mlflow",
9 | "debug" : false}
10 | }
11 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/README.md:
--------------------------------------------------------------------------------
1 | # UV + MLFlow Tutorial [ALPHA!]
2 |
3 | This directory contains a demo of a model training workflow that uses the
4 | [uv-metrics](https://github.com/google/uv-metrics) library to persist metrics to
5 | an [MLFlow](https://mlflow.org/) tracking server.
6 |
7 | This is mostly here for testing and reference. Check back for a documentation
8 | update once the API settles down.
9 |
10 | ## Prerequisites
11 |
12 | Right now this tutorial only supports logging metrics to a SQL-based backing
13 | store; we will update things to allow for local storage in the future. For
14 | now you will need a Google Cloud SQL instance configured for this purpose,
15 | and an MLFlow server set up to serve results from
16 | that instance.
17 | 
18 | To run this tutorial, you will need to edit the `.calibanconfig.json`
19 | file in this directory to reflect your database settings, so that the training
20 | script can connect to the database and log metrics. The specific entries to
21 | edit live under the `mlflow_config` key in `.calibanconfig.json`:
22 |
23 | ```
24 | {
25 | "apt_packages" : ["openssh-client", "curl"],
26 | "mlflow_config" : {"project": ,
27 | "region": ,
28 | "db": ,
29 | "user": ,
30 | "password": ,
31 | "artifact_root": ,
32 | "debug" : false}
33 | }
34 | ```
35 |
36 | One note here: artifact storage is not yet working completely, but please
37 | specify this entry anyway; we will update this tutorial once it is working properly.
38 |
39 | Once you have set these parameters properly, you should be able to run the tutorial code.
40 |
41 | ## Sanity Check (optional)
42 |
43 | A quick sanity check to test your database connection is to set the `debug` flag in
44 | the `.calibanconfig.json` file to `true`, and then use Caliban to run the `hello_world.sh`
45 | script. This script simply prints "hello, world", but by enabling the `debug` flag, we
46 | can check the status of the database connection.
47 |
48 | To run this test:
49 |
50 | ```
51 | caliban run --nogpu hello_world.sh
52 | ```
53 |
54 | If your database settings are configured properly, you should see output like the following:
55 |
56 | ```
57 | Successfully built 5eb8dcef14ce
58 | I0807 13:02:53.008464 139963939288896 tqdm.py:90] Restoring pure python logging
59 | I0807 13:02:53.010536 139963939288896 run.py:74]
60 | I0807 13:02:53.010816 139963939288896 run.py:75] Job 1 - Experiment args: []
61 | I0807 13:02:53.010974 139963939288896 run.py:198] Running command: docker run --ipc host -e PYTHONUNBUFFERED=1 -e COLUMNS=211 -e LINES=19 5eb8dcef14ce ...
62 | 2020/08/07 20:02:53 current FDs rlimit set to 1048576, wanted limit is 8500. Nothing to do here.
63 | 2020/08/07 20:02:53 using credential file for authentication; path="/home//.config/gcloud/application_default_credentials.json"
64 | 2020/08/07 20:02:54 Listening on /tmp/cloudsql/::/.s.PGSQL.5432 for ::
65 | 2020/08/07 20:02:54 Ready for new connections
66 | INFO:root:/bin/bash hello_world.sh
67 | hello, world
68 | I0807 13:03:04.015075 139963939288896 run.py:111] Job 1 succeeded!
69 | ```
70 |
71 | As long as you see `Ready for new connections`, then your configuration should be ok, and you
72 | can disable the `debug` flag and continue with the rest of the tutorial.
73 |
74 | ## Running a Job
75 |
76 | In the Caliban repository:
77 |
78 | ```
79 | git checkout master && git pull
80 | cd tutorials/uv-metrics
81 | ```
82 |
83 | Run a single job:
84 |
85 | ```
86 | caliban run --nogpu trainer.train
87 | ```
88 |
89 | Name the experiment group and run three jobs, one per learning rate in `experiment.json`:
90 |
91 | ```
92 | caliban run --experiment_config experiment.json --xgroup mlflow_tutorial --nogpu trainer.train
93 | ```
94 |
95 | ## Check the MLFlow UI
96 |
97 | You may need to refresh, but the UI should now show multiple experiments. You can view the
98 | status and metrics for your jobs from the UI while your jobs are in progress, which is
99 | useful for long-running jobs.
100 |
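101 | You can also check job status from the command line with Caliban's history
102 | tools; for example, assuming the `mlflow_tutorial` group name used above:
103 | 
104 | ```
105 | caliban status --xgroup mlflow_tutorial
106 | ```
107 | 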
--------------------------------------------------------------------------------
/tutorials/uv-metrics/cli.py:
--------------------------------------------------------------------------------
1 | """CLI Interface for the UV-metrics tutorial example."""
2 |
3 | import argparse
4 |
5 | from absl.flags import argparse_flags
6 |
7 |
8 | def create_parser():
9 | """Creates and returns the argparse instance for the experiment config
10 | expansion app.
11 |
12 | """
13 |
14 | parser = argparse_flags.ArgumentParser(
15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
16 | description="""Configurable arguments for the uv-metrics Caliban tutorial.""",
17 | prog="python -m mnist",
18 | )
19 |
20 | parser.add_argument(
21 | "--gcloud_path",
22 | help="""Path for gcloud logs; if supplied, used for persisting logs. This must be of
23 | the form gs://BUCKET_NAME/subfolder. Logs will be stored in the supplied
24 | folder in a subfolder named after the current job run.""",
25 | )
26 |
27 | parser.add_argument(
28 | "--local_path",
29 | help="""Path for gcloud logs; if supplied, this location on the local filesystem is
30 | used for persisting logs in jsonl format. The path can be relative. Logs
31 | will be stored in the supplied folder in a subfolder named after the
32 | current job run.""",
33 | )
34 |
35 | parser.add_argument(
36 | "--tensorboard_path",
37 | help="""project-local path for tensorboard logs; if supplied, this location on the
38 | local filesystem is used for persisting logs that tensorboard can
39 | read.""",
40 | )
41 |
42 | parser.add_argument(
43 | "--learning_rate", "--lr", type=float, default=0.01, help="Learning rate."
44 | )
45 | parser.add_argument("--epochs", type=int, default=3, help="Epochs to train.")
46 |
47 | return parser
48 |
49 |
50 | def parse_flags(argv):
51 | """Function required by absl.app.run. Internally generates a parser and returns
52 | the results of parsing hello-uv arguments.
53 |
54 | """
55 | args = argv[1:]
56 | return create_parser().parse_args(args)
57 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/experiment.json:
--------------------------------------------------------------------------------
1 | {"learning_rate": [0.01, 0.001, 0.0001]}
2 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/hello_world.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo "hello, world"
3 | sleep 5
4 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2020 Google LLC
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | from setuptools import find_packages, setup
18 |
19 | REQUIRED_PACKAGES = [
20 | "alembic==1.4.2",
21 | "google-cloud-storage",
22 | "matplotlib",
23 | "mlflow==1.10.0",
24 | "pg8000==1.16.1",
25 | "sqlalchemy==1.3.13",
26 | "tensorflow-cpu",
27 | "tensorflow_datasets",
28 | "uv-metrics>=0.4.2",
29 | ]
30 |
31 | setup(
32 | version="0.0.1",
33 | name="uv-metrics-tutorial",
34 | description="UV Metrics example.",
35 | packages=find_packages(exclude=("tests", "docs")),
36 | install_requires=REQUIRED_PACKAGES,
37 | )
38 |
--------------------------------------------------------------------------------
/tutorials/uv-metrics/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/caliban/205da6f25b99c0c98cd0d263c47f25bf5e51fa97/tutorials/uv-metrics/trainer/__init__.py
--------------------------------------------------------------------------------
/tutorials/uv-metrics/trainer/cli.py:
--------------------------------------------------------------------------------
1 | """CLI Interface for the Hello-UV tutorial example."""
2 |
3 | import argparse
4 |
5 | from absl.flags import argparse_flags
6 |
7 |
8 | def create_parser():
9 | """Creates and returns the argparse instance for the experiment config
10 | expansion app.
11 |
12 | """
13 |
14 | parser = argparse_flags.ArgumentParser(
15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter,
16 | description="""Configurable arguments for the UV Metrics demo.""",
17 | prog="python -m trainer.train",
18 | )
19 |
20 | parser.add_argument(
21 | "--gcloud_path",
22 | help="""Path for gcloud logs; if supplied, used for persisting logs. This must be of
23 | the form gs://BUCKET_NAME/subfolder. Logs will be stored in the supplied
24 | folder in a subfolder named after the current job run.""",
25 | )
26 |
27 | parser.add_argument(
28 | "--local_path",
29 | help="""Path for gcloud logs; if supplied, this location on the local filesystem is
30 | used for persisting logs in jsonl format. The path can be relative. Logs
31 | will be stored in the supplied folder in a subfolder named after the
32 | current job run.""",
33 | )
34 |
35 | parser.add_argument(
36 | "--tensorboard_path",
37 | help="""project-local path for tensorboard logs; if supplied, this location on the
38 | local filesystem is used for persisting logs that tensorboard can
39 | read.""",
40 | )
41 |
42 | parser.add_argument(
43 | "--activation",
44 | help="""Activation strings. Choose from the options at
45 | https://www.tensorflow.org/api_docs/python/tf/keras/activations""",
46 | default="relu",
47 | )
48 | parser.add_argument(
49 | "--width", type=int, default=1000, help="Width of the network to train."
50 | )
51 | parser.add_argument(
52 | "--depth", type=int, default=2, help="Depth of the network to train."
53 | )
54 | parser.add_argument(
55 | "--learning_rate",
56 | "--lr",
57 | type=float,
58 | default=0.1,
59 | help="Learning rate to use while training.",
60 | )
61 |
62 | return parser
63 |
64 |
65 | def parse_flags(argv):
66 | """Function required by absl.app.run. Internally generates a parser and returns
67 | the results of parsing hello-uv arguments.
68 |
69 | """
70 | args = argv[1:]
71 | return create_parser().parse_args(args)
72 |
--------------------------------------------------------------------------------