├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── documentation.md │ ├── feature-request.md │ └── questions-help-support.md └── PULL_REQUEST_TEMPLATE │ └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── aws ├── README.md ├── __init__.py ├── auth │ ├── __init__.py │ └── session.py ├── autoscaling.py ├── cfn │ └── setup.yml ├── cloudformation.py ├── config │ ├── sample_specs.json │ ├── user_data_rdzv │ └── user_data_worker ├── petctl.py ├── requirements.txt ├── s3.py └── util.py ├── azure ├── README.md ├── config │ ├── Dockerfile │ ├── kubernetes.json │ └── sample_specs.yaml ├── petctl.py └── util.py ├── design ├── kubernetes │ └── torchelastic-operator-design.md └── torchelastic │ └── 0.2.0 │ ├── design_doc.md │ ├── torchelastic_agent_diagram.jpg │ └── torchelastic_diagram.jpg ├── docs ├── Makefile ├── doc_push.sh ├── requirements.txt ├── source │ ├── _static │ │ └── img │ │ │ ├── efs-setup.jpg │ │ │ ├── pytorch-logo-dark.svg │ │ │ └── pytorch-logo-flame.png │ ├── conf.py │ ├── index.rst │ └── scripts │ │ └── create_redirect_md.py └── src │ └── pip-delete-this-directory.txt ├── examples ├── Dockerfile ├── README.md ├── bin │ ├── fetch_and_run │ └── install_etcd ├── imagenet │ └── main.py └── multi_container │ ├── Dockerfile │ ├── README.md │ ├── docker-compose.yaml │ └── echo.py ├── kubernetes ├── DEVELOPMENT.md ├── Dockerfile ├── Makefile ├── PROJECT ├── README.md ├── TROUBLESHOOTING.md ├── api │ └── v1alpha1 │ │ ├── constants.go │ │ ├── elasticjob_types.go │ │ ├── groupversion_info.go │ │ └── zz_generated.deepcopy.go ├── config │ ├── crd │ │ ├── bases │ │ │ └── elastic.pytorch.org_elasticjobs.yaml │ │ ├── kustomization.yaml │ │ └── kustomizeconfig.yaml │ ├── default │ │ └── kustomization.yaml │ ├── manager │ │ ├── kustomization.yaml │ │ └── manager.yaml │ ├── rbac │ │ ├── elasticjob_editor_role.yaml │ │ ├── elasticjob_viewer_role.yaml │ │ ├── kustomization.yaml │ │ ├── leader_election_role.yaml │ │ ├── leader_election_role_binding.yaml │ │ ├── role.yaml │ │ └── role_binding.yaml │ └── samples │ │ ├── classy-vision.yaml │ │ ├── etcd.yaml │ │ └── imagenet.yaml ├── controllers │ ├── elasticjob_controller.go │ ├── expectation.go │ ├── job.go │ ├── pod.go │ ├── service.go │ ├── suite_test.go │ └── util.go ├── go.mod ├── go.sum ├── hack │ └── boilerplate.go.txt └── main.go ├── requirements.txt ├── scripts └── formatter_python.sh ├── setup.py └── torchelastic ├── __init__.py └── distributed ├── __init__.py └── launch.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | .git 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F41B Bug Report" 3 | about: Submit a bug report to help us improve PyTorch Elastic 4 | 5 | --- 6 | 7 | ## 🐛 Bug 8 | 9 | 10 | 11 | Component (check all that applies): 12 | * [ ] `state api` 13 | * [ ] `train_step api` 14 | * [ ] `train_loop` 15 | * [ ] `rendezvous` 16 | * [ ] `checkpoint` 17 | * [ ] `rollback` 18 | * [ ] `metrics` 19 | * [ ] `petctl` 20 | * [ ] `examples` 21 | * [ ] `docker` 22 | * [ ] other 23 | 24 | 25 | 26 | ## To Reproduce 27 | 28 | Steps to reproduce the behavior: 29 | 30 | 1. 31 | 1. 32 | 1. 
33 | 34 | 35 | 36 | ## Expected behavior 37 | 38 | 39 | 40 | ## Environment 41 | 42 | - torchelastic version (e.g. 0.1.0rc1): 43 | - OS (e.g., Linux): 44 | - How you installed torchelastic (`conda`, `pip`, source, `docker`): 45 | - Docker image and tag (if using docker): 46 | - Build command you used (if compiling from source): 47 | - Git commit (if installed from source): 48 | - Python version: 49 | - CUDA/cuDNN version: 50 | - GPU models and configuration: 51 | - Execution environment (on-prem, aws, etc): 52 | - Any other relevant information: 53 | 54 | ## Additional context 55 | 56 | 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F4DA Documentation" 3 | about: Report an issue related to PyTorch Elastic documentation 4 | 5 | --- 6 | 7 | ## 📚 Documentation 8 | 9 | ## Link 10 | 11 | 12 | ## What does it currently say? 13 | 14 | 15 | ## What should it say? 16 | 17 | 18 | ### Why? 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680Feature Request" 3 | about: Submit a proposal/request for a new feature or enhancement 4 | 5 | --- 6 | 7 | ## Description 8 | 9 | 10 | ## Motivation/Background 11 | 12 | 13 | 14 | ## Detailed Proposal 15 | 16 | 17 | 18 | ## Alternatives 19 | 20 | 21 | 22 | ## Additional context/links 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/questions-help-support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓Questions/Help/Support" 3 | about: Do you need support? We have resources. 4 | 5 | --- 6 | 7 | ## ❓ Questions and Help 8 | 9 | 10 | ### Please note that this issue tracker is not a help form and this issue will be closed. 11 | 12 | Before submitting, please ensure you have gone through our documentation. 
Here
13 | are some links that may be helpful:
14 |
15 | * [What is torchelastic?](../../README.md)
16 | * [Quickstart on AWS](../../aws/README.md)
17 | * [Usage](../../USAGE.md)
18 | * [Examples](../../examples/README.md)
19 | * API documentation
20 |   * [Overview](../../USAGE.md)
21 |   * [Rendezvous documentation](../../torchelastic/rendezvous/README.md)
22 |   * [Checkpointing documentation](../../torchelastic/checkpoint/README.md)
23 |   * [Configuring](../../USAGE.md#configuring)
24 |
25 |
26 | ### Question
27 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch/elastic/bc88e6982961d4117e53c4c8163ecf277f35c2c5/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | docs/src
2 | docs/build
3 | docs/torchelastic.docset
4 | **/__pycache__
5 |
6 | # Mac OS X files
7 | .DS_Store
8 |
9 | # Binaries for programs and plugins
10 | *.exe
11 | *.dll
12 | *.so
13 | *.dylib
14 |
15 | # Test binary, build with `go test -c`
16 | *.test
17 |
18 | # Output of the go coverage tool, specifically when used with LiteIDE
19 | *.out
20 |
21 | # IDE
22 | **/.idea/
23 |
24 | # Operator Binary
25 | kubernetes/bin/manager
26 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # CHANGELOG
2 |
3 | ## 0.2.2 (Feb 18, 2021)
4 |
5 | > **_NOTE:_** This is the last release for torchelastic! We are upstreaming TorchElastic into
6 | > pytorch. See [pytorch issue-50621](https://github.com/pytorch/pytorch/issues/50621).
7 |
8 | ### PyTorch Elastic
9 |
10 | * (new) `torchelastic.multiprocessing`, a drop-in replacement for `torch.multiprocessing` that supports:
11 |   * both function and binary launches
12 |   * inter-process exception propagation
13 |   * piping worker stdout/stderr to separate log files
14 |   * tailing worker log files to the main console with a `{role}_{rank}:` prefix on each line
15 | * Improvements to `torchelastic.events`
16 | * `NCCL_ASYNC_ERROR_HANDLING` set by default in the torchelastic agent
17 | * Implemented a shutdown barrier on the agent to reduce exit time variance
18 | * Minor cosmetic improvements to rendezvous configuration
19 | * Non-functional refactoring of `EtcdRendezvous`
20 | * TSM API improvements
21 |
22 | ## 0.2.1 (October 05, 2020)
23 |
24 | ### PyTorch Elastic
25 |
26 | > **_NOTE:_** As of torch-1.7 and torchelastic-0.2.1, torchelastic will be bundled into the main [pytorch docker](https://hub.docker.com/r/pytorch/pytorch)
27 | image.
[torchelastic/examples](https://hub.docker.com/r/torchelastic/examples) will be available after the torch-1.7 release, since
28 | its base image will now be **pytorch/pytorch**.
29 |
30 | * Torchelastic agent:
31 |   * `run_id` available to workers as the `TORCHELASTIC_RUN_ID` environment variable
32 |   * Allow `max_restarts=0`
33 |   * Worker exit barrier added to the torchelastic agent to protect against variances in worker finish times
34 |   * Improvements to error handling and propagation from the torchelastic agent
35 |   * Enable fault handlers on worker processes to get torch C++ stack traces
36 |
37 | * `torchelastic.distributed.launch` CLI:
38 |   * New option `--role` to allow users to set the worker role name
39 |   * CLI options can now be set via environment variables (e.g. `PET_NNODES="1:2"`)
40 |
41 | * Project:
42 |   * Upgraded to Python 3.8
43 |   * Tests moved to the `test` directory within the respective modules
44 |   * Use Pyre
45 |
46 | * Deprecated:
47 |   * [pytorch/elastic](https://hub.docker.com/r/pytorch/elastic) Docker image
48 |
49 | * Experimental:
50 |   * [Training Session Manager (TSM)](http://pytorch.org/elastic/0.2.1/tsm_driver.html) with localhost scheduler
51 |   * [torchelastic.multiprocessing](http://pytorch.org/elastic/0.2.1/multiprocessing.html)
52 |
53 |
54 | ## 0.2.0 (April 29, 2020)
55 |
56 | ### PyTorch Elastic
57 |
58 | * Separated infrastructure-related work from the user script. [DesignDoc]
59 | * Events API
60 |
61 | [DesignDoc]: https://github.com/pytorch/elastic/blob/master/design/torchelastic/0.2.0/design_doc.md
62 |
63 | ## 0.1.0rc1 (December 06, 2019)
64 |
65 | ### PyTorch Elastic
66 |
67 | * First release of torchelastic, v0.1.0rc1 (experimental)
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to make participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to torchelastic 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | ... 
(in particular how this is synced with internal changes to the project) 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `master`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints (run `scripts/formatter_python.sh`). 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## License 33 | By contributing to torchelastic, you agree that your contributions will be licensed 34 | under the LICENSE file in the root directory of this source tree. 35 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.5-cuda10.1-cudnn7-runtime 2 | 3 | # install torchelastic 4 | WORKDIR /opt/torchelastic 5 | COPY . . 6 | RUN pip install -v . 7 | 8 | WORKDIR /workspace 9 | RUN chmod -R a+w . 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019-present, Facebook, Inc. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TorchElastic 2 | 3 | **IMPORTANT:** This repository is deprecated. 4 | 1. TorchElastic has been upstreamed to PyTorch 1.9 under `torch.distributed.elastic`. 5 | Please refer to the PyTorch documentation [here](https://pytorch.org/docs/stable/distributed.elastic.html). 6 | 7 | 2. The TorchElastic Controller for Kubernetes is no longer being actively maintained in favor of [TorchX](https://pytorch.org/torchx). 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | -------------------------------------------------------------------------------- /aws/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | -------------------------------------------------------------------------------- /aws/auth/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | 9 | from .session import AwsSessionProvider 10 | 11 | 12 | def get_session(region): 13 | return AwsSessionProvider().get_session(region) 14 | 15 | 16 | try: 17 | from .static_init import * # noqa: F401 F403 18 | except ModuleNotFoundError: 19 | pass 20 | -------------------------------------------------------------------------------- /aws/auth/session.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | 9 | import abc 10 | 11 | import boto3 12 | 13 | 14 | class AwsSessionProvider: 15 | """ 16 | Provides AWS credentials in the form of boto3 Session. 17 | This class may be sub-classed to provide custom methods 18 | of getting aws_access_key_id and aws_secret_access_key. 19 | Child classes are expected to provide overriding implementations 20 | of the three `_get_*` methods below. 
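    For example, a subclass that sources static credentials from
    environment variables might look like the following sketch
    (illustrative only; it assumes `import os`, and the MY_APP_*
    variable names are hypothetical):

        class EnvVarSessionProvider(AwsSessionProvider):
            def _get_access_key(self):
                return os.environ.get("MY_APP_ACCESS_KEY_ID")

            def _get_secret_key(self):
                return os.environ.get("MY_APP_SECRET_ACCESS_KEY")

            def _get_session_token(self):
                # returning None is fine for long-lived (non-STS) credentials
                return os.environ.get("MY_APP_SESSION_TOKEN")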
21 |
22 |     When used directly, it follows the default credential
23 |     lookup chain as documented in:
24 |     https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html
25 |     """
26 |
27 |     def get_session(self, region=None) -> boto3.Session:
28 |         access_key = self._get_access_key()
29 |         secret_key = self._get_secret_key()
30 |         session_token = self._get_session_token()
31 |
32 |         # either both access and secret keys are None
33 |         # or both are not None; just check one to assume
34 |         # the presence of the other
35 |         if access_key is None:
36 |             return boto3.session.Session(region_name=region)
37 |         else:
38 |             return boto3.session.Session(
39 |                 aws_access_key_id=access_key,
40 |                 aws_secret_access_key=secret_key,
41 |                 aws_session_token=session_token,
42 |                 region_name=region,
43 |             )
44 |
45 |     def _get_access_key(self):
46 |         """
47 |         Returns the aws_access_key_id. Override when sub-classing.
48 |         """
49 |         return None
50 |
51 |     def _get_secret_key(self):
52 |         """
53 |         Returns the aws_secret_access_key. Override when sub-classing.
54 |         """
55 |         return None
56 |
57 |     def _get_session_token(self):
58 |         """
59 |         Returns the aws_session_token. Override when sub-classing.
60 |         """
61 |         return None
62 |
--------------------------------------------------------------------------------
/aws/autoscaling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | # All rights reserved.
5 | #
6 | # This source code is licensed under the BSD-style license found in the
7 | # LICENSE file in the root directory of this source tree.
8 |
9 | import logging
10 | import os
11 | from enum import Enum, unique
12 |
13 | from jinja2 import Template
14 | from util import wait_for
15 |
16 |
17 | log = logging.getLogger(__name__)
18 |
19 |
20 | @unique
21 | class Accelerator(Enum):
22 |     NONE = 0
23 |     GPU = 1
24 |
25 |     @classmethod
26 |     def get_accelerator(cls, instance_type):
27 |         """
28 |         get_accelerator("p3.2xlarge") returns Accelerator.GPU
29 |         get_accelerator("i3.xlarge") returns Accelerator.NONE
30 |         """
31 |
32 |         instance_accelerators = {
33 |             "g2": Accelerator.GPU,
34 |             "g3": Accelerator.GPU,
35 |             "g4": Accelerator.GPU,
36 |             "p2": Accelerator.GPU,
37 |             "p3": Accelerator.GPU,
38 |         }
39 |
40 |         instance_family = instance_type[0:2]
41 |         return instance_accelerators.get(instance_family, Accelerator.NONE)
42 |
43 |     @classmethod
44 |     def from_str(cls, accelerator_str):
45 |         """
46 |         returns the enum Accelerator value from a string representation
47 |         """
48 |         accelerators = {"none": Accelerator.NONE, "gpu": Accelerator.GPU}
49 |         return accelerators.get(accelerator_str.lower(), Accelerator.NONE)
50 |
51 |     def describe(self):
52 |         """
53 |         Returns a string representation of the enum.
54 |         This method is intended to be used to label certain AWS
55 |         resources in their descriptions/names for informative purposes.
56 |
57 |         e.g.
launch template created for GPUs can be named as: torchelastic_gpu
58 |         """
59 |
60 |         string_rep = {Accelerator.NONE: "cpu", Accelerator.GPU: "gpu"}
61 |         return string_rep.get(self, "unknown_accelerator")
62 |
63 |
64 | class AutoScalingGroup:
65 |     def __init__(self, session):
66 |         self._session = session
67 |         self._asg = session.client("autoscaling")
68 |         self._ec2 = session.client("ec2")
69 |
70 |     def get_user_data(self, user_data_template, **kwargs):
71 |         if os.path.isabs(user_data_template):
72 |             user_data_path = user_data_template
73 |         else:
74 |             user_data_path = os.path.join(os.path.dirname(__file__), user_data_template)
75 |
76 |         with open(user_data_path) as f:
77 |             user_data_template = Template(f.read())
78 |             user_data = user_data_template.render(**kwargs)
79 |             return user_data
80 |
81 |     def get_ami_id(self, accelerator):
82 |         """
83 |         Use the EKS-optimized AMI since it has everything we need pre-installed
84 |         """
85 |
86 |         eks_owner_id = "602401143452"
87 |         eks_amis = {
88 |             Accelerator.NONE: "amazon-eks-node-1.14-v20190927",
89 |             Accelerator.GPU: "amazon-eks-gpu-node-1.14-v20190927",
90 |         }
91 |
92 |         res = self._ec2.describe_images(
93 |             Filters=[
94 |                 {"Name": "owner-id", "Values": [eks_owner_id]},
95 |                 {
96 |                     "Name": "name",
97 |                     "Values": [eks_amis.get(accelerator, eks_amis[Accelerator.NONE])],
98 |                 },
99 |             ]
100 |         )
101 |         images = res["Images"]
102 |         assert (
103 |             len(images) == 1
104 |         ), f"Expected exactly one EKS AMI for {self._session.region_name}, found {len(images)}"
105 |         return images[0]["ImageId"]
106 |
107 |     def create_launch_config(
108 |         self,
109 |         name,
110 |         instance_type,
111 |         instance_role,
112 |         user_data_template,
113 |         security_groups=None,
114 |         accelerator="gpu",
115 |         max_spot_price=None,
116 |         ebs_volume_gb=128,
117 |         **user_data_kwargs,
118 |     ):
119 |
120 |         req = {
121 |             "LaunchConfigurationName": name,
122 |             "InstanceType": instance_type,
123 |             "IamInstanceProfile": instance_role,
124 |             "ImageId": self.get_ami_id(Accelerator.from_str(accelerator)),
125 |             "SecurityGroups": security_groups,
126 |             "AssociatePublicIpAddress": True,
127 |             "UserData": self.get_user_data(user_data_template, **user_data_kwargs),
128 |             "BlockDeviceMappings": [
129 |                 {
130 |                     "DeviceName": "/dev/xvda",
131 |                     "Ebs": {
132 |                         "VolumeSize": ebs_volume_gb,
133 |                         "VolumeType": "gp2",
134 |                         "DeleteOnTermination": True,
135 |                     },
136 |                 }
137 |             ],
138 |         }
139 |
140 |         if max_spot_price:
141 |             req["SpotMaxPrice"] = str(max_spot_price)
142 |
143 |         log.info(f"Creating launch config: {name}")
144 |         self._asg.create_launch_configuration(**req)
145 |
146 |     def describe_launch_config(self, name):
147 |         res = self._asg.describe_launch_configurations(LaunchConfigurationNames=[name])
148 |         lcs = res["LaunchConfigurations"]
149 |         return lcs[0] if len(lcs) == 1 else None
150 |
151 |     def delete_launch_config(self, name):
152 |         if self.describe_launch_config(name):
153 |             log.info(f"Deleting asg launch config: {name}")
154 |             self._asg.delete_launch_configuration(LaunchConfigurationName=name)
155 |
156 |     def create_asg(self, name, size, min_size=None, max_size=None, **kwargs):
157 |         """
158 |         Creates an asg.
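        If min_size/max_size are omitted they default to size (a fixed-size group).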
For specifications on kwargs see config/sample_specs.json
159 |         """
160 |
161 |         if not min_size:
162 |             min_size = size
163 |
164 |         if not max_size:
165 |             max_size = size
166 |
167 |         assert min_size <= size <= max_size
168 |
169 |         kwargs["size"] = size
170 |         kwargs["min_size"] = min_size
171 |         kwargs["max_size"] = max_size
172 |         self.create_launch_config(name, **kwargs)
173 |
174 |         log.info(f"Creating autoscaling group: {name}")
175 |         self._asg.create_auto_scaling_group(
176 |             AutoScalingGroupName=name,
177 |             LaunchConfigurationName=name,
178 |             VPCZoneIdentifier=",".join(kwargs["subnets"]),
179 |             MinSize=min_size,
180 |             MaxSize=max_size,
181 |             DesiredCapacity=size,
182 |         )
183 |
184 |     def create_asg_sync(self, name, size, min_size=None, max_size=None, **kwargs):
185 |         self.create_asg(name, size, min_size, max_size, **kwargs)
186 |         _, hostnames = self.get_hostnames(name, size)
187 |         return hostnames
188 |
189 |     def describe_asg(self, name):
190 |         res = self._asg.describe_auto_scaling_groups(AutoScalingGroupNames=[name])
191 |         asgs = res["AutoScalingGroups"]
192 |         num_asgs = len(asgs)
193 |
194 |         return asgs[0] if num_asgs == 1 else None
195 |
196 |     def delete_asg(self, name):
197 |         if self.describe_asg(name):
198 |             log.info(f"Deleting autoscaling group: {name}")
199 |             self._asg.delete_auto_scaling_group(
200 |                 AutoScalingGroupName=name, ForceDelete=True
201 |             )
202 |
203 |             for _ in wait_for(f"instances in {name} to terminate"):
204 |                 if not self.describe_asg(name):
205 |                     log.info(f"Deleted autoscaling group: {name}")
206 |                     break
207 |
208 |         # launch config needs to be deleted after the asg
209 |         self.delete_launch_config(name)
210 |
211 |     def list_hostnames(self, name):
212 |         return self.get_hostnames(name, 1)
213 |
214 |     def get_hostnames(self, name, size):
215 |         """
216 |         Waits until the asg has at least `size` instances in "InService"
217 |         state and returns their public dns names.
218 |         """
219 |         for _ in wait_for(f"autoscaling group: {name} to reach size >= {size}"):
220 |             asg_desc = self.describe_asg(name)
221 |             if not asg_desc:
222 |                 return []
223 |             else:
224 |                 instances = asg_desc["Instances"]
225 |                 ready_instance_ids = [
226 |                     e["InstanceId"]
227 |                     for e in instances
228 |                     if e["LifecycleState"] == "InService"
229 |                 ]
230 |                 if len(ready_instance_ids) >= size:
231 |                     paginator = self._ec2.get_paginator("describe_instances")
232 |
233 |                     hostnames = []
234 |                     instance_ids = []
235 |                     for e in paginator.paginate(InstanceIds=ready_instance_ids):
236 |                         for r in e["Reservations"]:
237 |                             for i in r["Instances"]:
238 |                                 hostnames.append(i["PublicDnsName"])
239 |                                 instance_ids.append(i["InstanceId"])
240 |                     return instance_ids, hostnames
241 |
--------------------------------------------------------------------------------
/aws/cfn/setup.yml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: "2010-09-09"
2 | Description: "cfn template that creates the minimum set of aws resources required for petctl"
3 | Parameters:
4 |   S3BucketName:
5 |     Description: "name of s3 bucket to create for torchelastic use-case"
6 |     Type: "String"
7 |     Default: ""
8 |
9 |   EFSFileSystemId:
10 |     Description: "efs file system id (e.g.
fs-d1234567)" 11 | Type: "String" 12 | Default: "" 13 | 14 | WorkerRoleName: 15 | Description: "name of the worker node iam role and ec2 instance profile" 16 | Type: "String" 17 | Default: "torchelastic_worker_role" 18 | 19 | RendezvousRoleName: 20 | Description: "name of the rendezvous node iam role and ec2 instance profile" 21 | Type: "String" 22 | Default: "torchelastic_rendezvous_role" 23 | 24 | Conditions: 25 | CreateEFSCondition: 26 | Fn::Equals: 27 | - Ref: "EFSFileSystemId" 28 | - "" 29 | 30 | CreateS3BucketCondition: 31 | Fn::Equals: 32 | - Ref: "S3BucketName" 33 | - "" 34 | 35 | Resources: 36 | InternetGateway: 37 | Type: AWS::EC2::InternetGateway 38 | 39 | VPC: 40 | Type: AWS::EC2::VPC 41 | Properties: 42 | CidrBlock: "172.31.0.0/16" 43 | EnableDnsHostnames: True 44 | EnableDnsSupport: True 45 | 46 | VPCGatewayAttachment: 47 | Type: AWS::EC2::VPCGatewayAttachment 48 | Properties: 49 | VpcId: 50 | Ref: "VPC" 51 | InternetGatewayId: 52 | Ref: "InternetGateway" 53 | 54 | RouteTable: 55 | Type: AWS::EC2::RouteTable 56 | Properties: 57 | VpcId: 58 | Ref: "VPC" 59 | 60 | InternetRoute: 61 | Type: AWS::EC2::Route 62 | DependsOn: VPCGatewayAttachment 63 | Properties: 64 | DestinationCidrBlock: 0.0.0.0/0 65 | GatewayId: 66 | Ref: InternetGateway 67 | RouteTableId: 68 | Ref: RouteTable 69 | 70 | Subnet0: 71 | Type: AWS::EC2::Subnet 72 | Properties: 73 | VpcId: 74 | Ref: "VPC" 75 | CidrBlock: "172.31.0.0/20" 76 | MapPublicIpOnLaunch: True 77 | AvailabilityZone: 78 | Fn::Select: 79 | - 0 80 | - Fn::GetAZs: 81 | Ref: AWS::Region 82 | 83 | Subnet1: 84 | Type: AWS::EC2::Subnet 85 | Properties: 86 | VpcId: 87 | Ref: "VPC" 88 | CidrBlock: "172.31.16.0/20" 89 | MapPublicIpOnLaunch: True 90 | AvailabilityZone: 91 | Fn::Select: 92 | - 1 93 | - Fn::GetAZs: 94 | Ref: AWS::Region 95 | 96 | SubnetRouteTableAssociation0: 97 | Type: AWS::EC2::SubnetRouteTableAssociation 98 | Properties: 99 | RouteTableId: 100 | Ref: "RouteTable" 101 | SubnetId: 102 | Ref: "Subnet0" 103 | 104 | SubnetRouteTableAssociation1: 105 | Type: AWS::EC2::SubnetRouteTableAssociation 106 | Properties: 107 | RouteTableId: 108 | Ref: "RouteTable" 109 | SubnetId: 110 | Ref: "Subnet1" 111 | 112 | InstanceSecurityGroup: 113 | Type: AWS::EC2::SecurityGroup 114 | Properties: 115 | GroupDescription: "security group for ec2 instances in the VPC" 116 | GroupName: "torchelastic instance security group" 117 | VpcId: 118 | Ref: "VPC" 119 | SecurityGroupIngress: 120 | - Description: "allow ssh" 121 | IpProtocol: "tcp" 122 | FromPort: "22" 123 | ToPort: "22" 124 | CidrIp: "0.0.0.0/0" 125 | - Description: "allow ssh" 126 | IpProtocol: "tcp" 127 | FromPort: "22" 128 | ToPort: "22" 129 | CidrIpv6: "::/0" 130 | SecurityGroupEgress: 131 | - Description: "* egress" 132 | IpProtocol: "-1" 133 | CidrIp: "0.0.0.0/0" 134 | 135 | SecurityGroupIngress: 136 | Type: AWS::EC2::SecurityGroupIngress 137 | Properties: 138 | Description: "* ingress within the same security group" 139 | GroupId: 140 | Ref: "InstanceSecurityGroup" 141 | IpProtocol: "-1" 142 | SourceSecurityGroupId: 143 | Ref: "InstanceSecurityGroup" 144 | 145 | EFSSecurityGroup: 146 | Type: AWS::EC2::SecurityGroup 147 | Properties: 148 | GroupDescription: "security group for efs mount targets in the VPC" 149 | GroupName: "torchelastic efs security group" 150 | VpcId: 151 | Ref: "VPC" 152 | SecurityGroupIngress: 153 | - Description: "allow NFS from ec2" 154 | IpProtocol: "tcp" 155 | FromPort: "2049" 156 | ToPort: "2049" 157 | SourceSecurityGroupId: 158 | Ref: "InstanceSecurityGroup" 159 | 160 | EFS: 
161 | Type: AWS::EFS::FileSystem 162 | Condition: "CreateEFSCondition" 163 | 164 | EFSMountTarget0: 165 | Type: AWS::EFS::MountTarget 166 | Properties: 167 | FileSystemId: 168 | Fn::If: 169 | - "CreateEFSCondition" 170 | - Ref: "EFS" 171 | - Ref: "EFSFileSystemId" 172 | SubnetId: 173 | Ref: "Subnet0" 174 | SecurityGroups: 175 | - Ref: "EFSSecurityGroup" 176 | 177 | EFSMountTarget1: 178 | Type: AWS::EFS::MountTarget 179 | Properties: 180 | FileSystemId: 181 | Fn::If: 182 | - "CreateEFSCondition" 183 | - Ref: "EFS" 184 | - Ref: "EFSFileSystemId" 185 | SubnetId: 186 | Ref: "Subnet1" 187 | SecurityGroups: 188 | - Ref: "EFSSecurityGroup" 189 | 190 | S3Bucket: 191 | Type: AWS::S3::Bucket 192 | Condition: "CreateS3BucketCondition" 193 | 194 | InstanceProfileWorker: 195 | Type: AWS::IAM::InstanceProfile 196 | Properties: 197 | InstanceProfileName: 198 | Ref: "WorkerRoleName" 199 | Roles: 200 | - Ref: "IAMRoleWorker" 201 | 202 | IAMRoleWorker: 203 | Type: AWS::IAM::Role 204 | Properties: 205 | RoleName: 206 | Ref: "WorkerRoleName" 207 | AssumeRolePolicyDocument: 208 | Version: "2012-10-17" 209 | Statement: 210 | - Effect: "Allow" 211 | Principal: 212 | Service: 213 | - "ec2.amazonaws.com" 214 | Action: 215 | - "sts:AssumeRole" 216 | Path: "/" 217 | ManagedPolicyArns: 218 | - "arn:aws:iam::aws:policy/AmazonS3FullAccess" 219 | - "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" 220 | - "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" 221 | - Ref: "ContainerCloudWatchLogsPolicy" 222 | 223 | InstanceProfileRendezvous: 224 | Type: AWS::IAM::InstanceProfile 225 | Properties: 226 | InstanceProfileName: 227 | Ref: "RendezvousRoleName" 228 | Roles: 229 | - Ref: "IAMRoleRendezvous" 230 | 231 | IAMRoleRendezvous: 232 | Type: AWS::IAM::Role 233 | Properties: 234 | RoleName: 235 | Ref: "RendezvousRoleName" 236 | AssumeRolePolicyDocument: 237 | Version: "2012-10-17" 238 | Statement: 239 | - Effect: "Allow" 240 | Principal: 241 | Service: 242 | - "ec2.amazonaws.com" 243 | Action: 244 | - "sts:AssumeRole" 245 | Path: "/" 246 | ManagedPolicyArns: 247 | - "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" 248 | - "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" 249 | - Ref: "ContainerCloudWatchLogsPolicy" 250 | 251 | ContainerCloudWatchLogsPolicy: 252 | Type: AWS::IAM::ManagedPolicy 253 | Properties: 254 | Description: "Allows container instances to use CloudWatch APIs" 255 | Path: "/" 256 | PolicyDocument: 257 | Version: "2012-10-17" 258 | Statement: 259 | - Effect: Allow 260 | Action: 261 | - "logs:CreateLogGroup" 262 | - "logs:CreateLogStream" 263 | - "logs:PutLogEvents" 264 | - "logs:DescribeLogStreams" 265 | Resource: 266 | - "arn:aws:logs:*:*:*" 267 | Outputs: 268 | VPCId: 269 | Value: 270 | Ref: "VPC" 271 | 272 | SubnetId0: 273 | Value: 274 | Ref: "Subnet0" 275 | 276 | SubnetId1: 277 | Value: 278 | Ref: "Subnet1" 279 | 280 | SecurityGroupId: 281 | Value: 282 | Ref: "InstanceSecurityGroup" 283 | 284 | EFSId: 285 | Value: 286 | Fn::If: 287 | - "CreateEFSCondition" 288 | - Ref: "EFS" 289 | - Ref: "EFSFileSystemId" 290 | 291 | S3Bucket: 292 | Value: 293 | Fn::If: 294 | - "CreateS3BucketCondition" 295 | - Ref: "S3Bucket" 296 | - Ref: "S3BucketName" 297 | 298 | WorkerInstanceProfile: 299 | Value: 300 | Ref: "InstanceProfileWorker" 301 | 302 | RendezvousInstanceProfile: 303 | Value: 304 | Ref: "InstanceProfileRendezvous" 305 | 306 | 307 | -------------------------------------------------------------------------------- /aws/cloudformation.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | # All rights reserved.
5 | #
6 | # This source code is licensed under the BSD-style license found in the
7 | # LICENSE file in the root directory of this source tree.
8 |
9 |
10 | import getpass
11 | import logging
12 | import os
13 | import random
14 | import string
15 |
16 | from jinja2 import Template
17 | from util import wait_for
18 |
19 |
20 | log = logging.getLogger(__name__)
21 |
22 |
23 | class CloudFormation:
24 |     def __init__(self, session):
25 |         self._session = session
26 |         self._cfn = session.client("cloudformation")
27 |
28 |     def create_specs_file(self, specs_file, s3_bucket_name, efs_id):
29 |         username = getpass.getuser()
30 |         rand = "".join(random.choices(string.ascii_uppercase + string.digits, k=5))
31 |         hash = f"{username}-{rand}"
32 |         stack_name = f"torchelastic-{hash}"
33 |         this_dir = os.path.dirname(__file__)
34 |         cfn_template = os.path.join(this_dir, "cfn/setup.yml")
35 |         sample_specs = os.path.join(this_dir, "config/sample_specs.json")
36 |
37 |         params = {
38 |             "WorkerRoleName": f"torchelastic_worker_role-{hash}",
39 |             "RendezvousRoleName": f"torchelastic_rendezvous_role-{hash}",
40 |         }
41 |
42 |         if s3_bucket_name:
43 |             params["S3BucketName"] = s3_bucket_name
44 |         if efs_id:
45 |             params["EFSFileSystemId"] = efs_id
46 |
47 |         self.create_stack(stack_name, cfn_template, **params)
48 |
49 |         for _ in wait_for(
50 |             f"cfn stack: {stack_name} to create", timeout=600, interval=2
51 |         ):
52 |             status, outputs = self.describe_stack(stack_name)
53 |             if status == "CREATE_COMPLETE":
54 |                 break
55 |             elif status == "CREATE_FAILED" or status.startswith("ROLLBACK_"):
56 |                 # when stack creation fails cfn starts rolling the stack back
57 |                 raise RuntimeError(
58 |                     f"Error creating stack {stack_name}, status = {status}"
59 |                 )
60 |
61 |         outputs["User"] = username
62 |
63 |         log.info(f"Writing specs file to: {specs_file}")
64 |         with open(sample_specs) as f:
65 |             specs_template = Template(f.read())
66 |             specs_template.stream(**outputs).dump(specs_file)
67 |
68 |     def describe_stack(self, stack_name):
69 |         describe_res = self._cfn.describe_stacks(StackName=stack_name)
70 |
71 |         stacks = describe_res["Stacks"]
72 |         if len(stacks) > 1:
73 |             raise RuntimeError(f"Found more than one stack with name {stack_name}")
74 |
75 |         stack_desc = stacks[0]
76 |         status = stack_desc["StackStatus"]
77 |
78 |         # cfn outputs an array of maps; each element in the array is
79 |         # a single output of the form {"OutputKey": <key>, "OutputValue": <value>};
80 |         # simplify it to a map of <key>: <value> pairs
81 |         outputs = {}
82 |         if "Outputs" in stack_desc:
83 |             for cfn_output in stack_desc["Outputs"]:
84 |                 key = cfn_output["OutputKey"]
85 |                 value = cfn_output["OutputValue"]
86 |                 outputs[key] = value
87 |         return status, outputs
88 |
89 |     def create_stack(self, stack_name, cfn_template, **params):
90 |         log.info(f"Creating cloudformation stack with template: {cfn_template}")
91 |
92 |         with open(cfn_template) as f:
93 |             template_body = f.read()
94 |
95 |         cfn_parameters = []
96 |         for key, value in params.items():
97 |             cfn_parameters.append({"ParameterKey": key, "ParameterValue": value})
98 |
99 |         res = self._cfn.create_stack(
100 |             StackName=stack_name,
101 |             TemplateBody=template_body,
102 |             Capabilities=["CAPABILITY_NAMED_IAM"],
103 |             Parameters=cfn_parameters,
104 |         )
105 |
106 |         return res["StackId"]
107 |
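Note: a minimal sketch of how the `CloudFormation` helper above might be driven end to end (the region value and output path are illustrative assumptions; the flat `from cloudformation import ...` style mirrors how `aws/autoscaling.py` imports `util`):

```python
# Hypothetical driver, run from within the aws/ directory with AWS
# credentials available via the default boto3 lookup chain.
import logging

from auth import get_session  # resolves to aws/auth/__init__.py
from cloudformation import CloudFormation

logging.basicConfig(level=logging.INFO)

session = get_session("us-west-2")  # region is an example value
cfn = CloudFormation(session)

# Creates the cfn/setup.yml stack and renders config/sample_specs.json
# into a concrete specs file for petctl. Passing None for both the
# bucket name and the EFS id lets the template create new resources.
cfn.create_specs_file("/tmp/petctl_specs.json", s3_bucket_name=None, efs_id=None)
```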
-------------------------------------------------------------------------------- /aws/config/sample_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "rdzv": { 3 | "instance_type" : "t2.small", 4 | "accelerator" : "none", 5 | "instance_role" : "{{ RendezvousInstanceProfile }}", 6 | "subnets" : [ 7 | "{{ SubnetId0 }}" 8 | ], 9 | "security_groups" : [ 10 | "{{ SecurityGroupId }}" 11 | ], 12 | "user_data_template": "config/user_data_rdzv", 13 | "ebs_volume_gb" : 64 14 | }, 15 | "worker": { 16 | "instance_type": "p3.2xlarge", 17 | "accelerator" : "gpu", 18 | "instance_role" : "{{ WorkerInstanceProfile }}", 19 | "efs_file_system_id" : "{{ EFSId }}", 20 | "subnets" : [ 21 | "{{ SubnetId0 }}", 22 | "{{ SubnetId1 }}" 23 | ], 24 | "security_groups" : [ 25 | "{{ SecurityGroupId }}" 26 | ], 27 | 28 | "user_data_template" : "config/user_data_worker", 29 | "ebs_volume_gb" : 64, 30 | "docker_image" : "torchelastic/examples:0.1.0rc1", 31 | "s3_bucket" : "{{ S3Bucket }}", 32 | "s3_prefix" : "petctl/{{ User }}" 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /aws/config/user_data_rdzv: -------------------------------------------------------------------------------- 1 | Content-Type: multipart/mixed; boundary="//" 2 | MIME-Version: 1.0 3 | 4 | --// 5 | Content-Type: text/cloud-config; charset="us-ascii" 6 | MIME-Version: 1.0 7 | Content-Transfer-Encoding: 7bit 8 | Content-Disposition: attachment; filename="cloud-config.txt" 9 | 10 | #cloud-config 11 | repo_update: true 12 | repo_upgrade: all 13 | cloud_final_modules: 14 | - [scripts-user, always] 15 | runcmd: 16 | - yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm 17 | - systemctl restart amazon-ssm-agent 18 | 19 | --// 20 | Content-Type: text/x-shellscript; charset="us-ascii" 21 | MIME-Version: 1.0 22 | Content-Transfer-Encoding: 7bit 23 | Content-Disposition: attachment; filename="userdata.txt" 24 | 25 | #!/bin/bash 26 | #------------------------- 27 | # Install etcd 28 | #------------------------- 29 | ETCD_VER=v3.4.3 30 | BASE_DOWNLOAD_URL=https://github.com/etcd-io/etcd/releases/download 31 | DOWNLOAD_URL="${BASE_DOWNLOAD_URL}/${ETCD_VER}/etcd-${ETCD_VER}-linux-amd64.tar.gz" 32 | 33 | TMP_DIR="/tmp/etcd-${ETCD_VER}" 34 | INSTALL_DIR="/opt/etcd" 35 | BIN_DIR="${INSTALL_DIR}/bin" 36 | 37 | mkdir -p "${TMP_DIR}" 38 | mkdir -p "${BIN_DIR}" 39 | 40 | echo "etcd-${ETCD_VER}" > "${INSTALL_DIR}/version.info" 41 | 42 | echo "Downloading pre-built etcd binary from ${DOWNLOAD_URL}" 43 | curl -L "${DOWNLOAD_URL}" -o "${TMP_DIR}/etcd-${ETCD_VER}-linux-amd64.tar.gz" 44 | tar xzf "${TMP_DIR}/etcd-${ETCD_VER}-linux-amd64.tar.gz" -C "${TMP_DIR}" --strip-components=1 45 | 46 | echo "Installing etcd into ${INSTALL_DIR}" 47 | cp -p "${TMP_DIR}/etcd" "${BIN_DIR}" 48 | cp -p "${TMP_DIR}/etcdctl" "${BIN_DIR}" 49 | 50 | rm -rf "${TMP_DIR}" 51 | 52 | #------------------------- 53 | # Add etcd to systemctl 54 | #------------------------- 55 | PUBLIC_HOSTNAME=$(curl http://169.254.169.254/latest/meta-data/public-hostname) 56 | 57 | cat > /etc/etcd.conf < /etc/systemd/system/etcd.service <> /etc/fstab 20 | - mount -a -t efs 21 | - chmod 777 ${efs_mount_point} 22 | - mkdir -p /var/torchelastic 23 | - yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm 24 | - systemctl restart amazon-ssm-agent 25 | --// 26 | Content-Type: text/x-shellscript; charset="us-ascii" 
27 | MIME-Version: 1.0 28 | Content-Transfer-Encoding: 7bit 29 | Content-Disposition: attachment; filename="userdata.txt" 30 | 31 | cat > /var/torchelastic/ecr_login <<\EOL 32 | #!/bin/bash 33 | region=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq .region -r) 34 | $(aws ecr get-login --no-include-email --region ${region}) 35 | EOL 36 | 37 | cat > /var/torchelastic/worker.env <<\EOL 38 | RDZV_ENDPOINT={{ rdzv_endpoint }} 39 | JOB_ID={{ job_name }} 40 | MIN_SIZE={{ min_size }} 41 | MAX_SIZE={{ max_size }} 42 | SIZE={{ size }} 43 | EOL 44 | 45 | cat > /var/torchelastic/run_worker <<\EOL 46 | #!/bin/bash 47 | container_name=$1 48 | shift 49 | 50 | region=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq .region -r) 51 | instance_id=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) 52 | 53 | docker run \ 54 | --init \ 55 | --net=host \ 56 | --restart=on-failure \ 57 | --shm-size=32g \ 58 | --env-file /var/torchelastic/worker.env \ 59 | -v /mnt/efs/fs1:/mnt/efs/fs1 \ 60 | --name ${container_name} \ 61 | --log-driver=awslogs \ 62 | --log-opt awslogs-region=${region} \ 63 | --log-opt awslogs-group=torchelastic/{{ user }} \ 64 | --log-opt awslogs-create-group=true \ 65 | --log-opt awslogs-stream=${container_name}/${instance_id} \ 66 | {{ docker_image }} $* 67 | EOL 68 | 69 | chmod 755 /var/torchelastic/ecr_login 70 | chmod 755 /var/torchelastic/run_worker 71 | 72 | cat > /etc/systemd/system/torchelastic_worker.service <<\EOL 73 | [Unit] 74 | Description=torchelastic worker 75 | Documentation=https://github.com/pytorch/torchelastic 76 | After=docker.service 77 | Requires=docker.service 78 | 79 | [Service] 80 | Type=exec 81 | ExecStartPre=-/var/torchelastic/ecr_login 82 | ExecStart=/var/torchelastic/run_worker {{ job_name }} {{ script }} {{ args }} 83 | ExecStop=-/usr/bin/docker kill {{ job_name }} 84 | ExecStopPost=-/usr/bin/docker rm -f {{ job_name }} 85 | Restart=no 86 | LimitNOFILE=40000 87 | KillMode=control-group 88 | 89 | [Install] 90 | WantedBy=multi-user.target 91 | EOL 92 | 93 | #------------------------- 94 | # Enable and start worker 95 | #------------------------- 96 | systemctl enable torchelastic_worker 97 | systemctl start torchelastic_worker 98 | --// 99 | -------------------------------------------------------------------------------- /aws/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.9.148 2 | jinja2>=2.10 3 | -------------------------------------------------------------------------------- /aws/s3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | 9 | import logging 10 | import os 11 | import shutil 12 | import tarfile as tar 13 | import tempfile 14 | 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class S3: 20 | def __init__(self, session): 21 | self._session = session 22 | self._s3 = session.client("s3") 23 | 24 | def cp(self, target_path, bucket, key): 25 | """ 26 | Uploads target_path to s3://bucket/key. If the target_path is a file 27 | then uploads to s3://bucket/key/file_name, if the target_path is a 28 | directory, then a tarball is created with the contents of target_path 29 | and uploaded to s3://bucket/key/dir_name.tar.gz. 
The tar is created as
30 |         if by running the command:
31 |
32 |             cd target_path && tar czf /tmp/$(basename target_path).tar.gz *
33 |
34 |         Returns the destination S3 URL.
35 |         """
36 |
37 |         target_basename = os.path.basename(target_path)
38 |
39 |         if os.path.isdir(target_path):
40 |             tmpdir = tempfile.mkdtemp(prefix="petctl_")
41 |             tar_basename = f"{target_basename}.tar.gz"
42 |             tar_file = os.path.join(tmpdir, tar_basename)
43 |             log.info(f"Compressing {target_path} into {tar_basename}")
44 |             with tar.open(tar_file, "x:gz") as f:
45 |                 f.add(target_path, arcname="", recursive=True)
46 |
47 |             dest_key = f"{key}/{tar_basename}"
48 |             target_file = tar_file
49 |         else:
50 |             tmpdir = None
51 |             dest_key = f"{key}/{target_basename}"
52 |             target_file = target_path
53 |
54 |         log.info(f"Uploading {target_file} to s3://{bucket}/{dest_key}")
55 |         self._s3.upload_file(target_file, bucket, dest_key)
56 |
57 |         if tmpdir:
58 |             log.info(f"Deleting tmp dir: {tmpdir}")
59 |             shutil.rmtree(tmpdir)
60 |         return f"s3://{bucket}/{dest_key}"
61 |
--------------------------------------------------------------------------------
/aws/util.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) Facebook, Inc. and its affiliates.
4 | # All rights reserved.
5 | #
6 | # This source code is licensed under the BSD-style license found in the
7 | # LICENSE file in the root directory of this source tree.
8 |
9 | import sys
10 | import time
11 |
12 |
13 | def wait_for(msg, timeout: float = 300, interval: int = 1, print_spinner: bool = True):
14 |     """
15 |     for _ in wait_for("asg to provision", timeout_sec, interval_sec):
16 |         if check_condition():
17 |             break
18 |     """
19 |     spin = ["-", "/", "|", "\\", "-", "/", "|", "\\"]
20 |     idx = 0
21 |     start = time.time()
22 |     max_time = start + timeout
23 |     while True:
24 |         if print_spinner:
25 |             elapsed = time.time() - start
26 |             print(
27 |                 f"Waiting for {msg}"
28 |                 f" ({elapsed:03.0f}/{timeout:3.0f}s elapsed) {spin[idx]}\r",
29 |                 end="",
30 |             )
31 |             sys.stdout.flush()
32 |             idx = (idx + 1) % len(spin)
33 |
34 |         if time.time() >= max_time:
35 |             raise RuntimeError(f"Timed out while waiting for: {msg}")
36 |         else:
37 |             time.sleep(interval)
38 |             yield
39 |
--------------------------------------------------------------------------------
/azure/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Elastic on Azure
2 | This directory contains scripts and libraries that help users run PyTorch Elastic jobs on Azure.
3 |
4 | ## Prerequisites
5 | 1. Familiarity with [Azure](https://azure.microsoft.com/en-us/), [aks-engine](https://github.com/Azure/aks-engine), [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/)
6 | 2. Quota available for Standard_DS1_v2 instances and Standard_NC6s_v3 instances.
7 | 3. Access to an Azure subscription, Resource Group and Storage account. (Refer [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) for setup instructions.)
8 | 4. Run the Azure login command `az login`.
9 |
10 | # Sample Usage
11 |
12 | 1. #### Configure your job yaml and kubernetes.json
13 | ```
14 | python petctl.py configure --name "test_job" --min_size 1 --max_size 5
15 | ```
16 | This will create specs for the aks-engine instances and the training job. The aks-engine launch [spec](config/kubernetes.json) is a simple JSON file that specifies the count and type of Rendezvous and Worker instances.
A training job [spec](config/sample_specs.yaml) file is created with the specified job name and min, max worker count.
17 | By default the master node uses a Standard_DS1_v2 instance and the worker nodes use Standard_NC6s_v3 instances. Other Azure instance types can be specified using the --master_vm and --worker_vm options.
18 |
19 | 2. #### Setup your Kubernetes cluster
20 |
21 | This step requires a service principal to create the aks cluster.
22 | Instructions for generating a service principal can be found at [portal](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal), [CLI](https://docs.microsoft.com/en-us/cli/azure/create-an-azure-service-principal-azure-cli?view=azure-cli-latest), [Powershell](https://docs.microsoft.com/en-us/powershell/azure/create-azure-service-principal-azureps).
23 | ```
24 | python petctl.py setup --dns_prefix azure-pytorch-elastic
25 |                        --rg "<resource-group>"
26 |                        --location "<location>"
27 |                        --subscription_id <subscription-id>
28 |                        --client_id <client-id>
29 |                        --client_secret <client-secret>
30 | ```
31 | This creates an Azure Kubernetes cluster with 1 Standard_DS1_v2 master instance and the specified number of Standard_NC6s_v3 worker instances in the resource group created in [Prerequisites](#Prerequisites) #3.
32 |
33 | 3. #### Upload to Azure Blob storage
34 |
35 | This is an optional step to upload code and data to Azure Blob storage. It can be skipped if the training script and data are already available in Azure Blob storage.
36 | ```
37 | python petctl.py upload_storage --source_path <source-path>
38 |                                 --account_name <account-name>
39 |                                 --container_name <container-name>
40 |                                 --sas_token <sas-token>
41 | ```
42 | Instructions to generate a SAS token are available [here](https://adamtheautomator.com/azure-sas-token/).
43 |
44 | 4. #### Generate Storage and Docker Image secrets
45 |
46 | This step requires the user's blob storage account and docker image details.
47 | Instructions for accessing storage account keys can be found at [portal](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-keys-manage), [CLI](https://docs.microsoft.com/en-us/cli/azure/storage/account/keys)
48 |
49 | ##### Generate Storage secret
50 | ```
51 | python petctl.py storage_secret --account_name <account-name>
52 |                                 --account_key "<account-key>"
53 | ```
54 | ##### Generate Docker image secret
55 | ```
56 | python petctl.py docker_secret --server <server>
57 |                                --username <username>
58 |                                --password <password>
59 |                                --image_name <image-name>
60 | ```
61 |
62 | The training job file is updated to mount the user's storage account onto the worker instances and to apply the user-provided docker image.
63 | The base docker image for running PyTorch Elastic on Azure is at [Dockerfile](config/Dockerfile). Instructions on publishing a docker image to Azure Container Registry can be found at [ACR](https://docs.microsoft.com/en-us/azure/container-registry/container-registry-get-started-docker-cli).
64 | Docker image secret generation can be skipped when running the [imagenet](../../examples/imagenet/main.py) example, as the job specs yaml is already populated with a public AzureML image with PyTorch Elastic support.
65 |
66 | 5. #### Start your training job
67 |
68 | Submit the training job.
69 | ```
70 | python petctl.py run_job
71 | ```
72 | To run the provided imagenet example, the training script and data can be uploaded to Azure Blob storage by running
73 | ```
74 | python petctl.py upload_storage --source_path ../examples/imagenet/main.py
75 |                                 --account_name <account-name>
76 |                                 --container_name code
77 |                                 --sas_token <sas-token>
78 | ```
79 | ```
80 | python petctl.py upload_storage --source_path <data-path>
81 |                                 --account_name <account-name>
82 |                                 --container_name data
83 |                                 --sas_token <sas-token>
84 | ```
85 |
86 | 6.
#### Check status of your job
87 | ```
88 | python petctl.py check_status
89 | ```
90 | 7. #### Scale worker instances
91 | ```
92 | python petctl.py scale --rg "<resource-group>"
93 |                        --location "<location>"
94 |                        --subscription_id <subscription-id>
95 |                        --client_id <client-id>
96 |                        --client_secret <client-secret>
97 |                        --new_node_count <new-node-count>
98 | ```
99 | (Here the subscription id and resource group are the ones set up in [Prerequisites](#Prerequisites) #3.)
100 |
101 | 8. #### Delete resources
102 | ```
103 | python petctl.py delete_resources
104 | ```
105 | This deletes the aks-engine cluster and all associated namespaces and secrets.
106 |
--------------------------------------------------------------------------------
/azure/config/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM mcr.microsoft.com/azureml/base-gpu:openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04
2 |
3 | RUN apt-get update -y && \
4 |     apt-get install -y curl
5 |
6 | RUN ldconfig /usr/local/cuda/lib64/stubs && \
7 |     conda install -y conda=4.6.14 python=3.6.2 && conda clean -ay && \
8 |     # Install AzureML SDK
9 |     pip install --no-cache-dir azureml-defaults && \
10 |     pip install pyyaml && \
11 |     # Install PyTorch
12 |     pip install torch==1.5.0 && \
13 |     pip install torchvision==0.6.0 && \
14 |     pip install --no-cache-dir mkl==2018.0.3 && \
15 |     ldconfig && \
16 |     pip install tensorboard==1.14.0 && \
17 |     pip install future==0.17.1 && \
18 |     pip install python-etcd==0.4.5 && \
19 |     pip install torchelastic
20 |
--------------------------------------------------------------------------------
/azure/config/kubernetes.json:
--------------------------------------------------------------------------------
1 | {
2 |   "apiVersion": "vlabs",
3 |   "properties": {
4 |     "orchestratorProfile": {
5 |       "orchestratorType": "Kubernetes",
6 |       "kubernetesConfig": {
7 |         "apiServerConfig": {
8 |           "--tls-cipher-suites": "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256"
9 |         }
10 |       }
11 |     },
12 |     "masterProfile": {
13 |       "count": 1,
14 |       "dnsPrefix": "",
15 |       "vmSize": "Standard_DS1_v2"
16 |     },
17 |     "agentPoolProfiles": [
18 |       {
19 |         "name": "agentpool1",
20 |         "count": 1,
21 |         "vmSize": "Standard_NC6s_v3"
22 |       }
23 |     ],
24 |     "linuxProfile": {
25 |       "adminUsername": "azureuser",
26 |       "ssh": {
27 |         "publicKeys": [
28 |           {}
29 |         ]
30 |       }
31 |     },
32 |     "servicePrincipalProfile": {
33 |       "clientId": "",
34 |       "secret": ""
35 |     }
36 |   }
37 | }
38 |
--------------------------------------------------------------------------------
/azure/config/sample_specs.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 |   labels:
5 |     app: azure-pytorch-elastic
6 |   name: azure-pytorch-elastic
7 | spec:
8 |   template:
9 |     metadata:
10 |       labels:
11 |         app: azure-pytorch-elastic
12 |     spec:
13 |       containers:
14 |       - name: petimage
15 |         image: mcr.microsoft.com/azureml/elastic:pytorch-elastic-openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04
16 |         command: ["/bin/bash", "-c"]
17 |         args: ["python /mnt/azure/pet/code/imagenet/main.py --input_path /mnt/azure/pet/data/train/"]
18 |         imagePullPolicy: Always
19 |         env:
20 |         - name: RDZV_ENDPOINT
21 |           value: 10.255.255.5:2379
22 |         - name: ETCD_PROTOCOL
23 |           value: https
24 |         - name: ETCD_CACERT
25 |           value: /etc/kubernetes/certs/ca.crt
26 |         - name: ETCD_CERT
27 |           value: /etc/kubernetes/certs/client.crt
28 |         - name: ETCD_KEY
29 |           value: /etc/kubernetes/certs/client.key
30 |
30 |           resources:
31 |             limits:
32 |               nvidia.com/gpu: 1
33 |           volumeMounts:
34 |             - name: pet
35 |               mountPath: /mnt/azure/pet
36 |             - name: etc
37 |               mountPath: /etc/kubernetes/certs
38 |       imagePullSecrets:
39 |         - name: pet-docker-secret
40 |       volumes:
41 |         - name: pet
42 |           flexVolume:
43 |             driver: "azure/blobfuse"
44 |             readOnly: true
45 |             secretRef:
46 |               name: pet-blob-secret
47 |             options:
48 |               container: petimagenet
49 |         - name: etc
50 |           hostPath:
51 |             path: /etc/kubernetes/certs
52 |       restartPolicy: Never
53 | 
--------------------------------------------------------------------------------
/azure/petctl.py:
--------------------------------------------------------------------------------
  1 | from __future__ import absolute_import, division, print_function, unicode_literals
  2 | 
  3 | import util
  4 | 
  5 | 
  6 | # Create Kubernetes specs and a YAML job file based on user inputs
  7 | def configure(args):
  8 |     util.configure_yaml(args)
  9 |     util.configure_json(args)
 10 | 
 11 | 
 12 | # Deploys a Kubernetes cluster
 13 | def setup(args):
 14 |     # Install AKS Engine
 15 |     util.install_aks_engine()
 16 |     # Deploy an AKS cluster using kubernetes.json
 17 |     util.deploy_aks_cluster(args)
 18 | 
 19 | 
 20 | # Upload code/data to Azure blob storage
 21 | def upload_storage(args):
 22 |     util.upload_to_azure_blob(args)
 23 | 
 24 | 
 25 | # Create Azure blob storage secret
 26 | def storage_secret(args):
 27 |     util.create_storage_secrets(args)
 28 | 
 29 | 
 30 | # Create docker image secrets
 31 | def docker_secret(args):
 32 |     util.create_docker_image_secret(args)
 33 | 
 34 | 
 35 | # Scale the cluster
 36 | def scale_cluster(args):
 37 |     util.scale_cluster(args)
 38 | 
 39 | 
 40 | # Submits your training job
 41 | def run_job(args):
 42 |     util.install_blobfuse_drivers()
 43 |     commands = [
 44 |         "kubectl delete -f config/azure-pytorch-elastic.yaml",
 45 |         "kubectl apply -f config/azure-pytorch-elastic.yaml",
 46 |         "kubectl describe pods",
 47 |         "kubectl get pods --selector app=azure-pytorch-elastic",
 48 |     ]
 49 | 
 50 |     util.run_commands(commands)
 51 | 
 52 | 
 53 | # Check current status of your pods
 54 | def check_status():
 55 |     commands = [
 56 |         "kubectl describe pods",
 57 |         "kubectl get pods --selector app=azure-pytorch-elastic",
 58 |     ]
 59 | 
 60 |     util.run_commands(commands)
 61 | 
 62 | 
 63 | # Get logs of your job from each pod
 64 | def get_logs():
 65 |     util.run_commands(["kubectl logs --selector app=azure-pytorch-elastic "])
 66 | 
 67 | 
 68 | # Deletes secrets and cluster
 69 | def delete_resources():
 70 |     util.delete_resources_util()
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 |     parser = util.argparse.ArgumentParser()
 75 | 
 76 |     subparser = parser.add_subparsers(
 77 |         title="actions", description="setup | configure | run job", dest="command"
 78 |     )
 79 | 
 80 |     # ---------------------------------- #
 81 |     #               SETUP                #
 82 |     # ---------------------------------- #
 83 | 
 84 |     parser_setup = subparser.add_parser(
 85 |         "setup", help="set up aks-engine, cluster and other dependencies"
 86 |     )
 87 | 
 88 |     parser_setup.add_argument(
 89 |         "--dns_prefix",
 90 |         type=str,
 91 |         required=False,
 92 |         default="azure-pytorch-elastic",
 93 |         help="DNS prefix of the app",
 94 |     )
 95 | 
 96 |     parser_setup.add_argument(
 97 |         "--subscription_id",
 98 |         type=str,
 99 |         required=True,
100 |         help="Subscription id of the cluster",
101 |     )
102 | 
103 |     parser_setup.add_argument(
104 |         "--rg", type=str, required=True, help="Resource group of the cluster"
105 |     )
106 | 
107 |     parser_setup.add_argument(
108 |         "--location", type=str, required=True, help="Location of the cluster"
109 |     )
110 | 
111 |     parser_setup.add_argument(
112 |         "--client_id", type=str, required=True, help="Service principal client id"
113 |     )
114 | 
115 |     parser_setup.add_argument(
116 |         "--client_secret",
117 |         type=str,
118 |         required=True,
119 |         help="Service Principal client secret",
120 |     )
121 | 
122 |     parser_setup.set_defaults(func=setup)
123 | 
124 |     # ---------------------------------- #
125 |     #        CONFIGURE JOB YAML          #
126 |     # ---------------------------------- #
127 | 
128 |     parser_configure = subparser.add_parser("configure", help="Generate yaml job file")
129 | 
130 |     parser_configure.add_argument("--name", required=True, help="job name")
131 |     parser_configure.add_argument(
132 |         "--min_size",
133 |         type=int,
134 |         required=False,
135 |         help="minimum number of worker hosts to continue training",
136 |     )
137 |     parser_configure.add_argument(
138 |         "--max_size",
139 |         type=int,
140 |         required=False,
141 |         help="maximum number of worker hosts to allow scaling out",
142 |     )
143 |     parser_configure.add_argument(
144 |         "--size",
145 |         type=int,
146 |         required=False,
147 |         help="set size to automatically set min_size = max_size = size",
148 |     )
149 |     parser_configure.add_argument(
150 |         "--master_vm",
151 |         type=str,
152 |         required=False,
153 |         default="Standard_DS1_v2",
154 |         help="Azure VM instance for master node",
155 |     )
156 |     parser_configure.add_argument(
157 |         "--worker_vm",
158 |         type=str,
159 |         required=False,
160 |         default="Standard_NC6s_v3",
161 |         help="Azure VM instance for worker nodes",
162 |     )
163 |     parser_configure.set_defaults(func=configure)
164 | 
165 |     # ---------------------------------- #
166 |     #          UPLOAD STORAGE            #
167 |     # ---------------------------------- #
168 | 
169 |     parser_upload_storage = subparser.add_parser(
170 |         "upload_storage", help="Upload to Azure Blob storage"
171 |     )
172 | 
173 |     parser_upload_storage.add_argument(
174 |         "--account_name",
175 |         type=str,
176 |         required=True,
177 |         help="Azure Blob storage Account name",
178 |     )
179 | 
180 |     parser_upload_storage.add_argument(
181 |         "--container_name",
182 |         type=str,
183 |         required=True,
184 |         help="Azure Blob storage container name",
185 |     )
186 | 
187 |     parser_upload_storage.add_argument(
188 |         "--sas_token", type=str, required=True, help="Azure Blob storage SAS token"
189 |     )
190 | 
191 |     parser_upload_storage.add_argument(
192 |         "--source_path", type=str, required=True, help="Path to local files"
193 |     )
194 | 
195 |     parser_upload_storage.set_defaults(func=upload_storage)
196 | 
197 |     # ---------------------------------- #
198 |     #          SETUP SECRETS             #
199 |     # ---------------------------------- #
200 | 
201 |     parser_storage_secret = subparser.add_parser(
202 |         "storage_secret", help="Generate secret for Azure Blob storage"
203 |     )
204 | 
205 |     parser_storage_secret.add_argument(
206 |         "--account_name",
207 |         type=str,
208 |         required=True,
209 |         help="Azure Blob storage account name",
210 |     )
211 | 
212 |     parser_storage_secret.add_argument(
213 |         "--account_key", type=str, required=True, help="Azure Blob storage account key"
214 |     )
215 | 
216 |     parser_storage_secret.set_defaults(func=storage_secret)
217 | 
218 |     parser_docker_secret = subparser.add_parser(
219 |         "docker_secret", help="Generate secret for Docker Image"
220 |     )
221 | 
222 |     parser_docker_secret.add_argument(
223 |         "--server", type=str, required=True, help="Docker server"
224 |     )
225 | 
226 |     parser_docker_secret.add_argument(
227 |         "--username", type=str, required=True, help="Docker username"
228 |     )
229 | 
230 |     parser_docker_secret.add_argument(
231 |         "--password", type=str, required=True, help="Docker password"
232 |     )
233 | 
234 |     parser_docker_secret.add_argument(
235 |         "--image_name", type=str, required=True, help="Docker image name"
236 |     )
237 | 
238 |     parser_docker_secret.set_defaults(func=docker_secret)
239 | 
240 |     # ---------------------------------- #
241 |     #             RUN JOB                #
242 |     # ---------------------------------- #
243 | 
244 |     parser_run_job = subparser.add_parser("run_job", help="Run your training job")
245 | 
246 |     parser_run_job.set_defaults(func=run_job)
247 | 
248 |     # ---------------------------------- #
249 |     #           CHECK STATUS             #
250 |     # ---------------------------------- #
251 | 
252 |     parser_check_status = subparser.add_parser(
253 |         "check_status", help="Check status of your jobs"
254 |     )
255 |     parser_check_status.set_defaults(func=check_status)
256 | 
257 |     # ---------------------------------- #
258 |     #         DELETE RESOURCES           #
259 |     # ---------------------------------- #
260 |     parser_delete_resources = subparser.add_parser(
261 |         "delete_resources",
262 |         help="Deletes the kubernetes cluster and all namespaces and secrets",
263 |     )
264 |     parser_delete_resources.set_defaults(func=delete_resources)
265 | 
266 |     # ---------------------------------- #
267 |     #            GET LOGS                #
268 |     # ---------------------------------- #
269 | 
270 |     parser_get_logs = subparser.add_parser(
271 |         "get_logs", help="Get logs from all your pods"
272 |     )
273 | 
274 |     parser_get_logs.set_defaults(func=get_logs)
275 | 
276 |     # ---------------------------------- #
277 |     #          SCALE CLUSTER             #
278 |     # ---------------------------------- #
279 |     parser_scale = subparser.add_parser("scale", help="Scale up/down your cluster")
280 | 
281 |     parser_scale.add_argument(
282 |         "--subscription_id",
283 |         type=str,
284 |         required=True,
285 |         help="Subscription id of the cluster",
286 |     )
287 | 
288 |     parser_scale.add_argument(
289 |         "--rg", type=str, required=True, help="Resource group of the cluster"
290 |     )
291 | 
292 |     parser_scale.add_argument(
293 |         "--location", type=str, required=True, help="Location of the cluster"
294 |     )
295 | 
296 |     parser_scale.add_argument(
297 |         "--client_id", type=str, required=True, help="Service principal client id"
298 |     )
299 | 
300 |     parser_scale.add_argument(
301 |         "--client_secret",
302 |         type=str,
303 |         required=True,
304 |         help="Service Principal client secret",
305 |     )
306 | 
307 |     parser_scale.add_argument(
308 |         "--new_node_count",
309 |         type=int,
310 |         required=True,
311 |         help="New node count to scale cluster to",
312 |     )
313 | 
314 |     parser_scale.set_defaults(func=util.scale_cluster)
315 | 
316 |     args = parser.parse_args()
317 | 
318 |     # -----
319 |     # Execution order: Configure --> Setup --> Run
320 |     # -----
321 |     if args.command == "configure":
322 |         configure(args)
323 |     elif args.command == "setup":
324 |         setup(args)
325 |     elif args.command == "upload_storage":
326 |         upload_storage(args)
327 |     elif args.command == "storage_secret":
328 |         storage_secret(args)
329 |     elif args.command == "docker_secret":
330 |         docker_secret(args)
331 |     elif args.command == "run_job":
332 |         run_job(args)
333 |     elif args.command == "check_status":
334 |         check_status()
335 |     elif args.command == "delete_resources":
336 |         delete_resources()
337 |     elif args.command == "get_logs":
338 |         get_logs()
339 |     elif args.command == "scale":
340 |         scale_cluster(args)
341 | 
--------------------------------------------------------------------------------
/design/kubernetes/torchelastic-operator-design.md:
--------------------------------------------------------------------------------
 1 | # TorchElastic Controller for Kubernetes
 2 | 
 3 | ## Background
 4 | 
 5 | PyTorch continues to be used for the latest state-of-the-art research, making up nearly 70% of [papers](https://chillee.github.io/pytorch-vs-tensorflow/) that cite a framework.
 6 | 
 7 | The current PyTorch Distributed Data Parallel (DDP) module enables data parallel training where each process trains the same model but on different shards of data. It enables bulk synchronous, multi-host, multi-GPU/CPU execution of ML training. However, DDP has several shortcomings: e.g., jobs cannot start without acquiring all the requested nodes; jobs cannot continue after a node fails due to an error or transient issue; jobs cannot incorporate a node that joins later; and lastly, progress cannot be made in the presence of a slow/stuck node.
 8 | 
 9 | The focus of [PyTorch Elastic](https://github.com/pytorch/elastic), which uses Elastic Distributed Data Parallelism, is to address these issues and build a generic framework/APIs for PyTorch to enable reliable and elastic execution of these data parallel training workloads. It will provide better programmability, higher resilience to failures of all kinds, higher efficiency and larger-scale training compared with pure DDP.
10 | 
11 | ## Motivation
12 | 
13 | With job fault tolerance and elastic training, we can unlock a lot of features.
14 | 
15 | Users can enable job priority and preemption in the cluster. Losing a task becomes acceptable and the user won't lose the entire job progress. More importantly, it will help guarantee SLAs of critical jobs, even in a cluster under resource pressure.
16 | 
17 | Cost and GPU utilization will be further optimized with this feature, since users can launch jobs with partial resources, and spot GPU instances can be used as well without worrying about losing all progress.
18 | 
19 | ## User Experience
20 | 
21 | * Users should define the `minReplicas` and `maxReplicas` numbers of tasks for a job instead of a fixed number. The TorchElastic controller will launch jobs in Kubernetes, set up the needed network topology and manage the job lifecycle.
22 | * Users need to specify the etcd endpoint used as the RDZV service for task coordination.
23 | * The desired `spec.replicaSpecs[Worker].replicas`, being the number of tasks, has to be within the range from `minReplicas` to `maxReplicas`.
24 | * Users can easily create/delete a torch elastic job with `kubectl` using a job manifest.
25 | * Users are able to describe custom resources to monitor the job status.
26 | 
27 | ## High Level Design
28 | 
29 | Workers in a torch elastic job are equivalent and their communication is peer to peer. In this case, every pod should be able to talk with every other pod, and we need to create a `headless` service for every pod. Once the job is done, the controller won't terminate any pods, so the user can check logs for any worker. Manual job deletion will delete all pods belonging to it.
30 | 
31 | A config with kind `ElasticJob` defines the job spec and the controller will reconcile against this definition. It will create/update/delete pods and services if there is any change in the job or in the Kubernetes resources (pods, services) owned by the `ElasticJob`.
32 | 
33 | ```
34 | apiVersion: "elastic.pytorch.org/v1alpha1"
35 | kind: "ElasticJob"
36 | metadata:
37 |   name: "classy-vision-job"
38 | spec:
39 |   rdzvEndpoint: "etcd-service:2379"
40 |   minReplicas: 2
41 |   maxReplicas: 5
42 |   replicaSpecs:
43 |     Worker:
44 |       replicas: 3
45 |       restartPolicy: ExitCode
46 |       template:
47 |         apiVersion: v1
48 |         kind: Pod
49 |         spec:
50 |           containers:
51 |             - name: torchelasticworker
52 |               image: torchelastic/examples:0.1.0rc1
53 |               imagePullPolicy: Always
54 |               args:
55 |                 - "s3://code_path/petctl/user/my_job/main.py"
56 |                 - "--config_file"
57 |                 - "/data/classy_vision/resnet50_synthetic_image_classy_config.json"
58 |                 - "--checkpoint_folder"
59 |                 - "/data/classy_vision/checkpoint"
60 | 
61 | ```
62 | 
63 | *Network Communication*
64 | 
65 | In this case, every pod should be able to talk with every other pod, and we need to create a headless service for every pod, since workers use the hostnames registered in the rdzv endpoint to find their peers.
66 | 
67 | *Failure condition*
68 | 
69 | The TorchElastic controller will only fail a job if the number of active workers drops below the user-specified `minReplicas`. Otherwise, it will try to reschedule failed pods and maintain the desired task size.
70 | 
71 | *rdzvEndpoint*
72 | 
73 | `rdzvEndpoint` needs to be specified by the user. It could be a highly available etcd quorum or a single etcd pod on the Kubernetes cluster.
74 | 
75 | *Replicas*
76 | 
77 | `replicas` represents the desired task size. A torch elastic job doesn't need all the workers to be ready to start training. We can set this field to `job.spec.maxReplicas` and try to allocate more resources. If the cluster doesn't have enough resources, some tasks may be pending and the job can still start.
78 | 
79 | 
80 | These are the resources the controller creates from an `ElasticJob`:
81 | 
82 | **Pod**
83 | 
84 | ```
85 | apiVersion: v1
86 | kind: Pod
87 | metadata:
88 |   name: classy-vision-job-worker-${index}
89 |   labels:
90 |     job-name: classy-vision-job
91 |     group-name: elastic.pytorch.org
92 |     replica-index: 0
93 |     replica-type: worker
94 | spec:
95 |   containers:
96 |     - image: torchelastic/examples:0.1.0rc1
97 |       imagePullPolicy: Always
98 |       name: torchelasticworker
99 |       env:
100 |         - name: RDZV_ENDPOINT
101 |           value: "etcd-service:2379"
102 |         - name: JOB_ID
103 |           value: "classy-vision-job"
104 |         - name: SIZE
105 |           value: "3"
106 |         - name: MIN_SIZE
107 |           value: "2"
108 |         - name: MAX_SIZE
109 |           value: "5"
110 |   restartPolicy: OnFailure
111 | ```
112 | 
113 | **Service**
114 | 
115 | ```
116 | apiVersion: v1
117 | kind: Service
118 | metadata:
119 |   name: classy-vision-job-worker-${index}
120 | spec:
121 |   selector:
122 |     job-name: classy-vision-job
123 |     group-name: elastic.pytorch.org
124 |     replica-index: 0
125 |     replica-type: worker
126 |   clusterIP: None
127 | ```
128 | 
129 | **Job Status**
130 | 
131 | ``` yaml
132 | kubectl describe elasticjob classy-vision-job
133 | Name:         classy-vision-job
134 | Namespace:    default
135 | API Version:  elastic.pytorch.org/v1alpha1
136 | Kind:         ElasticJob
137 | Spec:
138 |   ...
139 | Status:
140 |   Conditions:
141 |     Last Transition Time:  2020-01-22T23:10:44Z
142 |     Last Update Time:      2020-01-22T23:10:44Z
143 |     Message:               job classy-vision-job is created.
144 |     Reason:                ElasticJobCreated
145 |     Status:                True
146 |     Type:                  Created
147 |     Last Transition Time:  2020-01-22T23:10:49Z
148 |     Last Update Time:      2020-01-22T23:10:49Z
149 |     Message:               ElasticJob classy-vision-job is running.
150 |     Reason:                ElasticJobRunning
151 |     Status:                False
152 |     Type:                  Running
153 |     Last Transition Time:  2020-01-22T23:10:49Z
154 |     Last Update Time:      2020-01-22T23:10:49Z
155 |     Message:               ElasticJob classy-vision-job is failed because 2 workers replica(s) failed.
156 |     Reason:                ElasticJobFailed
157 |     Status:                True
158 |     Type:                  Failed
159 |   Replica Statuses:
160 |     Worker:
161 |       Active:  1
162 |       Failed:  2
163 | Events:
164 |   Type     Reason                   Age                From                    Message
165 |   ----     ------                   ----               ----                    -------
166 |   Normal   SuccessfulCreatePod      39m                elastic-job-controller  Created pod: classy-vision-job-worker-0
167 |   Normal   SuccessfulCreatePod      39m                elastic-job-controller  Created pod: classy-vision-job-worker-1
168 |   Normal   SuccessfulCreateService  39m                elastic-job-controller  Created service: classy-vision-job-worker-0
169 |   Normal   SuccessfulCreateService  39m                elastic-job-controller  Created service: classy-vision-job-worker-1
170 |   Normal   ExitedWithCode           39m (x3 over 39m)  elastic-job-controller  Pod: default.classy-vision-job-worker-0 exited with code 1
171 |   Warning  ElasticJobRestarting     39m (x3 over 39m)  elastic-job-controller  ElasticJob classy-vision-job is restarting because 1 Worker replica(s) failed.
172 |   Normal   ElasticJobFailed         39m                elastic-job-controller  ElasticJob classy-vision-job is failed because 2 Worker replica(s) failed.
173 | ```
174 | 
175 | ## Not in scope
176 | 
177 | TorchElastic Controller for Kubernetes can simplify the setup required to run torch elastic jobs and manage the entire job lifecycle. It is hard for the controller to monitor cluster resources and dynamically adjust the task size. Instead, having a separate component like a batch scheduler make that decision is a better option at this stage, to limit the scope of this project.
178 | 
179 | Currently, each `ElasticJob` has to accept an etcd service as its `rdzvEndpoint`. We may consider making this field optional and having the controller provide an etcd service if it's not set.
--------------------------------------------------------------------------------
/design/torchelastic/0.2.0/design_doc.md:
--------------------------------------------------------------------------------
 1 | # Introduction
 2 | PyTorch Elastic Trainer (PET) provides a framework for conveniently training
 3 | models across a compute cluster in a _fault tolerant_ and _elastic_ manner.
 4 | PET provides these features in two ways:
 5 | 
 6 | 1. When a PyTorch worker process throws a certain class of retriable errors, it is caught by PET and the training process is retried.
 7 | 2. A new worker can leave or join the process pool for an existing training job at any point as long as the number of workers stays within the bounds specified when starting the job. When a membership change happens, all the workers re-rendezvous to establish a new process group and training resumes from the previous well-known good state.
 8 | 
 9 | In order to integrate with PET, a PyTorch user needs to make the following
10 | changes to their training logic:
11 | 
12 | 1. They need to enable PET to control their training loop.
13 |    Essentially, they provide an "inner training" loop that is wrapped in a
14 |    retryable loop by PET. All aspects of establishing or re-establishing the
15 |    process group as well as restoring the user's trainer to a known good state
16 |    are handled by the retryable PET loop.
17 | 2. They need to specify _what_ the state is that needs to be restored in case
18 |    a new worker joins the pool and _how_ the state is applied to a new worker.
19 |    The API for specifying these is described by the `State` object (see the sketch below).
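To make that contract concrete, here is a minimal sketch of such an integration, assuming a hypothetical `TrainState` class and `train_step` function; the names and method signatures are illustrative assumptions, not the exact v0.1 API:

```py
# Illustrative sketch only -- class/method names are assumptions,
# not the exact torchelastic v0.1 signatures.

class TrainState:
    """Bundles everything that must be restored when a new worker joins."""

    def __init__(self, model, optimizer, data_loader):
        self.model = model
        self.optimizer = optimizer
        self.data_loader = data_loader
        self.epoch = 0

    def capture_snapshot(self):
        # _What_ needs to be restored: explicit state that can be
        # shipped to a newly joined worker.
        return {
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "epoch": self.epoch,
        }

    def apply_snapshot(self, snapshot):
        # _How_ the state is applied to a new worker.
        self.model.load_state_dict(snapshot["model"])
        self.optimizer.load_state_dict(snapshot["optimizer"])
        self.epoch = snapshot["epoch"]


def train_step(state, batch):
    # The "inner training" loop body; PET wraps this in its retryable
    # loop and re-establishes the process group before retrying it.
    loss = state.model(batch).mean()  # stand-in for a real loss computation
    loss.backward()
    state.optimizer.step()
    state.optimizer.zero_grad()
```

Under these assumptions, on a membership change PET would call `capture_snapshot` on a surviving worker and `apply_snapshot` on the newcomers before resuming the retryable loop.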
20 | 
21 | PET v0.1 was released on GitHub, PyPI and Docker Hub in November 2019 and since
22 | then the community has contributed integrations with Amazon Web Services
23 | (via Elastic Kubernetes Service) and Microsoft Azure (via Azure Kubernetes Service).
24 | 
25 | # Lessons learned from PET v0.1
26 | In porting existing PyTorch-based projects such as
27 | [ClassyVision](https://github.com/facebookresearch/ClassyVision) and
28 | [PyText](https://github.com/facebookresearch/pytext) to use PET, we encountered
29 | a few areas for refinement in the v0.1 design.
30 | 
31 | **First**, adapting a mature training library such as ClassyVision to use the
32 | elastic training APIs often requires a significant amount of restructuring, often
33 | causing a bifurcation of code paths between the elastic and non-elastic implementations.
34 | 
35 | **Second**, it is non-trivial to correctly implement the state restore logic for
36 | each application during in-process recovery. While explicit state such as weight
37 | tensors is easy to save and restore, there is often "hidden" or implicit state
38 | in the application that is hard for the developer to reason about. For example,
39 | after a rendezvous round, a worker process might be expected to restore the state
40 | of C++ objects either in CPU or GPU memory, which is extremely error-prone,
41 | especially after failures or exceptions. To compound this issue, several
42 | applications such as PyText already implement some form of checkpoint/restart and
43 | this logic often needs to be taken into account when implementing the elastic state.
44 | 
45 | 
46 | **Finally**, one of the goals of PET v0.1 was to detect and restart straggler workers.
47 | This was not possible when running the training loop in process and necessitated
48 | writing an additional watchdog process to monitor the main training process.
49 | 
50 | For the next iteration of PET, we would like to propose a design that makes it
51 | significantly simpler to port existing training workflows to an elastic
52 | infrastructure and results in applications that can recover more reliably
53 | from workflow failures.
54 | 
55 | # Overview of the new design
56 | In PET v0.2, _we no longer attempt to recover errors in the training function_.
57 | Instead, PET attempts to maintain the number of worker processes such that they
58 | stay within the \[_min_, _max_\] bounds required for the job.
59 | The application writer is responsible for loading and restarting from an existing
60 | checkpoint file if one is available. Unlike v0.1, PET v0.2 does not mandate how
61 | checkpoints are managed. An application writer is free to use just `torch.save`
62 | and `torch.load` from PyTorch or a higher-level framework such as
63 | [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning).
64 | 
65 | PET v0.2 is implemented using a new process named `elastic-agent`.
66 | There is a single `elastic-agent` per job, per node. Each agent process is only
67 | responsible for managing a set of worker processes local to that node and coordinating
68 | process group membership changes with the elastic agents on other nodes allocated to
69 | that job. This is illustrated in the diagram below:
70 | 
71 | ![image](torchelastic_diagram.jpg)
72 | 
73 | Membership changes are handled as follows: When a worker process fails,
74 | the corresponding elastic agent managing it kills all the workers on that node,
75 | establishes rendezvous with the other agents and restarts workers with the new
76 | rendezvous information. However, when an agent exits with a non-zero error code,
77 | it is up to a higher-level orchestrator such as Kubernetes to restart the agent
78 | (which in turn will restart all the workers it is responsible for).
79 | The same recovery mechanism holds for node-level failures.
80 | An orchestrator such as Kubernetes will schedule a job such that a minimum number
81 | of replicas of the elastic agent are running and each agent will in turn orchestrate the
82 | user's training script.
83 | 
84 | ![image](torchelastic_agent_diagram.jpg)
85 | 
86 | To adopt PET v0.2, an application simply needs its entry-point or `main` function
87 | to be compatible with the
88 | [PyTorch distributed launcher](https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py).
89 | We expect distributed training jobs that are started via the distributed launcher
90 | to be seamlessly started via the elastic agent with no or minimal code changes.
91 | The only difference is that in the latter case, the application will be able to
92 | make progress in the presence of certain failures.
93 | 
94 | # Overview of the API
95 | As mentioned above, with PET v0.2, there is no separate library for a training
96 | application to integrate with. Instead, the user simply launches a training job
97 | via the elastic agent monitor process. For example, if a user starts their job
98 | using the PyTorch distributed launcher using:
99 | ```sh
100 | python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_ON_NODE \
101 |        TRAINING_SCRIPT.py (... train script args ...)
102 | ```
103 | they would instead use:
104 | 
105 | ```sh
106 | python -m torchelastic.distributed.launch --nproc_per_node=NUM_GPUS_ON_NODE \
107 |        --nnodes=1:4 \
108 |        --rdzv_id=JOB_ID \
109 |        --rdzv_backend=etcd \
110 |        --rdzv_endpoint=ETCD_HOST:ETCD_PORT \
111 |        TRAINING_SCRIPT.py (... train script args ...)
112 | ```
113 | Notice that it adds a few additional parameters:
114 | 1. The min and max number of nodes. During a rendezvous, if the number of nodes
115 |    drops below the specified threshold, the job is aborted.
116 | 2. A rendezvous type and its configuration.
117 | 
118 | Inside the training script, the only potential change the user needs to make is
119 | to ensure that they use environment variables to initialize the process group,
120 | i.e., create the process group as follows:
121 | ```py
122 | import torch.distributed as dist
123 | 
124 | dist.init_process_group(init_method="env://", backend="gloo")
125 | # or
126 | dist.init_process_group(init_method="env://", backend="nccl")
127 | ```
128 | 
129 | All the parameters for initializing the group (the world size, the numerical
130 | rank, the master address and port) are passed in as environment variables
131 | by the parent elastic agent.
132 | 
133 | The new PET design is intentionally "bare-bones": it trades off the granularity
134 | with which an application can recover for simplicity and robustness.
135 | In the future, we hope to provide more APIs for convenient checkpointing that a
136 | developer can optionally use for more efficient restart semantics.
137 | 
138 | # Implementation details and next steps
139 | An implementation of the above ideas is available in [PR #65](https://github.com/pytorch/elastic/pull/65).
140 | We encourage the community to evaluate the new functionality and
141 | give us feedback on the trade-offs we have made in the design either in the PR
142 | or in this issue. We look forward to hearing from you!
143 | 
--------------------------------------------------------------------------------
/design/torchelastic/0.2.0/torchelastic_agent_diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch/elastic/bc88e6982961d4117e53c4c8163ecf277f35c2c5/design/torchelastic/0.2.0/torchelastic_agent_diagram.jpg
--------------------------------------------------------------------------------
/design/torchelastic/0.2.0/torchelastic_diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch/elastic/bc88e6982961d4117e53c4c8163ecf277f35c2c5/design/torchelastic/0.2.0/torchelastic_diagram.jpg
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # Minimal makefile for Sphinx documentation
 3 | # Usage:
 4 | #   make html
 5 | #
 6 | 
 7 | # You can set these variables from the command line.
 8 | SPHINXOPTS  =
 9 | SPHINXBUILD = sphinx-build
10 | SPHINXPROJ  = torchelastic
11 | SOURCEDIR   = source
12 | BUILDDIR    = build
13 | VERSION     := "0.2.3.dev0"
14 | 
15 | # Put it first so that "make" without argument is like "make help".
16 | help:
17 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
18 | 
19 | clean:
20 | 	@echo "Deleting build directory"
21 | 	rm -rf "$(BUILDDIR)"
22 | 
23 | .PHONY: help Makefile clean
24 | 
25 | # Catch-all target: route all unknown targets to Sphinx using the new
26 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
27 | %: Makefile
28 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)/$(VERSION)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/doc_push.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | # All rights reserved.
 5 | #
 6 | # This source code is licensed under the BSD-style license found in the
 7 | # LICENSE file in the root directory of this source tree.
 8 | 
 9 | #
10 | # Builds docs from the checked-out HEAD
11 | # and pushes the artifacts to the gh-pages branch in github.com/pytorch/elastic
12 | #
13 | # 1. sphinx generated docs are copied to <ver>/
14 | # 2. if a release tag is found on HEAD then redirects are copied to /latest
15 | # 3. if no release tag is found on HEAD then redirects are copied to /master
16 | #
17 | # gh-pages branch should look as follows:
18 | #
19 | # |- 0.1.0rc2
20 | # |- 0.1.0rc3
21 | # |- <ver>
22 | # |- master (redirects to the most recent ver in trunk, including release)
23 | # |- latest (redirects to the most recent release)
24 | # If the most recent release is 0.1.0 and master is at 0.1.1rc1 then,
25 | # https://pytorch.org/elastic/master -> https://pytorch.org/elastic/0.1.1rc1
26 | # https://pytorch.org/elastic/latest -> https://pytorch.org/elastic/0.1.0
27 | #
28 | # Redirects are done via Jekyll redirect-from plugin.
See: 29 | # sources/scripts/create_redirect_md.py 30 | # Makefile (redirect target) 31 | # (on gh-pages branch) _layouts/docs_redirect.html 32 | 33 | dry_run=0 34 | for arg in "$@"; do 35 | shift 36 | case "$arg" in 37 | "--dry-run") dry_run=1 ;; 38 | "--help") echo "Usage $0 [--dry-run]"; exit 0 ;; 39 | esac 40 | done 41 | 42 | repo_root=$(git rev-parse --show-toplevel) 43 | branch=$(git rev-parse --abbrev-ref HEAD) 44 | commit_id=$(git rev-parse --short HEAD) 45 | 46 | if ! release_tag=$(git describe --tags --exact-match HEAD 2>/dev/null); then 47 | echo "No release tag found, building docs for master..." 48 | redirects=(master) 49 | release_tag="master" 50 | else 51 | echo "Release tag $release_tag found, building docs for release..." 52 | redirects=(latest master) 53 | fi 54 | 55 | echo "Installing torchelastic from $repo_root..." 56 | cd "$repo_root" || exit 57 | pip uninstall -y torchelastic 58 | python setup.py install 59 | 60 | torchelastic_ver=$(python -c "import torchelastic; print(torchelastic.__version__)") 61 | 62 | echo "Building PyTorch Elastic v$torchelastic_ver docs..." 63 | docs_dir=$repo_root/docs 64 | build_dir=$docs_dir/build 65 | cd "$docs_dir" || exit 66 | pip install -r requirements.txt 67 | make clean html 68 | echo "Doc build complete" 69 | 70 | if [ $dry_run -eq 1 ]; then 71 | echo "*** dry-run mode, building only. See build artifacts in: $build_dir" 72 | exit 73 | fi 74 | 75 | tmp_dir=/tmp/torchelastic_docs_tmp 76 | rm -rf "${tmp_dir:?}" 77 | 78 | echo "Checking out gh-pages branch..." 79 | gh_pages_dir="$tmp_dir/elastic_gh_pages" 80 | git clone -b gh-pages --single-branch git@github.com:pytorch/elastic.git $gh_pages_dir 81 | 82 | echo "Copying doc pages for $torchelastic_ver into $gh_pages_dir..." 83 | rm -rf "${gh_pages_dir:?}/${torchelastic_ver:?}" 84 | cp -R "$build_dir/$torchelastic_ver/html" "$gh_pages_dir/$torchelastic_ver" 85 | 86 | for redirect in "${redirects[@]}"; do 87 | echo "Copying redirects for $redirect -> $torchelastic_ver..." 88 | rm -rf "${gh_pages_dir:?}/${redirect:?}" 89 | cp -R "$build_dir/redirects" "$gh_pages_dir/$redirect" 90 | done 91 | 92 | if [ "$release_tag" != "master" ]; then 93 | echo "Copying redirects for default(latest) -> $torchelastic_ver..." 94 | cp -R "$build_dir/redirects/." "$gh_pages_dir" 95 | fi 96 | 97 | cd $gh_pages_dir || exit 98 | git add . 99 | git commit --quiet -m "[doc_push][$release_tag] built from $commit_id ($branch). Redirects: ${redirects[*]} -> $torchelastic_ver." 
100 | 
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | -e git+http://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
3 | sphinxcontrib.katex
4 | matplotlib
--------------------------------------------------------------------------------
/docs/source/_static/img/efs-setup.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch/elastic/bc88e6982961d4117e53c4c8163ecf277f35c2c5/docs/source/_static/img/efs-setup.jpg
--------------------------------------------------------------------------------
/docs/source/_static/img/pytorch-logo-dark.svg:
--------------------------------------------------------------------------------
(SVG markup omitted)
--------------------------------------------------------------------------------
/docs/source/_static/img/pytorch-logo-flame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pytorch/elastic/bc88e6982961d4117e53c4c8163ecf277f35c2c5/docs/source/_static/img/pytorch-logo-flame.png
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | #
 4 | # PyTorch documentation build configuration file, created by
 5 | # sphinx-quickstart on Fri Dec 23 13:31:47 2016.
 6 | #
 7 | # This file is execfile()d with the current directory set to its
 8 | # containing dir.
 9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 | 
16 | import pytorch_sphinx_theme
17 | 
18 | # If extensions (or modules to document with autodoc) are in another directory,
19 | # add these directories to sys.path here. If the directory is relative to the
20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
21 | #
22 | # import os
23 | # import sys
24 | # sys.path.insert(0, os.path.abspath('.'))
25 | from docutils import nodes
26 | from sphinx import addnodes
27 | from sphinx.util.docfields import TypedField
28 | 
29 | 
30 | # -- General configuration ------------------------------------------------
31 | 
32 | # If your documentation needs a minimal Sphinx version, state it here.
33 | #
34 | needs_sphinx = "1.6"
35 | 
36 | # Add any Sphinx extension module names here, as strings. They can be
37 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
38 | # ones.
39 | extensions = [ 40 | "sphinx.ext.autodoc", 41 | "sphinx.ext.autosummary", 42 | "sphinx.ext.doctest", 43 | "sphinx.ext.intersphinx", 44 | "sphinx.ext.todo", 45 | "sphinx.ext.coverage", 46 | "sphinx.ext.napoleon", 47 | "sphinx.ext.viewcode", 48 | "sphinxcontrib.katex", 49 | "sphinx.ext.autosectionlabel", 50 | ] 51 | 52 | # katex options 53 | # 54 | # 55 | 56 | katex_options = r""" 57 | delimiters : [ 58 | {left: "$$", right: "$$", display: true}, 59 | {left: "\\(", right: "\\)", display: false}, 60 | {left: "\\[", right: "\\]", display: true} 61 | ] 62 | """ 63 | 64 | napoleon_use_ivar = True 65 | 66 | # Add any paths that contain templates here, relative to this directory. 67 | templates_path = ["_templates"] 68 | 69 | # The suffix(es) of source filenames. 70 | # You can specify multiple suffix as a list of string: 71 | # 72 | # source_suffix = ['.rst', '.md'] 73 | source_suffix = [".rst", ".md"] 74 | 75 | # The master toctree document. 76 | master_doc = "index" 77 | 78 | # General information about the project. 79 | project = "PyTorch/Elastic" 80 | copyright = "2020, PyTorch Elastic Contributors" 81 | author = "PyTorch Elastic Contributors" 82 | 83 | # The version info for the project you're documenting, acts as replacement for 84 | # |version| and |release|, also used in various other places throughout the 85 | # built documents. 86 | # 87 | # The short X.Y version. 88 | # TODO: change to [:2] at v1.0 89 | version = "v0.2.3.dev0" 90 | # The full version, including alpha/beta/rc tags. 91 | # TODO: verify this works as expected 92 | release = "master" 93 | 94 | # The language for content autogenerated by Sphinx. Refer to documentation 95 | # for a list of supported languages. 96 | # 97 | # This is also used if you do content translation via gettext catalogs. 98 | # Usually you set "language" from the command line for these cases. 99 | language = None 100 | 101 | # List of patterns, relative to source directory, that match files and 102 | # directories to ignore when looking for source files. 103 | # This patterns also effect to html_static_path and html_extra_path 104 | exclude_patterns = [] 105 | 106 | # The name of the Pygments (syntax highlighting) style to use. 107 | pygments_style = "sphinx" 108 | 109 | # If true, `todo` and `todoList` produce output, else they produce nothing. 110 | todo_include_todos = True 111 | 112 | 113 | # -- Options for HTML output ---------------------------------------------- 114 | 115 | # The theme to use for HTML and HTML Help pages. See the documentation for 116 | # a list of builtin themes. 117 | # 118 | html_theme = "pytorch_sphinx_theme" 119 | html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] 120 | 121 | # Theme options are theme-specific and customize the look and feel of a theme 122 | # further. For a list of options available for each theme, see the 123 | # documentation. 124 | # 125 | html_theme_options = { 126 | "pytorch_project": "elastic", 127 | "collapse_navigation": False, 128 | "display_version": True, 129 | "logo_only": True, 130 | } 131 | 132 | html_logo = "_static/img/pytorch-logo-dark.svg" 133 | 134 | # Add any paths that contain custom static files (such as style sheets) here, 135 | # relative to this directory. They are copied after the builtin static files, 136 | # so a file named "default.css" will overwrite the builtin "default.css". 
137 | html_static_path = ["_static"] 138 | 139 | 140 | def setup(app): 141 | # NOTE: in Sphinx 1.8+ `html_css_files` is an official configuration value 142 | # and can be moved outside of this function (and the setup(app) function 143 | # can be deleted). 144 | html_css_files = [ 145 | "https://cdn.jsdelivr.net/npm/katex@0.10.0-beta/dist/katex.min.css" 146 | ] 147 | 148 | # In Sphinx 1.8 it was renamed to `add_css_file`, 1.7 and prior it is 149 | # `add_stylesheet` (deprecated in 1.8). 150 | add_css = getattr( 151 | app, "add_css_file", getattr(app, "add_stylesheet", None) 152 | ) # noqa B009 153 | for css_file in html_css_files: 154 | add_css(css_file) 155 | 156 | 157 | # -- Options for HTMLHelp output ------------------------------------------ 158 | 159 | # Output file base name for HTML help builder. 160 | htmlhelp_basename = "TorchElasticdoc" 161 | 162 | 163 | # -- Options for LaTeX output --------------------------------------------- 164 | 165 | latex_elements = { 166 | # The paper size ('letterpaper' or 'a4paper'). 167 | # 168 | # 'papersize': 'letterpaper', 169 | # The font size ('10pt', '11pt' or '12pt'). 170 | # 171 | # 'pointsize': '10pt', 172 | # Additional stuff for the LaTeX preamble. 173 | # 174 | # 'preamble': '', 175 | # Latex figure (float) alignment 176 | # 177 | # 'figure_align': 'htbp', 178 | } 179 | 180 | # Grouping the document tree into LaTeX files. List of tuples 181 | # (source start file, target name, title, 182 | # author, documentclass [howto, manual, or own class]). 183 | latex_documents = [ 184 | ( 185 | master_doc, 186 | "pytorch.tex", 187 | "Torchelastic Documentation", 188 | "Torch Contributors", 189 | "manual", 190 | ) 191 | ] 192 | 193 | 194 | # -- Options for manual page output --------------------------------------- 195 | 196 | # One entry per manual page. List of tuples 197 | # (source start file, name, description, authors, manual section). 198 | man_pages = [(master_doc, "Torchelastic", "Torchelastic Documentation", [author], 1)] 199 | 200 | 201 | # -- Options for Texinfo output ------------------------------------------- 202 | 203 | # Grouping the document tree into Texinfo files. List of tuples 204 | # (source start file, target name, title, author, 205 | # dir menu entry, description, category) 206 | texinfo_documents = [ 207 | ( 208 | master_doc, 209 | "Torchelastic", 210 | "Torchelastic Documentation", 211 | author, 212 | "Torchelastic", 213 | "PyTorch Elastic Training", 214 | "Miscellaneous", 215 | ) 216 | ] 217 | 218 | 219 | # Example configuration for intersphinx: refer to the Python standard library. 220 | intersphinx_mapping = { 221 | "python": ("https://docs.python.org/", None), 222 | "numpy": ("https://docs.scipy.org/doc/numpy/", None), 223 | "torch": ("https://pytorch.org/docs/stable/", None), 224 | } 225 | 226 | # -- A patch that prevents Sphinx from cross-referencing ivar tags ------- 227 | # See http://stackoverflow.com/a/41184353/3343043 228 | 229 | 230 | def patched_make_field(self, types, domain, items, **kw): 231 | # `kw` catches `env=None` needed for newer sphinx while maintaining 232 | # backwards compatibility when passed along further down! 
233 | 234 | def handle_item(fieldarg, content): 235 | par = nodes.paragraph() 236 | par += addnodes.literal_strong("", fieldarg) # Patch: this line added 237 | # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, 238 | # addnodes.literal_strong)) 239 | if fieldarg in types: 240 | par += nodes.Text(" (") 241 | # NOTE: using .pop() here to prevent a single type node to be 242 | # inserted twice into the doctree, which leads to 243 | # inconsistencies later when references are resolved 244 | fieldtype = types.pop(fieldarg) 245 | if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): 246 | typename = "".join(n.astext() for n in fieldtype) 247 | typename = typename.replace("int", "python:int") 248 | typename = typename.replace("long", "python:long") 249 | typename = typename.replace("float", "python:float") 250 | typename = typename.replace("type", "python:type") 251 | par.extend( 252 | self.make_xrefs( 253 | self.typerolename, 254 | domain, 255 | typename, 256 | addnodes.literal_emphasis, 257 | **kw, 258 | ) 259 | ) 260 | else: 261 | par += fieldtype 262 | par += nodes.Text(")") 263 | par += nodes.Text(" -- ") 264 | par += content 265 | return par 266 | 267 | fieldname = nodes.field_name("", self.label) 268 | if len(items) == 1 and self.can_collapse: 269 | fieldarg, content = items[0] 270 | bodynode = handle_item(fieldarg, content) 271 | else: 272 | bodynode = self.list_type() 273 | for fieldarg, content in items: 274 | bodynode += nodes.list_item("", handle_item(fieldarg, content)) 275 | fieldbody = nodes.field_body("", bodynode) 276 | return nodes.field("", fieldname, fieldbody) 277 | 278 | 279 | TypedField.make_field = patched_make_field 280 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | :github_url: https://github.com/pytorch/elastic 2 | 3 | TorchElastic 4 | ================== 5 | 6 | .. important:: TorchElastic has been upstreamed to `PyTorch 1.9 `_. 7 | TSM has been upstreamed to `TorchX `_. 8 | -------------------------------------------------------------------------------- /docs/source/scripts/create_redirect_md.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | # All rights reserved. 5 | # 6 | # This source code is licensed under the BSD-style license found in the 7 | # LICENSE file in the root directory of this source tree. 
 8 | 
 9 | """
10 | For each rst file, generates a corresponding md file
11 | that redirects http://pytorch.org/elastic/<ver>/<file_path>.html
12 | to http://pytorch.org/elastic/latest/<file_path>.html
13 | """
14 | 
15 | import argparse
16 | import glob
17 | import os
18 | import sys
19 | 
20 | import torchelastic
21 | 
22 | 
23 | def parse_args(args):
24 |     parser = argparse.ArgumentParser()
25 |     parser.add_argument(
26 |         "--source_dir", required=True, help="directory where rst files are"
27 |     )
28 |     parser.add_argument("--build_dir", required=True, help="directory to drop md files")
29 | 
30 |     return parser.parse_args(args[1:])
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     args = parse_args(sys.argv)
35 |     build_ver = torchelastic.__version__
36 |     source_dir = args.source_dir
37 |     build_dir = args.build_dir
38 |     print(f"Creating redirect files from source_dir: {source_dir} into {build_dir}")
39 |     for rst_file in glob.glob(os.path.join(source_dir, "**/*.rst"), recursive=True):
40 |         rst_relative_path = os.path.relpath(rst_file, source_dir)
41 |         md_relative_path = os.path.splitext(rst_relative_path)[0] + ".md"
42 |         html_relative_path = os.path.splitext(rst_relative_path)[0] + ".html"
43 |         md_file = os.path.join(build_dir, md_relative_path)
44 |         os.makedirs(os.path.dirname(md_file), exist_ok=True)
45 | 
46 |         print(f"Creating redirect md for {rst_relative_path} --> {md_file}")
47 |         with open(md_file, "w") as f:
48 |             f.write("---\n")
49 |             f.write("layout: docs_redirect\n")
50 |             f.write("title: PyTorch | Redirect\n")
51 |             f.write(f'redirect_url: "/elastic/{build_ver}/{html_relative_path}"\n')
52 |             f.write("---\n")
53 | 
--------------------------------------------------------------------------------
/docs/src/pip-delete-this-directory.txt:
--------------------------------------------------------------------------------
1 | This file is placed here by pip to indicate the source was put
2 | here by pip.
3 | 
4 | Once this package is successfully installed this source code will be
5 | deleted (unless you remove this file).
6 | 
--------------------------------------------------------------------------------
/examples/Dockerfile:
--------------------------------------------------------------------------------
 1 | ARG BASE_IMAGE=pytorch/pytorch:1.8.0-cuda11.1-cudnn8-runtime
 2 | FROM $BASE_IMAGE
 3 | 
 4 | # install utilities and dependencies
 5 | RUN pip install awscli --upgrade
 6 | RUN pip install classy-vision
 7 | 
 8 | RUN pip uninstall -y torch
 9 | # TODO remove and make the BASE_IMAGE pytorch:1.9.0-cuda11.1-cudnn8-runtime when torch-1.9 releases
10 | RUN pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html
11 | 
12 | WORKDIR /workspace
13 | 
14 | # download imagenet tiny for data
15 | RUN apt-get -q update && apt-get -q install -y wget unzip
16 | RUN wget -q http://cs231n.stanford.edu/tiny-imagenet-200.zip && unzip -q tiny-imagenet-200.zip -d data && rm tiny-imagenet-200.zip
17 | 
18 | COPY . ./examples
19 | RUN chmod -R u+x ./examples/bin
20 | RUN examples/bin/install_etcd -d examples/bin
21 | ENV PATH=/workspace/examples/bin:${PATH}
22 | 
23 | # create a template classy project in /workspace/classy_vision
24 | # (see https://classyvision.ai/#quickstart)
25 | RUN classy-project classy_vision
26 | 
27 | USER root
28 | ENTRYPOINT ["python", "-m", "torch.distributed.run"]
29 | CMD ["--help"]
30 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | Examples
 2 | =============
 3 | 
 4 | The examples below run on the [torchelastic/examples](https://hub.docker.com/r/torchelastic/examples)
 5 | Docker image, built from the [examples/Dockerfile](https://github.com/pytorch/elastic/blob/master/examples/Dockerfile).
 6 | 
 7 | > **NOTE:** The ``$VERSION`` (e.g. ``0.2.0``) variable is used throughout this page;
 8 | > substitute it with the version of torchelastic you are using.
 9 | > The examples below only work on torchelastic ``>=0.2.0``.
10 | 
11 | Prerequisite
12 | --------------
13 | 
14 | 1. (recommended) Instance with GPU(s)
15 | 2. [Docker](https://docs.docker.com/install/)
16 | 3. [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker)
17 | 4. ``export VERSION=<torchelastic version>``
18 | 
19 | > **NOTE:** PyTorch data loaders use ``shm``. The default docker ``shm-size``
20 | > is not large enough and will OOM when using multiple data loader workers.
21 | > You must pass ``--shm-size`` to the ``docker run`` command or set the
22 | > number of data loader workers to ``0`` (run on the same process)
23 | > by passing the appropriate option to the script (use the ``--help`` flag
24 | > to see all script options). In the examples below we set ``--shm-size``.
25 | 
26 | Classy Vision
27 | --------------
28 | [Classy Vision](https://classyvision.ai/) is an end-to-end framework
29 | for image and video classification built on PyTorch. It works out-of-the-box
30 | with torchelastic's launcher.
31 | 
32 | Launch two trainers on a single node:
33 | 
34 | ```
35 | >>> docker run --shm-size=2g torchelastic/examples:$VERSION \
36 |        --standalone \
37 |        --nnodes=1 \
38 |        --nproc_per_node=2 \
39 |        /workspace/classy_vision/classy_train.py \
40 |        --config_file /workspace/classy_vision/configs/template_config.json
41 | ```
42 | 
43 | If you have an instance with GPUs, run a worker on each GPU:
44 | 
45 | ```
46 | >>> docker run --shm-size=2g \
47 |        --gpus=all \
48 |        torchelastic/examples:$VERSION \
49 |        --standalone \
50 |        --nnodes=1 \
51 |        --nproc_per_node=$NUM_CUDA_DEVICES \
52 |        /workspace/classy_vision/classy_train.py \
53 |        --device=gpu \
54 |        --config_file /workspace/classy_vision/configs/template_config.json
55 | ```
56 | 
57 | Imagenet
58 | ----------
59 | 
60 | > **NOTE:** an instance with at least one GPU is required for this example
61 | 
62 | Launch ``$NUM_CUDA_DEVICES`` number of workers on a single node:
63 | 
64 | ```
65 | >>> docker run --shm-size=2g --gpus=all torchelastic/examples:$VERSION \
66 |        --standalone \
67 |        --nnodes=1 \
68 |        --nproc_per_node=$NUM_CUDA_DEVICES \
69 |        /workspace/examples/imagenet/main.py \
70 |        --arch resnet18 \
71 |        --epochs 20 \
72 |        --batch-size 32 \
73 |        /workspace/data/tiny-imagenet-200
74 | ```
75 | 
76 | Multi-container
77 | ----------------
78 | We now show how to use the PyTorch Elastic Trainer launcher
79 | to start a distributed application spanning more than one container. The
80 | application is intentionally kept "bare bones" since the
81 | objective is to show how to create a ``torch.distributed.ProcessGroup``
82 | instance. Once a ``ProcessGroup`` is created, you can use any
83 | functionality needed from the ``torch.distributed`` package.
84 | 
85 | The ``docker-compose.yml`` file is based on the example provided with
86 | the [Bitnami ETCD container image](https://hub.docker.com/r/bitnami/etcd/).
87 | 
88 | 
89 | 
90 | ### Obtaining the example repo
91 | 
92 | Clone the PyTorch Elastic Trainer Git repo using
93 | 
94 | ```
95 | git clone https://github.com/pytorch/elastic.git
96 | ```
97 | 
98 | and set an environment variable that points to the elastic repo, e.g.
99 | 
100 | ```
101 | export TORCHELASTIC_HOME=~/elastic
102 | ```
103 | 
104 | ### Building the samples Docker container
105 | 
106 | While you can run the rest of this example using a pre-built Docker
107 | image, you can also build one for yourself. This is especially useful if
108 | you would like to customize the image. To build the image, run:
109 | 
110 | ```
111 | cd $TORCHELASTIC_HOME && docker build -t hello_elastic:dev .
112 | ```
113 | 
114 | ### Running an existing sample
115 | 
116 | This example uses ``docker-compose`` to run two containers: one for the
117 | ETCD service and one for the sample application itself. Docker compose
118 | takes care of all aspects of establishing the network interfaces so the
119 | application container can communicate with the ETCD container.
120 | 
121 | To start the example, run
122 | 
123 | ```
124 | cd $TORCHELASTIC_HOME/examples/multi_container && docker-compose up
125 | ```
126 | 
127 | You should see two sets of outputs, one from ETCD starting up and one
128 | from the application itself. The output from the application looks
129 | something like this:
130 | 
131 | ```
132 | example_1 | INFO 2020-04-03 17:36:31,582 Etcd machines: ['http://etcd-server:2379']
133 | example_1 | *****************************************
134 | example_1 | Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
135 | example_1 | *****************************************
136 | example_1 | INFO 2020-04-03 17:36:31,922 Attempting to join next rendezvous
137 | example_1 | INFO 2020-04-03 17:36:31,929 New rendezvous state created: {'status': 'joinable', 'version': '1', 'participants': []}
138 | example_1 | INFO 2020-04-03 17:36:32,032 Joined rendezvous version 1
139 | ```
140 | 
141 | The high-level differences between single-container and multi-container
142 | launches are:
143 | 
144 | 1. Specify ``--nnodes=$MIN_NODE:$MAX_NODE`` instead of ``--nnodes=1``.
145 | 2. An etcd server must be set up before starting the worker containers.
146 | 3. Remove ``--standalone`` and specify ``--rdzv_backend``, ``--rdzv_endpoint`` and ``--rdzv_id``.
147 | 
148 | For more information see [torch.distributed.run](https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py).
149 | 
150 | 
151 | 
152 | Multi-node
153 | -----------
154 | 
155 | The multi-node, multi-worker case is similar to running multi-container, multi-worker.
156 | Simply run each container on a separate node, occupying the entire node.
157 | Alternatively, you can use our kubernetes
158 | [elastic job controller](https://github.com/pytorch/elastic/tree/master/kubernetes) to launch a multi-node job.
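Putting the three differences listed above together, a multi-node launch of the imagenet example might look like the following, run on every node (the node bounds, etcd endpoint, and job id are placeholder values; an etcd server must already be reachable at that endpoint):

```
>>> docker run --shm-size=2g --gpus=all torchelastic/examples:$VERSION \
       --nnodes=$MIN_NODE:$MAX_NODE \
       --nproc_per_node=$NUM_CUDA_DEVICES \
       --rdzv_backend=etcd \
       --rdzv_endpoint=$ETCD_HOST:$ETCD_PORT \
       --rdzv_id=$JOB_ID \
       /workspace/examples/imagenet/main.py \
       --arch resnet18 \
       --epochs 20 \
       --batch-size 32 \
       /workspace/data/tiny-imagenet-200
```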
159 | 
160 | > **WARNING**: We recommend you set up a highly available etcd server when
161 | > deploying multi-node jobs in production as this is the single
162 | > point of failure for your jobs. Depending on your use case
163 | > you can either sidecar an etcd server with each job or set up
164 | > a shared etcd server. If etcd does not meet your requirements
165 | > you can implement your own rendezvous handler and use our
166 | > APIs to create a custom launcher.
--------------------------------------------------------------------------------
/examples/bin/fetch_and_run:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Copyright (c) Facebook, Inc. and its affiliates.
 4 | # All rights reserved.
 5 | #
 6 | # This source code is licensed under the BSD-style license found in the
 7 | # LICENSE file in the root directory of this source tree.
 8 | 
 9 | import os
10 | import sys
11 | import tarfile as tar
12 | import tempfile
13 | from urllib.parse import urlparse
14 | 
15 | 
16 | """
17 | Fetches a script or tar.gz from s3 and runs it.
18 | 
19 | Usage:
20 | 
21 | fetch_and_run $HOME/my_script [