├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── documentation.md │ ├── feature-request.md │ └── questions-help-support.md └── PULL_REQUEST_TEMPLATE │ └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── aws ├── README.md ├── __init__.py ├── auth │ ├── __init__.py │ └── session.py ├── autoscaling.py ├── cfn │ └── setup.yml ├── cloudformation.py ├── config │ ├── sample_specs.json │ ├── user_data_rdzv │ └── user_data_worker ├── petctl.py ├── requirements.txt ├── s3.py └── util.py ├── azure ├── README.md ├── config │ ├── Dockerfile │ ├── kubernetes.json │ └── sample_specs.yaml ├── petctl.py └── util.py ├── design ├── kubernetes │ └── torchelastic-operator-design.md └── torchelastic │ └── 0.2.0 │ ├── design_doc.md │ ├── torchelastic_agent_diagram.jpg │ └── torchelastic_diagram.jpg ├── docs ├── Makefile ├── doc_push.sh ├── requirements.txt ├── source │ ├── _static │ │ └── img │ │ │ ├── efs-setup.jpg │ │ │ ├── pytorch-logo-dark.svg │ │ │ └── pytorch-logo-flame.png │ ├── conf.py │ ├── index.rst │ └── scripts │ │ └── create_redirect_md.py └── src │ └── pip-delete-this-directory.txt ├── examples ├── Dockerfile ├── README.md ├── bin │ ├── fetch_and_run │ └── install_etcd ├── imagenet │ └── main.py └── multi_container │ ├── Dockerfile │ ├── README.md │ ├── docker-compose.yaml │ └── echo.py ├── kubernetes ├── DEVELOPMENT.md ├── Dockerfile ├── Makefile ├── PROJECT ├── README.md ├── TROUBLESHOOTING.md ├── api │ └── v1alpha1 │ │ ├── constants.go │ │ ├── elasticjob_types.go │ │ ├── groupversion_info.go │ │ └── zz_generated.deepcopy.go ├── config │ ├── crd │ │ ├── bases │ │ │ └── elastic.pytorch.org_elasticjobs.yaml │ │ ├── kustomization.yaml │ │ └── kustomizeconfig.yaml │ ├── default │ │ └── kustomization.yaml │ ├── manager │ │ ├── kustomization.yaml │ │ └── manager.yaml │ ├── rbac │ │ ├── elasticjob_editor_role.yaml │ │ ├── elasticjob_viewer_role.yaml │ │ ├── kustomization.yaml │ │ ├── leader_election_role.yaml │ │ ├── leader_election_role_binding.yaml │ │ ├── role.yaml │ │ └── role_binding.yaml │ └── samples │ │ ├── classy-vision.yaml │ │ ├── etcd.yaml │ │ └── imagenet.yaml ├── controllers │ ├── elasticjob_controller.go │ ├── expectation.go │ ├── job.go │ ├── pod.go │ ├── service.go │ ├── suite_test.go │ └── util.go ├── go.mod ├── go.sum ├── hack │ └── boilerplate.go.txt └── main.go ├── requirements.txt ├── scripts └── formatter_python.sh ├── setup.py └── torchelastic ├── __init__.py └── distributed ├── __init__.py └── launch.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | .git 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/.github/ISSUE_TEMPLATE/bug-report.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/.github/ISSUE_TEMPLATE/documentation.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/.github/ISSUE_TEMPLATE/feature-request.md -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/questions-help-support.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/.github/ISSUE_TEMPLATE/questions-help-support.md -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pull_request_template.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/.gitignore -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/CHANGELOG.md -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/Dockerfile -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/README.md -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/__init__.py -------------------------------------------------------------------------------- /aws/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/README.md -------------------------------------------------------------------------------- /aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/__init__.py -------------------------------------------------------------------------------- /aws/auth/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/auth/__init__.py -------------------------------------------------------------------------------- /aws/auth/session.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/auth/session.py -------------------------------------------------------------------------------- /aws/autoscaling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/autoscaling.py -------------------------------------------------------------------------------- /aws/cfn/setup.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/cfn/setup.yml -------------------------------------------------------------------------------- /aws/cloudformation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/cloudformation.py -------------------------------------------------------------------------------- /aws/config/sample_specs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/config/sample_specs.json -------------------------------------------------------------------------------- /aws/config/user_data_rdzv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/config/user_data_rdzv -------------------------------------------------------------------------------- /aws/config/user_data_worker: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/config/user_data_worker -------------------------------------------------------------------------------- /aws/petctl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/petctl.py -------------------------------------------------------------------------------- /aws/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.9.148 2 | jinja2>=2.10 3 | -------------------------------------------------------------------------------- /aws/s3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/s3.py -------------------------------------------------------------------------------- /aws/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/aws/util.py -------------------------------------------------------------------------------- /azure/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/README.md -------------------------------------------------------------------------------- /azure/config/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/config/Dockerfile -------------------------------------------------------------------------------- /azure/config/kubernetes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/config/kubernetes.json -------------------------------------------------------------------------------- /azure/config/sample_specs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/config/sample_specs.yaml -------------------------------------------------------------------------------- /azure/petctl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/petctl.py -------------------------------------------------------------------------------- /azure/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/azure/util.py -------------------------------------------------------------------------------- /design/kubernetes/torchelastic-operator-design.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/design/kubernetes/torchelastic-operator-design.md -------------------------------------------------------------------------------- /design/torchelastic/0.2.0/design_doc.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/design/torchelastic/0.2.0/design_doc.md -------------------------------------------------------------------------------- /design/torchelastic/0.2.0/torchelastic_agent_diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/design/torchelastic/0.2.0/torchelastic_agent_diagram.jpg -------------------------------------------------------------------------------- /design/torchelastic/0.2.0/torchelastic_diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/design/torchelastic/0.2.0/torchelastic_diagram.jpg -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/doc_push.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/doc_push.sh -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/requirements.txt -------------------------------------------------------------------------------- /docs/source/_static/img/efs-setup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/_static/img/efs-setup.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/pytorch-logo-dark.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/_static/img/pytorch-logo-dark.svg -------------------------------------------------------------------------------- /docs/source/_static/img/pytorch-logo-flame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/_static/img/pytorch-logo-flame.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/conf.py -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/index.rst -------------------------------------------------------------------------------- /docs/source/scripts/create_redirect_md.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/source/scripts/create_redirect_md.py -------------------------------------------------------------------------------- /docs/src/pip-delete-this-directory.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/docs/src/pip-delete-this-directory.txt -------------------------------------------------------------------------------- /examples/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/Dockerfile -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/README.md -------------------------------------------------------------------------------- /examples/bin/fetch_and_run: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/bin/fetch_and_run -------------------------------------------------------------------------------- /examples/bin/install_etcd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/bin/install_etcd -------------------------------------------------------------------------------- /examples/imagenet/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/imagenet/main.py -------------------------------------------------------------------------------- /examples/multi_container/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/multi_container/Dockerfile -------------------------------------------------------------------------------- /examples/multi_container/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/multi_container/README.md -------------------------------------------------------------------------------- /examples/multi_container/docker-compose.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/multi_container/docker-compose.yaml -------------------------------------------------------------------------------- /examples/multi_container/echo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/examples/multi_container/echo.py -------------------------------------------------------------------------------- /kubernetes/DEVELOPMENT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/DEVELOPMENT.md -------------------------------------------------------------------------------- /kubernetes/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/Dockerfile -------------------------------------------------------------------------------- /kubernetes/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/Makefile -------------------------------------------------------------------------------- /kubernetes/PROJECT: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/PROJECT -------------------------------------------------------------------------------- /kubernetes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/README.md -------------------------------------------------------------------------------- /kubernetes/TROUBLESHOOTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/TROUBLESHOOTING.md -------------------------------------------------------------------------------- /kubernetes/api/v1alpha1/constants.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/api/v1alpha1/constants.go -------------------------------------------------------------------------------- /kubernetes/api/v1alpha1/elasticjob_types.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/api/v1alpha1/elasticjob_types.go -------------------------------------------------------------------------------- /kubernetes/api/v1alpha1/groupversion_info.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/api/v1alpha1/groupversion_info.go -------------------------------------------------------------------------------- /kubernetes/api/v1alpha1/zz_generated.deepcopy.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/api/v1alpha1/zz_generated.deepcopy.go -------------------------------------------------------------------------------- /kubernetes/config/crd/bases/elastic.pytorch.org_elasticjobs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/crd/bases/elastic.pytorch.org_elasticjobs.yaml -------------------------------------------------------------------------------- /kubernetes/config/crd/kustomization.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/crd/kustomization.yaml -------------------------------------------------------------------------------- /kubernetes/config/crd/kustomizeconfig.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/crd/kustomizeconfig.yaml -------------------------------------------------------------------------------- /kubernetes/config/default/kustomization.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/default/kustomization.yaml -------------------------------------------------------------------------------- /kubernetes/config/manager/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - manager.yaml 3 | -------------------------------------------------------------------------------- /kubernetes/config/manager/manager.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/manager/manager.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/elasticjob_editor_role.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/elasticjob_editor_role.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/elasticjob_viewer_role.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/elasticjob_viewer_role.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/kustomization.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/kustomization.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/leader_election_role.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/leader_election_role.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/leader_election_role_binding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/leader_election_role_binding.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/role.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/role.yaml -------------------------------------------------------------------------------- /kubernetes/config/rbac/role_binding.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/rbac/role_binding.yaml -------------------------------------------------------------------------------- /kubernetes/config/samples/classy-vision.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/samples/classy-vision.yaml -------------------------------------------------------------------------------- /kubernetes/config/samples/etcd.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/samples/etcd.yaml -------------------------------------------------------------------------------- /kubernetes/config/samples/imagenet.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/config/samples/imagenet.yaml -------------------------------------------------------------------------------- /kubernetes/controllers/elasticjob_controller.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/elasticjob_controller.go -------------------------------------------------------------------------------- /kubernetes/controllers/expectation.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/expectation.go -------------------------------------------------------------------------------- /kubernetes/controllers/job.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/job.go -------------------------------------------------------------------------------- /kubernetes/controllers/pod.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/pod.go -------------------------------------------------------------------------------- /kubernetes/controllers/service.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/service.go -------------------------------------------------------------------------------- /kubernetes/controllers/suite_test.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/suite_test.go -------------------------------------------------------------------------------- /kubernetes/controllers/util.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/controllers/util.go -------------------------------------------------------------------------------- /kubernetes/go.mod: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/go.mod -------------------------------------------------------------------------------- /kubernetes/go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/go.sum -------------------------------------------------------------------------------- /kubernetes/hack/boilerplate.go.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/hack/boilerplate.go.txt -------------------------------------------------------------------------------- /kubernetes/main.go: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/kubernetes/main.go -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/formatter_python.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/scripts/formatter_python.sh -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/setup.py -------------------------------------------------------------------------------- /torchelastic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/torchelastic/__init__.py -------------------------------------------------------------------------------- /torchelastic/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/torchelastic/distributed/__init__.py -------------------------------------------------------------------------------- /torchelastic/distributed/launch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pytorch/elastic/HEAD/torchelastic/distributed/launch.py --------------------------------------------------------------------------------