├── .dockerignore
├── .github
├── issue_template.md
└── pull_request_template.md
├── .gitignore
├── .pre-commit-config.yaml
├── .tools
├── codestyle
│ ├── .gitignore
│ ├── clang_format.hook
│ ├── copyright.py
│ └── docstring_checker.py
└── test_runner.py
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── FAQ.md
├── LICENSE
├── OWNERS.md
├── README.md
├── RELEASE.md
├── cmake
├── python.cmake
└── python_module.cmake
├── doc
├── Elastic Deep Learning Survey.pdf
├── ROADMAP.md
├── SUPPORT.md
├── boss_tutorial.md
├── build.md
├── checkpoint_based_edl.gif
├── distill.gif
├── edl_collective_design_doc.md
├── edl_collective_design_doc_cn.md
├── edl_design_doc.md
├── edl_design_doc_cn.md
├── edl_distill_design_doc.md
├── edl_live_fault_tolerance.md
├── experiment
│ └── distill_resnet50.md
├── fault_tolerance.md
├── fault_tolerance_cn.md
├── images
│ ├── edl-arch.png
│ ├── launcher.png
│ └── trainer.png
├── install.md
└── usage.md
├── docker
├── Dockerfile
├── Dockerfile.runtime
├── README.md
├── build-devel.sh
├── build-runtime.sh
├── dev_requirements.txt
├── release-run-time.sh
└── requirements.txt
├── example
├── collective
│ └── resnet50
│ │ ├── dali.py
│ │ ├── models
│ │ ├── __init__.py
│ │ ├── resnet.py
│ │ └── vgg.py
│ │ ├── scripts
│ │ └── train_gpu.sh
│ │ ├── train_pretrain.sh
│ │ ├── train_with_fleet.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── fp16_utils.py
│ │ ├── img_tool.py
│ │ ├── learning_rate.py
│ │ ├── reader_cv2.py
│ │ └── utility.py
├── ctr
│ ├── Dockerfile
│ ├── README
│ ├── ctr
│ │ ├── dumper.py
│ │ ├── kvtool.py
│ │ ├── save_program.py
│ │ └── train.py
│ ├── deploy_ctr_on_baidu_cloud_cn.rst
│ ├── k8s
│ │ ├── ctr.yaml
│ │ ├── cube.yaml
│ │ ├── ftp.yaml
│ │ ├── paddle-suite.sh
│ │ ├── paddle-suite.yaml
│ │ ├── pdclient.yaml
│ │ ├── pdserving.yaml
│ │ └── transfer.yaml
│ ├── ps-train
│ │ ├── pserver.yaml
│ │ └── trainer.yaml
│ ├── script
│ │ ├── README
│ │ ├── ctr.yaml
│ │ ├── cube.yaml
│ │ ├── defaultserviceaccountclusterrole.yaml
│ │ ├── fileserver.yaml
│ │ ├── ftp.yaml
│ │ ├── paddle-suite.sh
│ │ ├── paddle-suite.yaml
│ │ ├── pdclient.yaml
│ │ ├── pdserving.yaml
│ │ └── transfer.yaml
│ └── src
│ │ ├── baidu_cloud
│ │ ├── cluster-info.png
│ │ ├── concole.png
│ │ ├── conf-download.png
│ │ ├── ctr-models.png
│ │ ├── ctr-prediction-end-to-end-deployment.png
│ │ ├── ctr-running.png
│ │ ├── eip.png
│ │ ├── file_server.png
│ │ ├── helm-version.png
│ │ ├── kubectl-version.png
│ │ ├── load_balancer.png
│ │ ├── pserver-log.png
│ │ ├── tiller.png
│ │ ├── trainer-log.png
│ │ ├── volcano.png
│ │ ├── wget_example.png
│ │ └── workload.png
│ │ ├── create_gpu_machine.png
│ │ ├── create_image.png
│ │ ├── create_more_nodes.png
│ │ ├── ctr.png
│ │ ├── ctr_kubectl_download.png
│ │ ├── ctr_node.png
│ │ ├── ctr_pods.png
│ │ ├── ctr_pserver_log.png
│ │ ├── ctr_trainer_log.png
│ │ ├── ctr_volcano_install.png
│ │ ├── ctryaml1.png
│ │ ├── ctryaml2.png
│ │ ├── ctryaml3.png
│ │ ├── cube.png
│ │ ├── cube_config1.png
│ │ ├── cube_config2.png
│ │ ├── dist_train_demo.py
│ │ ├── dist_train_nccl2.graffle
│ │ ├── dist_train_nccl2.png
│ │ ├── dist_train_pserver.graffle
│ │ ├── dist_train_pserver.png
│ │ ├── file_server_pod.png
│ │ ├── file_server_svc.png
│ │ ├── overview.png
│ │ ├── paddleclient.png
│ │ ├── paddleserving_pod.png
│ │ ├── paddleserving_svc.png
│ │ ├── parallelism.png
│ │ ├── pyreader.png
│ │ ├── release.png
│ │ └── transfer.png
├── demo
│ └── collective
│ │ ├── README.md
│ │ ├── env.sh
│ │ ├── resnet50
│ │ └── package.sh
│ │ ├── start_job_client.sh
│ │ └── start_job_server.sh
├── distill
│ ├── README.md
│ ├── k8s
│ │ ├── balance.yaml
│ │ ├── edl_k8s
│ │ ├── etcd.yaml
│ │ ├── student.yaml
│ │ └── teacher.yaml
│ ├── mnist_distill
│ │ ├── README_CN.md
│ │ ├── image
│ │ │ └── infer_3.png
│ │ ├── run.sh
│ │ └── train_with_fleet.py
│ ├── nlp
│ │ ├── README.md
│ │ ├── distill.py
│ │ ├── fine_tune.py
│ │ ├── model.py
│ │ ├── reader.py
│ │ ├── test_distill.sh
│ │ ├── test_train.sh
│ │ └── train.py
│ ├── qps_tools
│ │ ├── distill_reader_qps.py
│ │ ├── parse_config.py
│ │ └── run.sh
│ ├── reader_demo
│ │ ├── distill_reader_demo.py
│ │ └── run_demo.sh
│ └── resnet
│ │ ├── README.md
│ │ ├── dali.py
│ │ ├── models
│ │ ├── __init__.py
│ │ ├── resnet.py
│ │ ├── resnet_vd.py
│ │ └── vgg.py
│ │ ├── scripts
│ │ ├── start_local_teacher.sh
│ │ └── train_student.sh
│ │ ├── train_with_fleet.py
│ │ └── utils
│ │ ├── __init__.py
│ │ ├── fp16_utils.py
│ │ ├── img_tool.py
│ │ ├── learning_rate.py
│ │ ├── reader_cv2.py
│ │ └── utility.py
└── fit_a_line
│ ├── Dockerfile
│ ├── collector.py
│ ├── collector.pyc
│ ├── del_jobs.sh
│ ├── examplejob.yaml
│ ├── fluid
│ ├── common.py
│ ├── fit_a_line.py
│ ├── image
│ │ ├── infer_3.png
│ │ └── ranges.png
│ └── recognize_digits.py
│ ├── nginx.yaml
│ ├── train_ft.py
│ └── train_local.py
├── k8s
├── edl_controller.yaml
├── k8s_tools.py
├── rbac_admin.yaml
└── thirdpartyresource.yaml
├── logo
├── edl.png
└── paddle.png
├── python
├── CMakeLists.txt
├── edl
│ ├── __init__.py
│ ├── collective
│ │ ├── __init__.py
│ │ ├── dataset.py
│ │ ├── distribute_reader.py
│ │ ├── launch.py
│ │ └── serializable.py
│ ├── discovery
│ │ ├── __init__.py
│ │ ├── consistent_hash.py
│ │ ├── etcd_client.py
│ │ ├── register.py
│ │ └── server_alive.py
│ ├── distill
│ │ ├── __init__.py
│ │ ├── balance_table.py
│ │ ├── discovery_client.py
│ │ ├── discovery_server.py
│ │ ├── distill_reader.py
│ │ ├── distill_worker.py
│ │ ├── redis
│ │ │ ├── __init__.py
│ │ │ ├── balance_server.py
│ │ │ ├── client.py
│ │ │ ├── redis_store.py
│ │ │ ├── server_register.py
│ │ │ └── service_table.py
│ │ ├── timeline.py
│ │ └── utils.py
│ ├── liveft
│ │ ├── __init__.py
│ │ ├── elastic.py
│ │ └── launch.py
│ ├── protos
│ │ ├── common.proto
│ │ ├── data_server.proto
│ │ ├── distill_discovery.proto
│ │ ├── generate.sh
│ │ ├── pod_server.proto
│ │ └── run_codegen.py
│ ├── tests
│ │ ├── __init__.py
│ │ └── unittests
│ │ │ ├── CMakeLists.txt
│ │ │ ├── __init__.py
│ │ │ ├── data_server
│ │ │ ├── a.txt
│ │ │ └── b.txt
│ │ │ ├── data_server_tmp.py
│ │ │ ├── del_from_etcd.py
│ │ │ ├── distill_reader_test.py
│ │ │ ├── edl_demo.py
│ │ │ ├── etcd_client_test.py
│ │ │ ├── etcd_test.sh
│ │ │ ├── etcd_test_base.py
│ │ │ ├── launch_demo.py
│ │ │ ├── master_client_test.py
│ │ │ ├── serving_conf
│ │ │ └── serving_client_conf.prototxt
│ │ │ ├── test_cluster.py
│ │ │ ├── test_cluster_generator.py
│ │ │ ├── test_cluster_watcher.py
│ │ │ ├── test_consistent_hash.py
│ │ │ ├── test_data_reader.py
│ │ │ ├── test_data_server.py
│ │ │ ├── test_distill_reader.sh
│ │ │ ├── test_etcd_client.sh
│ │ │ ├── test_file_list.txt
│ │ │ ├── test_launch.py
│ │ │ ├── test_launch.sh
│ │ │ ├── test_leader_pod.py
│ │ │ ├── test_pod.py
│ │ │ ├── test_redis_distill_reader.sh
│ │ │ ├── test_resource_pods.py
│ │ │ ├── test_state.py
│ │ │ └── test_train.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── args_utils.py
│ │ ├── client.py
│ │ ├── cluster.py
│ │ ├── cluster_generator.py
│ │ ├── cluster_watcher.py
│ │ ├── constants.py
│ │ ├── data_filter.py
│ │ ├── data_server.py
│ │ ├── data_server_client.py
│ │ ├── env.py
│ │ ├── error_utils.py
│ │ ├── etcd_db.py
│ │ ├── etcd_utils.py
│ │ ├── exceptions.py
│ │ ├── file_utils.py
│ │ ├── json_serializable.py
│ │ ├── launcher.py
│ │ ├── leader_pod.py
│ │ ├── log_utils.py
│ │ ├── network_utils.py
│ │ ├── pb_utils.py
│ │ ├── pod.py
│ │ ├── pod_server.py
│ │ ├── pod_server_client.py
│ │ ├── process.py
│ │ ├── reader.py
│ │ ├── register.py
│ │ ├── resource_pods.py
│ │ ├── state.py
│ │ ├── status.py
│ │ ├── string_utils.py
│ │ ├── train_process.py
│ │ ├── train_status.py
│ │ ├── trainer.py
│ │ └── unique_name.py
└── setup.py.in
└── scripts
├── build.sh
├── custom-boilerplate.go.txt
├── download_etcd.sh
└── run_build.sh
/.dockerignore:
--------------------------------------------------------------------------------
1 | *~
2 | vendor/
3 | .glide/
4 |
--------------------------------------------------------------------------------
/.github/issue_template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 训练(Training issue)
3 | about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training
4 | issue.
5 |
6 | ---
7 |
8 | 为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】
9 |
10 | 如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息:
11 | - 版本、环境信息:
12 | - PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.8或CommitID
13 | - EDL版本:请提供您的EDL版本号,例如0.3或CommitID
14 | - CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况
15 | - GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号
16 | - 系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本
17 | - 复现信息:如为报错,请给出复现环境、复现步骤
18 | - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段
19 |
20 | Thank you for contributing to EDL.
21 | Before submitting the issue, please search GitHub to check whether a similar issue has already been submitted or resolved.
22 | If there is no existing solution, please make sure that this is a training issue and include the following details:
23 |
24 | - System information:
25 | - PaddlePaddle version (e.g. 1.8) or commit
26 | - EDL version (e.g. 0.3) or commit
27 | - CPU: including CPU model and MKL/OpenBlas/MKLDNN usage
28 | - GPU: including GPU model and CUDA/CUDNN version
29 | - OS Platform (e.g. Mac OS 10.14) and Python version
30 | - To Reproduce:
31 | - steps to reproduce the behavior
32 | - Describe your current behavior
33 | - Code to reproduce the issue
34 |
35 | - Other info/logs
36 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | 非常感谢您给EDL项目提交PR!
2 | 在提交PR之前,请帮忙回答一下问题以帮助我们判断PR的意图:
3 | - 这个PR要解决什么问题?
4 | - 有没有对应的ISSUE?
5 | Fix
6 | - 这个PR里边有没有要注意的地方?
7 | - 这个PR改变了用户接口?
8 | - 其他的说明?
9 |
10 |
11 | You are welcome to submit a PR for EDL.
12 | Before this, would you like to answer some questions to help others to get what the PR does?
13 | - What this PR does / why we need it?
14 | - Which issue(s) this PR fixes?
15 | Fix
16 | - Special notes for your reviewer:
17 | - Does this PR introduce a user-facing change?
18 | - Additional documentation?
19 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | vendor/
3 | .glide/
4 | .vscode/
5 | *.pyc
6 | build/
7 | *.log
8 | resnet50_pod/
9 | .*.swp
10 | *_pb2.py
11 | *_pb2_grpc.py
12 | *.pb.go
13 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
# pre-commit hook configuration for the EDL repository.
# The .tools/, example/ and k8s/ trees are excluded from most Python/shell checks.
repos:
# Python code formatter.
-   repo: https://github.com/psf/black
    rev: 19.10b0
    hooks:
    -   id: black
        exclude: ^(\.tools\/|example\/|k8s\/)
# Regex-based Python lint hooks.
-   repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.5.1
    hooks:
    -   id: python-use-type-annotations
        exclude: ^(\.tools\/|example\/|k8s\/)
# Generic file hygiene hooks.
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v3.2.0
    hooks:
    -   id: trailing-whitespace
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
    -   id: check-docstring-first
    -   id: check-json
    -   id: check-added-large-files
    -   id: debug-statements
        exclude: ^(\.tools\/|example\/|k8s\/)
    -   id: requirements-txt-fixer
    -   id: check-merge-conflict
    -   id: check-symlinks
    -   id: detect-private-key
    -   id: end-of-file-fixer
# Runs the in-repo copyright script on source files.
-   repo: local
    hooks:
    -   id: copyright_checker
        name: copyright_checker
        entry: python .tools/codestyle/copyright.py
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$
# flake8 is hosted on GitHub; the former GitLab location is no longer served.
-   repo: https://github.com/PyCQA/flake8
    rev: 3.8.3
    hooks:
    -   id: flake8
        exclude: ^(\.tools\/|example\/|k8s\/)
        args: ['--max-line-length=100', '--extend-ignore=E203']
# Uses the shellcheck binary installed on the developer's machine.
-   repo: local
    hooks:
    -   id: shellcheck
        name: shellcheck
        entry: shellcheck
        language: system
        # The dot is escaped so only real "*.sh" files match (".sh$" would also
        # match names such as "foosh").
        files: \.sh$
        exclude: ^(\.tools\/|example\/|k8s\/)
48 |
--------------------------------------------------------------------------------
/.tools/codestyle/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 |
--------------------------------------------------------------------------------
/.tools/codestyle/clang_format.hook:
--------------------------------------------------------------------------------
#!/bin/bash
# Wrapper around clang-format: refuses to run unless the installed
# clang-format reports a version string containing $VERSION, then forwards
# all arguments to clang-format unchanged.
set -e

readonly VERSION="3.8"

version=$(clang-format -version)

if ! [[ $version == *"$VERSION"* ]]; then
    echo "clang-format version check failed."
    echo "a version contains '$VERSION' is needed, but get '$version'"
    echo "you can install the right version, and make an soft-link to '\$PATH' env"
    # Exit statuses must be in the range 0-255; "-1" is not a valid status.
    exit 1
fi

# Quote "$@" so file names containing spaces or glob characters are passed
# through as single, unmodified arguments.
clang-format "$@"
16 |
--------------------------------------------------------------------------------
/.tools/test_runner.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | import unittest
18 | import os
19 | import sys
20 | import paddle.fluid as fluid
21 | import importlib
22 | from six.moves import cStringIO
23 |
24 |
def main():
    """Run every unittest module named on the command line.

    Each module name in ``sys.argv[1:]`` is imported and its tests are run
    inside freshly created fluid ``Program`` objects, a fresh ``Scope`` and a
    unique-name guard. The runner output of each module is captured in a
    buffer and is only printed (to stderr) when that module has failures.

    Raises:
        SystemExit: with exit code 1 if at least one module had a failing test.
    """
    # Allow test modules located in the current working directory to be imported.
    sys.path.append(os.getcwd())
    some_test_failed = False
    for module_name in sys.argv[1:]:
        buffer = cStringIO()
        # Named *_program so they do not shadow this function's own name.
        main_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.core.Scope()
        with fluid.program_guard(main_program, startup_program):
            with fluid.scope_guard(scope):
                with fluid.unique_name.guard():
                    test_loader = unittest.TestLoader()
                    module = importlib.import_module(module_name)
                    tests = test_loader.loadTestsFromModule(module)
                    res = unittest.TextTestRunner(stream=buffer).run(tests)
                    if not res.wasSuccessful():
                        some_test_failed = True
                        print(
                            module_name,
                            'failed\n',
                            buffer.getvalue(),
                            file=sys.stderr)

    if some_test_failed:
        # sys.exit is always available; the bare exit() builtin is only
        # injected by the site module and is meant for interactive use.
        sys.exit(1)


if __name__ == '__main__':
    main()
54 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.0)

# Make the helper modules in ./cmake visible to include().
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Project-wide locations.
set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
set(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")

project(edl)

# Build switches.
option(WITH_TESTING "Compile EDL with unit testing" ON)
option(WITH_COVERAGE "Compile EDL with code coverage" OFF)
# NOTE(review): option() declares a boolean cache entry although PY_VERSION
# carries a version string -- confirm whether a CACHE STRING was intended.
option(PY_VERSION "Compile EDL with python3 support" ${PY_VERSION})

# Fall back to Python 2.7 when no version was requested.
if(NOT PY_VERSION)
    set(PY_VERSION 2.7)
endif()

include(python)

if(WITH_TESTING)
    enable_testing()
endif()

add_subdirectory(python)
25 |
--------------------------------------------------------------------------------
/FAQ.md:
--------------------------------------------------------------------------------
1 | ## Frequently asked questions
2 | - What is EDL?
3 | - Computing resources on clouds such as Amazon AWS and Baidu Cloud are multi-tenant. Deep learning model training and inference with elastic resources will become common on the cloud. We propose Elastic Deep Learning (EDL), which makes training and inference of deep learning models on the cloud easier and more efficient.
4 |
--------------------------------------------------------------------------------
/OWNERS.md:
--------------------------------------------------------------------------------
1 | ## Owner:
2 | The EDL project aims to support PaddlePaddle's distributed training. The current owner and contributors are as follows:
3 | - Owner: [guru4elephant](https://github.com/guru4elephant)
4 |
5 | ## Contributors:
6 | - [Yancey1989](https://github.com/Yancey1989)
7 | - [gongweibao](https://github.com/gongweibao)
8 | - [helinwang](https://github.com/helinwang)
9 | - [typhoonzero](https://github.com/typhoonzero)
10 | - [putcn](https://github.com/putcn)
11 | - [m3ngyang](https://github.com/m3ngyang)
12 | - [wangkuiyi](https://github.com/wangkuiyi)
13 | - [qizheng09](https://github.com/qizheng09)
14 | - [wangjiawei04](https://github.com/wangjiawei04)
15 | - [wopeizl](https://github.com/wopeizl)
16 | - [drinktee](https://github.com/drinktee)
17 | - [wanghaoshuang](https://github.com/wanghaoshuang)
18 | - [denkensk](https://github.com/denkensk)
19 | - [tizhou86](https://github.com/tizhou86)
20 | - [luotao1](https://github.com/luotao1)
21 | - [gangliao](https://github.com/gangliao)
22 |
--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
1 | # Release Note
2 |
3 | Please see the [releases page](https://github.com/PaddlePaddle/edl/releases) for the release notes.
4 |
--------------------------------------------------------------------------------
/cmake/python.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Provides find_python_module(), which is used near the end of this file.
include(python_module)

# Locate the interpreter and the development libraries for the requested version.
find_package(PythonInterp ${PY_VERSION} REQUIRED)
find_package(PythonLibs ${PY_VERSION} REQUIRED)

if(WIN32)
    # Ask the interpreter for its install prefix and its library version
    # suffix; each value is printed on a line of its own.
    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
        "from distutils import sysconfig as s;import sys;import struct;
print(sys.prefix);
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
"
        RESULT_VARIABLE _PYTHON_SUCCESS
        OUTPUT_VARIABLE _PYTHON_VALUES
        ERROR_VARIABLE _PYTHON_ERROR_VALUE)

    # Give up on this file if the interpreter could not be queried.
    if(NOT _PYTHON_SUCCESS MATCHES 0)
        set(PYTHONLIBS_FOUND FALSE)
        return()
    endif()

    # Turn the process output into a CMake list: escape literal semicolons,
    # then use the newlines as list separators.
    string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
    string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
    list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
    list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)

    # Normalise every directory separator to '/'.
    string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})

    set(PYTHON_LIBRARY
        "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")

    # Inside a venv, PYTHON_PREFIX points at the venv while the libraries stay
    # in the original installation; try the location next to PYTHON_INCLUDE_DIR.
    if(NOT EXISTS "${PYTHON_LIBRARY}")
        get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
        set(PYTHON_LIBRARY
            "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
    endif()

    # Still missing: abort the configuration.
    if(NOT EXISTS "${PYTHON_LIBRARY}")
        message(FATAL_ERROR "Python libraries not found")
    endif()
    set(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
endif()

# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
# Expose the Python library as an imported target named "python".
add_library(python SHARED IMPORTED GLOBAL)
set_property(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})

set(py_env "")
# pip and wheel are mandatory whenever an interpreter was found.
if(PYTHONINTERP_FOUND)
    find_python_module(pip REQUIRED)
    find_python_module(wheel REQUIRED)
endif()
include_directories(${PYTHON_INCLUDE_DIR})
# NOTE(review): PYTHON_NUMPY_INCLUDE_DIR is not set anywhere in this file --
# confirm that it is defined elsewhere.
include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
73 |
--------------------------------------------------------------------------------
/cmake/python_module.cmake:
--------------------------------------------------------------------------------
1 | # Find if a Python module is installed
2 | # Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html
3 | # To use do: find_python_module(PyQt4 REQUIRED)
# find_python_module(<module> [REQUIRED])
#
# Checks whether <module> can be imported by ${PYTHON_EXECUTABLE}.
#   PY_<MODULE>          (cache)        location of the module, when importable
#   PY_<MODULE>_FOUND    (parent scope) result of the standard found-check
#   PY_<MODULE>_VERSION  (parent scope) value of <module>.__version__, when
#                                       it could be read
# With REQUIRED as the second argument a missing module is a fatal error.
function(find_python_module module)
    # find_package_handle_standard_args() is defined by this standard module.
    # Include it here so the function does not depend on some earlier
    # find_package() call having loaded it already.
    include(FindPackageHandleStandardArgs)

    string(TOUPPER ${module} module_upper)
    # Skip the lookup when a location is already known (e.g. from the cache).
    if(NOT PY_${module_upper})
        if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
            set(${module}_FIND_REQUIRED TRUE)
        else()
            set(${module}_FIND_REQUIRED FALSE)
        endif()
        # A module's location is usually a directory, but for binary modules
        # it's a .so file.
        execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
            "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))"
            RESULT_VARIABLE _${module}_status
            OUTPUT_VARIABLE _${module}_location
            ERROR_QUIET
            OUTPUT_STRIP_TRAILING_WHITESPACE)
        # A zero exit status means the import succeeded.
        if(NOT _${module}_status)
            set(PY_${module_upper} ${_${module}_location} CACHE STRING
                "Location of Python module ${module}")
        endif()
    endif()
    find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper})
    if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
        message(FATAL_ERROR "python module ${module} is not found")
    endif()

    # Query the module's version; errors are silenced and simply leave the
    # version variable unset.
    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
        OUTPUT_VARIABLE _${module}_version
        RESULT_VARIABLE _${module}_status
        ERROR_QUIET
        OUTPUT_STRIP_TRAILING_WHITESPACE)
    if(NOT _${module}_status)
        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
            "Version of Python module ${module}")
    endif()

    # Publish the results to the caller's scope.
    set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
endfunction(find_python_module)
44 |
--------------------------------------------------------------------------------
/doc/Elastic Deep Learning Survey.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/Elastic Deep Learning Survey.pdf
--------------------------------------------------------------------------------
/doc/ROADMAP.md:
--------------------------------------------------------------------------------
1 | ## 2020-03 Release 0.2.0
2 |
3 | ### Elastic Inference
4 | - Release features for inference with paddle serving on EDL
5 |
6 | ### Elastic Training with Checkpoint
7 | - Release features for training with paddlepaddle based on checkpoint
8 | - Verify the correctness of at least one training task.
9 |
10 | ### Release 0.2.0
11 | - Official release 0.2.0 with features of Elastic Inference and Checkpoint based Elastic Training
12 |
13 | ## 2020-06 Release 0.3.0
14 |
15 | ### EDL High Level API Design
16 | - Support User defined Training Data and Parameter Adaptation API
17 | - A user can define what to adapt when computing resources are adjusted
18 |
19 | ### Release Verified Model Training Scripts based on EDL API
20 | - NLP models and CV models trained based on GPU will be verified.
21 | Bert/Ernie and Resnet50 for classification will be considered currently.
22 |
23 | ### Release 0.3.0
24 | - Features above will be released
25 |
26 | ## 2020-09 Release 0.4.0
27 |
28 | ### Online Training
29 | - Support Elastic Online Training Solution with resources allocated dynamically along the training timeline
30 | - Recommendation scenarios should be considered on high priority, verified model will be released
31 |
32 | ### EDL API upgraded
33 | - Update the EDL API to support online elastic deep learning
34 |
35 | ### Release 0.4.0
36 | - Features above will be released
37 |
38 | ## 2020-12
39 | - More Application on EDL
40 | - More machine learning tools integrated.
41 |
--------------------------------------------------------------------------------
/doc/SUPPORT.md:
--------------------------------------------------------------------------------
1 | To get support for EDL and participate in the discussions, please join one or more of the appropriate mailing lists below:
2 |
3 | * [EDL Announce Mailing List](https://lists.lfai.foundation/g/edl-announce)
4 | * [EDL Technical Discussion](https://lists.lfai.foundation/g/edl-technical-discuss)
5 | * [EDL Technical Steering Committee](https://lists.lfai.foundation/g/edl-tsc)
6 |
--------------------------------------------------------------------------------
/doc/build.md:
--------------------------------------------------------------------------------
1 | # How to Build EDL Component
2 |
3 | This article contains instructions for building the EDL components and for packing them into
4 | a Docker image so that they can run in a Kubernetes cluster.
5 |
6 | ## Build EDL Controller
7 |
8 | ```bash
9 | glide install --strip-vendor
10 | go build github.com/paddlepaddle/edl/cmd/edl
11 | ```
12 |
13 | The above step will generate a binary file named `edl` which should
14 | run as a daemon process on the Kubernetes cluster.
15 |
16 | ## Build EDL Controller Image
17 |
18 | To build your own docker images, run the following command:
19 |
20 | ```bash
21 | docker build -t yourRepoName/edl-controller .
22 | ```
23 |
24 | This command will take the `Dockerfile`, build the EDL docker image and tag it as `yourRepoName/edl-controller`
25 |
26 | Next, push the image to your Docker Hub repository so that the Kubernetes cluster is able to pull and deploy it.
27 |
28 | ``` bash
29 | docker push yourRepoName/edl-controller
30 | ```
31 |
--------------------------------------------------------------------------------
/doc/checkpoint_based_edl.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/checkpoint_based_edl.gif
--------------------------------------------------------------------------------
/doc/distill.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/distill.gif
--------------------------------------------------------------------------------
/doc/edl_collective_design_doc_cn.md:
--------------------------------------------------------------------------------
1 | # 概述
2 | Collective通信(同步)模式的训练因为其精度稳定、好复现的特点广泛用于图像、文本、语音等深度学习领域。
3 | 本文将阐述Collective通信模式下的EDL的设计思路和方法
4 |
5 | # 难点
6 | 当用户把自己的单机程序改成多机程序,他的程序需要增加的唯一的超参是节点的个数,由此带来的程序的改动可能会有几个,如:数据如何切分;batchsize、learning rate类的调整。
7 | 当用户把多机程序改成可以适应EDL的程序,需要在节点个数的基础上增加另外一个考虑:数据的一致性的问题。需要保证各处的跟节点数目相关的参数在节点变化的时候多个节点间数据是一致的。我们需要在框架端把这个考虑带来的影响减少到最小。
8 |
9 | 这带来几个难点问题:
10 |
11 | 1. 如何保存Python端的用户逻辑.
12 | 如数据如何切分、 文件的位置、及其他的Paddle框架之外的参数等。
13 | 这些参数是比较自由的、用户自定义的,我们在训练引擎端无法控制的。所以我们采用stop-resume的方式解决,用户程序面对新的超参只有节点个数一个。
14 |
15 | 2. 如何尽可能的保证精度、结果可复现。
16 | 训练的任务提交之前,用户需要指定自己的训练节点的最小和最大的节点的个数,同时需要指定batchsize是保持不变还是随着节点数目线性增长,因为batchsize是精度相关的超参,有些模型超过了一定阈值就需要做额外的调整,如Resnet50 total batchsize 超过8K的时候需要对学习率做额外的调整。
17 | 但是,保持总的batchsize不变也会带来扩展的效率问题:单卡batchsize减少,训练的性能可能会降低。
18 | 考虑到上述两个问题,这个地方需要用户自己根据节点的个数和自己的模型的特点做决定。
19 |
20 | 3. 如何让用户的程序改动少。
21 | stop-resume的方式是需要`save_checkpoint`和`load_checkpoint`的时机。因为需要用户在Python端的显式调用,这部分很难隐藏到接口里边去。
22 | 除了这个之外,其他无改动。
23 |
24 | 4. 如何对接多个集群。
25 | Kubernetes虽然用的越来越多,但是实际生产中会有多种类型的在线、离线集群。为了和这些集群对接,我们提出了一个中间层:Jobserver。
26 | 用这个模块来对接各种各样的集群接口。
27 |
28 | 5. 如何防止没有意义的调度。
29 | - 当一个训练任务临近结束的时候其实是没有必要进行伸缩的,这个时候的伸缩反而会降低效率
30 | - 某些场景下,需要优先scale资源利用率高的作业而不是利用率低的,这有利于整体吞吐量的提升。
31 | 考虑到上述的原因,Paddle需要把作业的性能统计信息传递给调度端以便调度进行决策.
32 |
33 | 可能会有多种需要考虑的场景,而不仅仅是上述的两个。Paddle(计算引擎)需要把训练节点的信息汇报给调度端,以便于调度端做调度的决策。
34 |
35 | 6. 如何做数据的切分。
36 | 节点的变化一般会带来数据切分方式的变化。这就需要用户对数据有全局观。要么用户把数据全部下载下来,要么采用mount一个分布式的文件系统(如Ceph等)的方式。
37 |
38 | # 方案设计
39 | ## 架构图
40 |
41 |
42 | ## Launcher module
43 |
44 | Launcher模块主要负责多个trainer端的协调
45 |
46 | ## Trainer module
47 |
.
48 | Trainer模块主要负责EDL功能里边的`save_checkpoint` `load_checkpoint`
49 |
--------------------------------------------------------------------------------
/doc/edl_design_doc.md:
--------------------------------------------------------------------------------
1 | # Design Doc: Elastic Deep Learning
2 |
3 | TBD
4 |
--------------------------------------------------------------------------------
/doc/edl_design_doc_cn.md:
--------------------------------------------------------------------------------
1 | #
2 |
--------------------------------------------------------------------------------
/doc/edl_distill_design_doc.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | Knowledge distillation, introduced in *Distilling the Knowledge in a Neural Network* [[1]](#r_1), is a type of training used to transfer the knowledge from cumbersome models (teachers) to a small model (student) that is more suitable for deployment.
3 |
4 | EDL Distillation is a large scale and universal solution for knowledge distillation.
5 |
6 | - Decouple the teacher and student models
7 | - They can run in the same or different nodes and transfer knowledge via network even on heterogeneous machines.
8 | Taking distillation of ResNet50 as an example: the teachers (ResNet101, for example) can be deployed on P4 GPU cards since they generally only compute the forward network, while the student can be deployed on V100 GPU cards since it needs more GPU memory.
9 |
10 | - It's flexible and efficient.
11 | - Teachers and students can be adjusted elastically in training by the resource utilization
12 | - Easier to use and deploy.
13 | - Few lines need to change.
14 | - End to end use. We release the Kubernetes' deployment solution for you.
15 |
16 | # Design
17 | ## Architecture
18 | ## Student
19 | ## Teacher
20 | ## Reader
21 | ## Balancer
22 |
23 | ## Reference
24 | 1. <a name="r_1"></a>[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf)
25 |
--------------------------------------------------------------------------------
/doc/experiment/distill_resnet50.md:
--------------------------------------------------------------------------------
1 | # Distill experiment on resnet50
2 | TBD
3 |
--------------------------------------------------------------------------------
/doc/fault_tolerance.md:
--------------------------------------------------------------------------------
1 | # Fault tolerance for sync training
2 | ## Design
3 | During training, one or more trainers may crash. We use checkpoints to continue training.
4 |
5 | There are several design questions to consider:
6 |
7 | 1. How does Paddle save checkpoint itself?
8 | Paddle implements `save_persistables` to save all persistable variables.
9 |
10 | 2. How to save user's Python frontend logic?
11 | Such as current epoch number, step number in an epoch, and the data slice and offset and so on.
12 |
13 | 3. How to save checkpoints?
14 | - Which trainer saves the checkpoint?
15 | If there are many trainers, the trainer whose `rank` == 0 saves it.
16 |
17 | - Where do we save the checkpoint?
18 | It can be saved to the local file system, but eventually, it should be saved to a file-system that can be seen by all trainers such as a distributed HDFS.
19 |
20 | - How to guarantee the checkpoint's integrity and correctness?
21 | Saving a file is a multi-step process rather than an atomic action, whereas file-system operations such as `rm`, `rename` and `mv` should be atomic.
22 | We rely on this: a checkpoint is never modified once it has been written, and every checkpoint is saved to the file system with an incrementing version number. The interface first writes a temporary checkpoint file and then `rename`s it to a valid checkpoint name once writing has finished.
23 |
24 | - When is the checkpoint saved?
25 | Currently the trainer saves a checkpoint every epoch, so it does not need to save the data offset, which keeps things simple. Of course, this approach is unfriendly when a single epoch takes a very long time. We will implement a step-level (time-limited) checkpoint interface in the next version.
26 |
27 | ## Interface
28 | There are two interfaces, `save_check_point` and `load_check_point`, to save and load a checkpoint.
29 | Two of their arguments deserve attention:
30 |
31 | 1. fs:
32 | It's an abstract interface to the file system, and there are two implementations: the local file system and HDFS.
33 | You can implement the member functions of this class for your own file system in order to use it with the checkpoint interface.
34 |
35 | 2. train_status:
36 | Currently it has only one member variable, `epoch_no`; more variables will be added after version 0.2.
37 |
38 | ## Example
39 | 1. save_check_point:
40 |
41 | ```
42 | if trainer_id == 0:
43 | saved_status = TrainStatus(pass_id)
44 | if args.checkpoint:
45 | if not os.path.isdir(args.checkpoint):
46 | os.makedirs(args.checkpoint)
47 |
48 | print("save_check_point:{}".format(args.checkpoint))
49 | fleet.save_check_point(executor=exe, train_status=saved_status,
50 | path=args.checkpoint, fs=fs)#, main_program=fleet._origin_program)
51 | ```
52 |
53 | 2. load_check_point:
54 |
55 | ```
56 | if args.checkpoint is not None:
57 | tmp_s = fleet.load_check_point(exe, args.checkpoint, fs=fs, trainer_id=trainer_id)
58 | if tmp_s is not None:
59 | train_status = tmp_s
60 |
61 | for pass_id in range(train_status.next(), params["num_epochs"]):
62 | train()
63 | ```
64 |
65 | # Async training
66 | TBD
67 |
--------------------------------------------------------------------------------
/doc/fault_tolerance_cn.md:
--------------------------------------------------------------------------------
1 | # 同步训练的FaultTolerance
2 | ## 设计思路
3 | 在训练的过程中我们可能会碰到因为各种的问题造成的训练单个(或者多个)trainer挂掉的问题。我们采用checkpoint的方式记录当前状态,保证重启之后训练任务能够正常运行。
4 | 这里边可能有几个地方需要考虑:
5 |
6 | 1. Paddle本身的checkpoint
7 | Paddle本身提供`save_persistables`保存所有持久的变量。
8 |
9 | 2. 用户python端逻辑的checkpoint问题
10 | 主要是当前epoch number,数据切分方法和位置等。
11 |
12 | 3. checkpoint保存的问题
13 | - 谁来保存
14 | 如果有多个trainer节点,我们一般会选择rank=0的trainer来负责保存checkpoint
15 |
16 | - 保存的位置
17 | 可以保存到本地,但是最终要保存到重启任务能够看到的文件系统里边,如分布式的HDFS文件系统。
18 |
19 | - 如何确保checkpoint的正确性
20 | 保存文件是一个持续性的过程,不是一个原子性的过程,不能保证事务性。但是一般的文件系统的操作`mv` `rename` `rm` 是原子的。
21 | 可以利用这个特点,对已经保存的checkpoint不变,递增当前的 checkpoint的版本号,先写入一个临时文件,完成之后再rename成一个有效文件名的checkpoint。
22 |
23 | - 何时保存
24 | 我们现在推荐的方式是每一个epoch保存一次。因为一个epoch完成之后,可以认为两个epoch数据上没有关系。这样我们只需要保存当前的epoch号就可以了,不用保存当前的文件逻辑切分和位置等。减少了复杂度。当然,这种方式对单个epoch耗时过长的任务不友好。我们准备在以后的版本开发step级别(时间)的checkpoint。
25 |
26 | ## 接口介绍
27 | Paddle提供`save_check_point`和`load_check_point`两种方式来存、读checkpoint。
28 | 其中有两个参数需要注意一下:
29 | 1. fs
30 | 这个是我们对文件系统的抽象,目前的实现有两种:本地和远程HDFS。您可以实现自己的`FS`类来实现保存和读取checkpoint的功能
31 |
32 | 2. train_status
33 | 目前该类只有`epoch_no`的类变量,0.2以后的版本将尝试增加用户自定义的member等更多的值。
34 |
35 | ## 使用样例
36 | 1. save_check_point的样例:
37 |
38 | ```
39 | if trainer_id == 0:
40 | saved_status = TrainStatus(pass_id)
41 | if args.checkpoint:
42 | if not os.path.isdir(args.checkpoint):
43 | os.makedirs(args.checkpoint)
44 |
45 | print("save_check_point:{}".format(args.checkpoint))
46 | fleet.save_check_point(executor=exe, train_status=saved_status,
47 | path=args.checkpoint, fs=fs)#, main_program=fleet._origin_program)
48 | ```
49 |
50 | 2. load_check_point的样例:
51 |
52 | ```
53 | if args.checkpoint is not None:
54 | tmp_s = fleet.load_check_point(exe, args.checkpoint, fs=fs, trainer_id=trainer_id)
55 | if tmp_s is not None:
56 | train_status = tmp_s
57 | ```
58 |
59 |
60 | # 异步训练的FaultTolerance
61 | TBD
62 |
--------------------------------------------------------------------------------
/doc/images/edl-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/edl-arch.png
--------------------------------------------------------------------------------
/doc/images/launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/launcher.png
--------------------------------------------------------------------------------
/doc/images/trainer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/trainer.png
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM hub.baidubce.com/paddlepaddle/paddle:
2 |
3 | # gcc 5
4 | RUN ln -sf /usr/bin/gcc-5 /usr/bin/gcc
5 | # python3 default use python3.7
6 | RUN ln -sf /usr/local/bin/python3.7 /usr/local/bin/python3
7 |
8 | # Install Go
9 | RUN rm -rf /usr/local/go && wget -qO- https://dl.google.com/go/go1.13.10.linux-amd64.tar.gz | \
10 | tar -xz -C /usr/local && \
11 | mkdir -p /root/gopath && \
12 | mkdir -p /root/gopath/bin && \
13 | mkdir -p /root/gopath/src
14 | ENV GOROOT=/usr/local/go GOPATH=/root/gopath
15 | # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
16 | ENV PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin
17 |
18 | # python
19 | ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt
20 | RUN python3.7 -m pip install pip==20.1.1
21 | RUN python3.7 -m pip install --upgrade setuptools
22 | RUN python3.7 -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
23 |
24 | ADD ./docker/dev_requirements.txt /root/paddle_edl/dev_requirements.txt
25 | RUN python3.7 -m pip install -r /root/paddle_edl/dev_requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
26 |
27 | # python 2.7 is deprecated
28 | # RUN python -m pip install pip==20.1.1
29 | # RUN python -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
30 |
31 | #etcd
32 | ENV HOME /root
33 | WORKDIR /root/paddle_edl
34 | ADD ./scripts/download_etcd.sh /root/paddle_edl/download_etcd.sh
35 | RUN bash /root/paddle_edl/download_etcd.sh
36 |
37 | # Install redis
38 | RUN cd /tmp/ && wget -q https://paddle-edl.bj.bcebos.com/redis-6.0.1.tar.gz && \
39 | tar xzf redis-6.0.1.tar.gz && \
40 | cd redis-6.0.1 && make -j && \
41 | mv src/redis-server /usr/local/bin && \
42 | mv src/redis-cli /usr/local/bin && \
43 | cd .. && rm -rf redis-6.0.1.tar.gz redis-6.0.1
44 |
45 |
46 | # protoc: downloaded over verified HTTPS (no --no-check-certificate), like the other downloads in this file
47 | RUN mkdir -p /tmp/protoc && cd /tmp/protoc && \
48 |     wget -q -O protoc-3.11.4-linux-x86_64.zip https://paddle-edl.bj.bcebos.com/protoc-3.11.4-linux-x86_64.zip && \
49 |     unzip protoc-3.11.4-linux-x86_64.zip && mv bin/protoc /usr/local/bin
50 |
51 | RUN echo "export PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin" >> /root/.bashrc
52 | RUN echo "go env -w GO111MODULE=on && go env -w GOPROXY=https://goproxy.io,direct" >> /root/.bashrc
53 | ENV GO111MODULE=on
54 | ENV GOPROXY=https://goproxy.io,direct
55 |
56 | RUN rm -f /usr/bin/python /usr/bin/pip /usr/local/bin/pip && \
57 | ln -s /usr/local/bin/python3.7 /usr/bin/python && \
58 | ln -s /usr/local/bin/pip3.7 /usr/bin/pip && \
59 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip
60 |
61 | RUN apt-get update && apt-get install -y shellcheck clang-format-3.8
62 |
--------------------------------------------------------------------------------
/docker/Dockerfile.runtime:
--------------------------------------------------------------------------------
1 | FROM hub.baidubce.com/paddlepaddle/paddle:
2 |
3 | # gcc 5
4 | RUN ln -sf /usr/bin/gcc-5 /usr/bin/gcc
5 | # python3 default use python3.6
6 | RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3
7 |
8 | # Install Go
9 | RUN rm -rf /usr/local/go && wget -qO- https://dl.google.com/go/go1.13.10.linux-amd64.tar.gz | \
10 | tar -xz -C /usr/local && \
11 | mkdir -p /root/gopath && \
12 | mkdir -p /root/gopath/bin && \
13 | mkdir -p /root/gopath/src
14 | ENV GOROOT=/usr/local/go GOPATH=/root/gopath
15 | # Keep this in a separate ENV from the GOROOT/GOPATH definition: variables defined by an ENV instruction are only substituted in later instructions.
16 | ENV PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin
17 |
18 | ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt
19 | RUN python -m pip install pip==20.1.1
20 | RUN python3.6 -m pip install pip==20.1.1
21 | RUN python3.6 -m pip install --upgrade setuptools
22 | RUN python -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
23 | RUN python3.6 -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
24 |
25 | # etcd
26 | ENV HOME /root
27 | WORKDIR /root/paddle_edl
28 | ADD ./scripts/download_etcd.sh /root/paddle_edl/download_etcd.sh
29 | RUN bash /root/paddle_edl/download_etcd.sh
30 |
31 | # Install redis
32 | RUN cd /tmp/ && wget -q https://paddle-edl.bj.bcebos.com/redis-6.0.1.tar.gz && \
33 | tar xzf redis-6.0.1.tar.gz && \
34 | cd redis-6.0.1 && make -j && \
35 | mv src/redis-server /usr/local/bin && \
36 | mv src/redis-cli /usr/local/bin && \
37 | cd .. && rm -rf redis-6.0.1.tar.gz redis-6.0.1
38 |
39 | RUN echo "export PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin" >> /root/.bashrc
40 | RUN echo "go env -w GO111MODULE=on && go env -w GOPROXY=https://goproxy.io,direct" >> /root/.bashrc
41 | ENV GO111MODULE=on
42 | ENV GOPROXY=https://goproxy.io,direct
43 |
44 | # install edl
45 | ADD ./build/python/dist/paddle_edl-0.0.0-py2.py3-none-any.whl /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl
46 | RUN python -m pip install /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl
47 | RUN python3.6 -m pip install /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl
48 | RUN rm -f /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl
49 |
50 | # add example
51 | ADD ./example /root/paddle_edl/example
52 | ADD ./k8s/k8s_tools.py ./example/distill/k8s/edl_k8s /root/paddle_edl/
53 |
54 | # add mnist distill teacher model
55 | RUN cd /root/paddle_edl/example/distill/mnist_distill && \
56 | wget -q https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz && \
57 | tar xzf mnist_cnn_model.tar.gz
58 |
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | ## Build Runtime Docker Image for Kubernetes
2 |
3 | For distributed training jobs on Kubernetes, we package the Paddle binary files and some tools for Kubernetes into a runtime Docker image. Kubernetes schedules this runtime image to run during training.
4 |
5 | You can build CPU and GPU runtime Docker images, each based on a different PaddlePaddle Docker image:
6 |
7 | ```bash
8 | ./build_docker.sh
9 | ```
10 |
11 | - Build CPU runtime Docker image
12 |
13 | ```bash
14 | ./build_docker.sh paddlepaddle/paddle:0.11.0 paddlepaddle/paddlecloud-job:0.11.0
15 | ```
16 |
17 | - Build GPU runtime Docker image
18 |
19 | ```bash
20 | ./build_docker.sh paddlepaddle/paddle:0.11.0-gpu paddlepaddle/paddlecloud-job:0.11.0-gpu
21 | ```
22 |
--------------------------------------------------------------------------------
/docker/build-devel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -e
18 |
19 | unset GREP_OPTIONS
20 | BASEDIR="$(dirname "$(readlink -f "${0}")")"
21 | cd "${BASEDIR}"/..
22 | # Write a copy of docker/Dockerfile whose FROM line uses the CUDA 10.0 base-image tag, then build and push it.
23 | image=hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7-dev
24 | sed 's#paddlepaddle/paddle:.*#paddlepaddle/paddle:latest-gpu-cuda10.0-cudnn7-dev#' docker/Dockerfile > docker/Dockerfile.cuda10
25 | docker build --pull --network host . -t "${image}" -f docker/Dockerfile.cuda10
26 | docker push "${image}"
27 | # Same steps for the CUDA 9.0 base image.
28 | image=hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda9.0-cudnn7-dev
29 | sed 's#paddlepaddle/paddle:.*#paddlepaddle/paddle:latest-gpu-cuda9.0-cudnn7-dev#' docker/Dockerfile > docker/Dockerfile.cuda9
30 | docker build --pull --network host . -t "${image}" -f docker/Dockerfile.cuda9
31 | docker push "${image}"
32 |
--------------------------------------------------------------------------------
/docker/build-runtime.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -e
18 |
19 | if [[ $# != 1 ]] ; then  # exactly one argument is required: the release version
20 |     echo "must set version" >&2
21 |     exit 1
22 | fi
23 |
24 | unset GREP_OPTIONS
25 | BASEDIR="$(dirname "$(readlink -f "${0}")")"
26 | cd "${BASEDIR}"/..
27 |
28 | build_image(){  # build_image <cuda_version> <edl_version>: build, tag and push one runtime image
29 |     cuda_version=$1
30 |     latest_image="hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda${cuda_version}-cudnn7"
31 |     sed 's#paddlepaddle/paddle:.*#paddlepaddle/paddle:1.8.0-gpu-cuda'"${cuda_version}"'-cudnn7#' docker/Dockerfile.runtime > "docker/Dockerfile.runtime.cuda${cuda_version}"
32 |     docker build --pull --network host . -t "${latest_image}" -f "docker/Dockerfile.runtime.cuda${cuda_version}"
33 |     docker push "${latest_image}"
34 |     # Tag the image that was just pushed with the release version and push that tag as well.
35 |     version=$2
36 |     version_image="hub.baidubce.com/paddle-edl/paddle_edl:${version}-cuda${cuda_version}-cudnn7"
37 |     docker tag "${latest_image}" "${version_image}"
38 |     docker push "${version_image}"
39 | }
40 |
41 | version=$1
42 | cuda_version="10.0"
43 | echo "build cuda:${cuda_version} edl version:${version}"
44 | build_image "${cuda_version}" "$version"
45 |
46 | cuda_version="9.0"
47 | echo "build cuda:${cuda_version} edl version:${version}"
48 | build_image "${cuda_version}" "$version"
49 |
--------------------------------------------------------------------------------
/docker/dev_requirements.txt:
--------------------------------------------------------------------------------
1 | astroid
2 | cpplint
3 | isort
4 | pre-commit
5 | pylint
6 | pytest
7 |
--------------------------------------------------------------------------------
/docker/release-run-time.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | RED='\033[0;31m'
18 | NC='\033[0m' # No Color
19 | GREEN='\033[0;32m'
20 | # Exactly one argument is required: the version to release.
21 | if [[ $# != 1 ]] ; then
22 |     echo "must set version" >&2
23 |     exit 1
24 | fi
25 |
26 | version=$1
27 | # Ask for confirmation on stdin; any key other than 'y' aborts the release.
28 | echo -e "${GREEN} Press 'y' to release ${RED} docker version ${version} ${NC}"
29 | while : ; do
30 |     read -r -n 1 k
31 |     if [[ $k == y ]] ; then
32 |         break
33 |     else
34 |         echo "exit"
35 |         exit 0
36 |     fi
37 | done
38 |
39 | echo -e "\n${GREEN} Begin to release ${RED} edl docker ${version} ${NC}\n"
40 | # Run the build from the directory that contains this script; stop if it cannot be entered.
41 | unset GREP_OPTIONS
42 | BASEDIR="$(dirname "$(readlink -f "${0}")")"
43 | cd "${BASEDIR}" || exit 1
44 |
45 | bash ./build-runtime.sh "$version"
46 |
--------------------------------------------------------------------------------
/docker/requirements.txt:
--------------------------------------------------------------------------------
1 | etcd3==0.12.0
2 | flask==1.1.2
3 | grpcio==1.28.1
4 | grpcio_tools==1.28.1
5 | kubernetes
6 | paddle-serving-app
7 | paddle-serving-client
8 | paddle-serving-server-gpu
9 | paddlepaddle-gpu==1.8.0.post107
10 | pathlib2==2.3.5
11 | protobuf==3.8.0
12 | psutil
13 | redis
14 |
--------------------------------------------------------------------------------
/example/collective/resnet50/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .vgg import VGG11, VGG13, VGG16, VGG19
16 | from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
17 |
--------------------------------------------------------------------------------
/example/collective/resnet50/train_pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | export FLAGS_sync_nccl_allreduce=1
18 | export FLAGS_cudnn_exhaustive_search=1
19 | #export FLAGS_conv_workspace_size_limit=4000 #MB
20 | export FLAGS_cudnn_batchnorm_spatial_persistent=1
21 |
22 | export GLOG_v=1
23 | export GLOG_logtostderr=1
24 | export FLAGS_eager_delete_tensor_gb=0
25 | export NCCL_DEBUG=INFO
26 | # Unset proxy
27 | unset https_proxy http_proxy
28 |
29 | FP16=False #whether to use float16
30 | use_dali=False
31 | DATA_FORMAT="NCHW"
32 | if [[ ${use_dali} == "True" ]]; then
33 | export FLAGS_fraction_of_gpu_memory_to_use=0.8
34 | fi
35 | # distributed_args is not set in this script; it is taken from the caller's environment and left unquoted so it can expand to several options (or to nothing).
36 | python -m paddle_edl.collective.launch ${distributed_args} \
37 |     --log_dir log \
38 |     --log_level 20 \
39 |     ./train_with_fleet.py \
40 |     --model=ResNet50 \
41 |     --batch_size=128 \
42 |     --total_images=1281167 \
43 |     --data_dir=./ImageNet \
44 |     --class_dim=1000 \
45 |     --image_shape=3,224,224 \
46 |     --model_save_dir=output/ \
47 |     --with_mem_opt=False \
48 |     --lr_strategy=piecewise_decay \
49 |     --lr=0.1 \
50 |     --l2_decay=1e-4 \
51 |     --scale_loss=1.0 \
52 |     --num_epochs=90 \
53 |     --num_threads=2 \
54 |     --nccl_comm_num=1 \
55 |     --fuse=True \
56 |     --use_hierarchical_allreduce=False \
57 |     --fp16=${FP16} \
58 |     --use_dali=${use_dali} \
59 |     --checkpoint=./fleet_checkpoints \
60 |     --do_test=False \
61 |     --data_format=${DATA_FORMAT}
62 |
--------------------------------------------------------------------------------
/example/collective/resnet50/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/example/ctr/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 | LABEL maintainer="peizhilin@baidu.com"
3 |
4 | RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv git curl && rm -rf /var/lib/apt/lists/*
5 |
6 | RUN pip install -U pip
7 | RUN pip install -U kubernetes paddlepaddle
8 | RUN mkdir -p /workspace
9 |
10 | RUN mkdir -p /temp && cd /temp && git clone https://github.com/PaddlePaddle/models.git && cd models && git checkout f503908d && mv /temp/models/fluid/PaddleRec/ctr /workspace/
11 | # COPY is used for plain local files; no archive extraction or URL fetching is needed here.
12 | COPY script/paddle_k8s /usr/bin/
13 | COPY script/k8s_tools.py /root/
14 | RUN chmod +x /usr/bin/paddle_k8s
15 |
16 | COPY ctr /workspace/ctr
17 |
--------------------------------------------------------------------------------
/example/ctr/README:
--------------------------------------------------------------------------------
1 |
2 | CTR分布式训练
3 |
4 | 这是一个paddlepaddle分布式训练任务的示例和安装教程,在一个标准k8s集群上可以通过脚本直接构建一个分布式训练CTR任务。
5 |
6 | 整个工程分为三部分
7 |
8 | 1. 镜像文件
9 | Dockerfile -- docker构建文件
10 | script -- 构建docker用到的脚本, 拷贝 edl/docker/k8s_tools 和 edl/docker/paddle_k8s 到此目录
11 | ctr -- paddlepaddle分布式训练CTR例子
12 |
13 | 2. 部署文件
14 | ps-train -- 部署k8s的yaml文件
15 |
16 | 3. 其它
17 | image -- 任务图例
18 | 百度云部署ctr分布式训练任务.rst -- 百度云搭建CTR任务说明
19 |
--------------------------------------------------------------------------------
/example/ctr/k8s/cube.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: cube-0
5 | labels:
6 | app: cube-0
7 | spec:
8 | containers:
9 | - name: cube-0
10 | image: wangjiawei1993/cube:v11
11 | workingDir: /cube
12 | command: ['/bin/bash']
13 | args: ['start.sh']
14 | ports:
15 | - containerPort: 8001
16 | name: cube-agent
17 | - containerPort: 8027
18 | name: cube-server
19 |
20 | ---
21 |
22 | apiVersion: v1
23 | kind: Pod
24 | metadata:
25 | name: cube-1
26 | labels:
27 | app: cube-1
28 | spec:
29 | containers:
30 | - name: cube-1
31 | image: wangjiawei1993/cube:v11
32 | workingDir: /cube
33 | command: ['/bin/bash']
34 | args: ['start.sh']
35 | ports:
36 | - containerPort: 8001
37 | name: cube-agent
38 | - containerPort: 8027
39 | name: cube-server
40 |
41 | ---
42 |
43 | kind: Service
44 | apiVersion: v1
45 | metadata:
46 | name: cube-0
47 | spec:
48 | ports:
49 | - name: agent
50 | port: 8001
51 | protocol: TCP
52 | - name: server
53 | port: 8027
54 | protocol: TCP
55 | selector:
56 | app: cube-0
57 |
58 | ---
59 |
60 | kind: Service
61 | apiVersion: v1
62 | metadata:
63 | name: cube-1
64 | spec:
65 | ports:
66 | - name: agent
67 | port: 8001
68 | protocol: TCP
69 | - name: server
70 | port: 8027
71 | protocol: TCP
72 | selector:
73 | app: cube-1
74 |
--------------------------------------------------------------------------------
/example/ctr/k8s/ftp.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: file-server
5 | labels:
6 | app: file-server
7 | spec:
8 | volumes:
9 | - hostPath:
10 | path: /home/work
11 | type: ""
12 | name: file-home
13 | containers:
14 | - name: file-server
15 | image: halverneus/static-file-server
16 | ports:
17 | - containerPort: 8080
18 | volumeMounts:
19 | - mountPath: /web
20 | name: file-home
21 | nodeSelector:
22 | nodeType: model
23 | ---
24 | kind: Service
25 | apiVersion: v1
26 | metadata:
27 | name: file-server
28 | spec:
29 | type: LoadBalancer
30 | ports:
31 | - name: file-server
32 | port: 8080
33 | targetPort: 8080
34 | selector:
35 | app: file-server
36 |
--------------------------------------------------------------------------------
/example/ctr/k8s/paddle-suite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
18 | OUTPUT_NODE=$(kubectl get no | awk 'NR==2 {print $1}')  # first column of the second output line (the row after the header)
19 | kubectl label nodes "$OUTPUT_NODE" nodeType=model --overwrite  # mark that node with nodeType=model
20 | kubectl apply -f paddle-suite.yaml
21 |
--------------------------------------------------------------------------------
/example/ctr/k8s/pdclient.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: pdservingclient
5 | labels:
6 | app: pdservingclient
7 | spec:
8 | containers:
9 | - name: pdservingclient
10 | image: wangjiawei1993/pdservingclient:v4
11 | workingDir: /
12 | command: ['bash']
13 | args: ['nonstop.sh']
14 |
--------------------------------------------------------------------------------
/example/ctr/k8s/pdserving.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: paddleserving
5 | labels:
6 | app: paddleserving
7 | spec:
8 | containers:
9 | - name: paddleserving
10 | image: wangjiawei1993/paddleserving:v7-debug
11 | workingDir: /serving
12 | command: ['/bin/bash']
13 | args: ['run.sh']
14 | ports:
15 | - containerPort: 8010
16 | name: serving
17 |
18 | ---
19 | apiVersion: v1
20 | kind: Service
21 | metadata:
22 | name: paddleserving
23 | spec:
24 | ports:
25 | - name: serving
26 | port: 8010
27 | protocol: TCP
28 | selector:
29 | app: paddleserving
30 |
--------------------------------------------------------------------------------
/example/ctr/k8s/transfer.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: cube-transfer
5 | labels:
6 | app: cube-transfer
7 | spec:
8 | containers:
9 | - name: cube-transfer
10 | image: wangjiawei1993/cube-transfer:v18
11 | workingDir: /
12 | env:
13 | - name: POD_IP
14 | valueFrom:
15 | fieldRef:
16 | apiVersion: v1
17 | fieldPath: status.podIP
18 | command: ['bash']
19 | args: ['nonstop.sh']
20 | ports:
21 | - containerPort: 8099
22 | name: cube-transfer
23 | - containerPort: 8098
24 | name: cube-http
25 |
--------------------------------------------------------------------------------
/example/ctr/ps-train/pserver.yaml:
--------------------------------------------------------------------------------
1 | # ReplicaSet for the parameter servers (PSERVER role) of the fluid-ctr job.
2 | # apps/v1 replaces extensions/v1beta1, which Kubernetes stopped serving for ReplicaSet in v1.16.
3 | apiVersion: apps/v1
4 | kind: ReplicaSet
5 | metadata: {name: fluid-ctr-pserver}
6 | spec:
7 |   replicas: 2
8 |   # apps/v1 requires an explicit selector; it must match the pod template labels.
9 |   selector:
10 |     matchLabels: {paddle-job-pserver: fluid-ctr}
11 |   template:
12 |     metadata:
13 |       labels: {paddle-job-pserver: fluid-ctr}
14 |     spec:
15 |       containers:
16 |       - command: [paddle_k8s, start_fluid]
17 |         env:
18 |         - {name: GLOG_v, value: '0'}
19 |         - {name: GLOG_logtostderr, value: '1'}
20 |         - {name: TOPOLOGY, value: ''}
21 |         - {name: TRAINER_PACKAGE, value: /workspace}
22 |         - {name: PADDLE_INIT_NICS, value: eth2}
23 |         - name: NAMESPACE
24 |           valueFrom:
25 |             fieldRef: {fieldPath: metadata.namespace}
26 |         - name: POD_IP
27 |           valueFrom:
28 |             fieldRef: {fieldPath: status.podIP}
29 |         - name: POD_NAME
30 |           valueFrom:
31 |             fieldRef: {fieldPath: metadata.name}
32 |         - name: PADDLE_CURRENT_IP
33 |           valueFrom:
34 |             fieldRef: {fieldPath: status.podIP}
35 |         - {name: PADDLE_JOB_NAME, value: fluid-ctr}
36 |         - {name: PADDLE_IS_LOCAL, value: '0'}
37 |         - {name: PADDLE_TRAINERS_NUM, value: '2'}
38 |         - {name: PADDLE_PSERVERS_NUM, value: '2'}
39 |         - {name: FLAGS_rpc_deadline, value: '36000000'}
40 |         - {name: ENTRY, value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1}
41 |         - {name: PADDLE_PORT, value: '30236'}
42 |         - {name: LD_LIBRARY_PATH, value: '/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind'}
43 |         - {name: PADDLE_TRAINING_ROLE, value: PSERVER}
44 |         - {name: TRAINING_ROLE, value: PSERVER}
45 |         image: wopeizl/paddle_ctr_distribute
46 |         imagePullPolicy: Always
47 |         name: pserver
48 |         volumeMounts:
49 |         - {mountPath: /mnt/seqdata, name: seqdata}
50 |         resources:
51 |           limits: {cpu: '10', memory: 30Gi}
52 |           requests: {cpu: '1', memory: 100M}
53 |       hostNetwork: true
54 |       imagePullSecrets:
55 |       - {name: regcred}
56 |       volumes:
57 |       - hostPath: {path: /home/work/}
58 |         name: seqdata
59 |
--------------------------------------------------------------------------------
/example/ctr/ps-train/trainer.yaml:
--------------------------------------------------------------------------------
1 | # Job that runs the trainer pods of the fluid-ctr example.
2 | apiVersion: batch/v1
3 | kind: Job
4 | metadata:
5 |   name: fluid-ctr-trainer
6 | spec:
7 |   # Two completions with a parallelism of two: both trainer pods may run at once.
8 |   completions: 2
9 |   parallelism: 2
10 |   template:
11 |     metadata:
12 |       labels:
13 |         paddle-job: fluid-ctr
14 |     spec:
15 |       restartPolicy: OnFailure
16 |       hostNetwork: true
17 |       imagePullSecrets:
18 |         - name: regcred
19 |       volumes:
20 |         - name: seqdata
21 |           hostPath:
22 |             path: /home/work/
23 |       containers:
24 |         - name: trainer
25 |           image: wopeizl/paddle_ctr_distribute
26 |           imagePullPolicy: Always
27 |           command:
28 |             - paddle_k8s
29 |             - start_fluid
30 |           resources:
31 |             limits:
32 |               cpu: "10"
33 |               memory: 30Gi
34 |             requests:
35 |               cpu: "1"
36 |               memory: 100M
37 |           volumeMounts:
38 |             - name: seqdata
39 |               mountPath: /mnt/seqdata
40 |           env:
41 |             - name: GLOG_v
42 |               value: "0"
43 |             - name: GLOG_logtostderr
44 |               value: "1"
45 |             - name: TOPOLOGY
46 |               value: ""
47 |             - name: TRAINER_PACKAGE
48 |               value: /workspace
49 |             - name: PADDLE_INIT_NICS
50 |               value: eth2
51 |             # The next four variables are filled in from the pod's own metadata/status fields.
52 |             - name: NAMESPACE
53 |               valueFrom:
54 |                 fieldRef:
55 |                   fieldPath: metadata.namespace
56 |             - name: POD_IP
57 |               valueFrom:
58 |                 fieldRef:
59 |                   fieldPath: status.podIP
60 |             - name: POD_NAME
61 |               valueFrom:
62 |                 fieldRef:
63 |                   fieldPath: metadata.name
64 |             - name: PADDLE_CURRENT_IP
65 |               valueFrom:
66 |                 fieldRef:
67 |                   fieldPath: status.podIP
68 |             - name: PADDLE_JOB_NAME
69 |               value: fluid-ctr
70 |             - name: PADDLE_IS_LOCAL
71 |               value: "0"
72 |             - name: FLAGS_rpc_deadline
73 |               value: "36000000"
74 |             - name: PADDLE_PORT
75 |               value: "30236"
76 |             - name: PADDLE_PSERVERS_NUM
77 |               value: "2"
78 |             - name: PADDLE_TRAINERS_NUM
79 |               value: "2"
80 |             - name: PADDLE_TRAINING_ROLE
81 |               value: TRAINER
82 |             - name: TRAINING_ROLE
83 |               value: TRAINER
84 |             - name: LD_LIBRARY_PATH
85 |               value: "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
86 |             - name: ENTRY
87 |               value: "cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1"
88 |
--------------------------------------------------------------------------------
/example/ctr/script/README:
--------------------------------------------------------------------------------
1 | Please copy edl/docker/k8s_tools and edl/docker/paddle_k8s into this folder if you want to build the Docker image yourself.
2 |
--------------------------------------------------------------------------------
/example/ctr/script/cube.yaml:
--------------------------------------------------------------------------------
1 | # Two cube pods (cube-0 and cube-1), each exposed through a Service of the same name.
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 |   name: cube-0
6 |   labels:
7 |     app: cube-0
8 | spec:
9 |   containers:
10 |     - name: cube-0
11 |       image: hub.baidubce.com/ctr/cube:latest
12 |       workingDir: /cube
13 |       # Runs "/bin/bash start.sh" from the working directory.
14 |       command:
15 |         - /bin/bash
16 |       args:
17 |         - start.sh
18 |       ports:
19 |         - name: cube-agent
20 |           containerPort: 8001
21 |         - name: cube-server
22 |           containerPort: 8027
23 |
24 | ---
25 |
26 | apiVersion: v1
27 | kind: Pod
28 | metadata:
29 |   name: cube-1
30 |   labels:
31 |     app: cube-1
32 | spec:
33 |   containers:
34 |     - name: cube-1
35 |       image: hub.baidubce.com/ctr/cube:latest
36 |       workingDir: /cube
37 |       # Runs "/bin/bash start.sh" from the working directory.
38 |       command:
39 |         - /bin/bash
40 |       args:
41 |         - start.sh
42 |       ports:
43 |         - name: cube-agent
44 |           containerPort: 8001
45 |         - name: cube-server
46 |           containerPort: 8027
47 |
48 | ---
49 |
50 | # Service in front of the pod labelled app=cube-0.
51 | apiVersion: v1
52 | kind: Service
53 | metadata:
54 |   name: cube-0
55 | spec:
56 |   selector:
57 |     app: cube-0
58 |   ports:
59 |     - name: agent
60 |       protocol: TCP
61 |       port: 8001
62 |     - name: server
63 |       protocol: TCP
64 |       port: 8027
65 |
66 | ---
67 |
68 | # Service in front of the pod labelled app=cube-1.
69 | apiVersion: v1
70 | kind: Service
71 | metadata:
72 |   name: cube-1
73 | spec:
74 |   selector:
75 |     app: cube-1
76 |   ports:
77 |     - name: agent
78 |       protocol: TCP
79 |       port: 8001
80 |     - name: server
81 |       protocol: TCP
82 |       port: 8027
83 |
--------------------------------------------------------------------------------
/example/ctr/script/defaultserviceaccountclusterrole.yaml:
--------------------------------------------------------------------------------
1 | # Read-only access to pods (get/list/watch), bound to the "default"
2 | # ServiceAccount of the "default" namespace.
3 | apiVersion: rbac.authorization.k8s.io/v1
4 | kind: ClusterRole
5 | metadata:
6 |   name: default
7 |   # NOTE(review): ClusterRole is a cluster-scoped kind, so this namespace is presumably ignored - confirm.
8 |   namespace: default
9 | rules:
10 |   - apiGroups:
11 |       - ""
12 |     resources:
13 |       - pods
14 |     verbs:
15 |       - get
16 |       - list
17 |       - watch
18 |
19 | ---
20 | apiVersion: rbac.authorization.k8s.io/v1
21 | kind: ClusterRoleBinding
22 | metadata:
23 |   name: default
24 |   # NOTE(review): ClusterRoleBinding is a cluster-scoped kind, so this namespace is presumably ignored - confirm.
25 |   namespace: default
26 | subjects:
27 |   - kind: ServiceAccount
28 |     name: default
29 |     namespace: default
30 | roleRef:
31 |   apiGroup: rbac.authorization.k8s.io
32 |   kind: ClusterRole
33 |   name: default
34 |
--------------------------------------------------------------------------------
/example/ctr/script/fileserver.yaml:
--------------------------------------------------------------------------------
1 | # Pod that mounts the node's /home/work directory at /web, plus a
2 | # LoadBalancer Service forwarding port 8080 to it.
3 | apiVersion: v1
4 | kind: Pod
5 | metadata:
6 |   name: file-server
7 |   labels:
8 |     app: file-server
9 | spec:
10 |   # Only schedule onto nodes labelled nodeType=model.
11 |   nodeSelector:
12 |     nodeType: model
13 |   volumes:
14 |     - name: file-home
15 |       hostPath:
16 |         path: /home/work
17 |         type: ""
18 |   containers:
19 |     - name: file-server
20 |       image: halverneus/static-file-server
21 |       ports:
22 |         - containerPort: 8080
23 |       volumeMounts:
24 |         - name: file-home
25 |           mountPath: /web
26 | ---
27 | apiVersion: v1
28 | kind: Service
29 | metadata:
30 |   name: loadbalancer
31 | spec:
32 |   type: LoadBalancer
33 |   selector:
34 |     app: file-server
35 |   ports:
36 |     - name: file-server
37 |       port: 8080
38 |       targetPort: 8080
39 |
--------------------------------------------------------------------------------
/example/ctr/script/ftp.yaml:
--------------------------------------------------------------------------------
1 | # Pod that mounts the node's /home/work directory at /web, plus a
2 | # LoadBalancer Service named file-server forwarding port 8080 to it.
3 | apiVersion: v1
4 | kind: Pod
5 | metadata:
6 |   name: file-server
7 |   labels:
8 |     app: file-server
9 | spec:
10 |   # Only schedule onto nodes labelled nodeType=model.
11 |   nodeSelector:
12 |     nodeType: model
13 |   volumes:
14 |     - name: file-home
15 |       hostPath:
16 |         path: /home/work
17 |         type: ""
18 |   containers:
19 |     - name: file-server
20 |       image: hub.baidubce.com/ctr/file-server:latest
21 |       ports:
22 |         - containerPort: 8080
23 |       volumeMounts:
24 |         - name: file-home
25 |           mountPath: /web
26 | ---
27 | apiVersion: v1
28 | kind: Service
29 | metadata:
30 |   name: file-server
31 | spec:
32 |   type: LoadBalancer
33 |   selector:
34 |     app: file-server
35 |   ports:
36 |     - name: file-server
37 |       port: 8080
38 |       targetPort: 8080
39 |
--------------------------------------------------------------------------------
/example/ctr/script/paddle-suite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml
18 | OUTPUT_NODE=$(kubectl get no | awk '{print $1}' | sed -n '2p')
19 | kubectl label nodes $OUTPUT_NODE nodeType=model --overwrite
20 | kubectl apply -f paddle-suite.yaml
21 |
--------------------------------------------------------------------------------
/example/ctr/script/pdclient.yaml:
--------------------------------------------------------------------------------
1 | # Pod running the pdservingclient image; its container executes "bash nonstop.sh" from /.
2 | apiVersion: v1
3 | kind: Pod
4 | metadata:
5 |   name: pdservingclient
6 |   labels:
7 |     app: pdservingclient
8 | spec:
9 |   containers:
10 |     - name: pdservingclient
11 |       image: hub.baidubce.com/ctr/pdservingclient:latest
12 |       workingDir: /
13 |       command:
14 |         - bash
15 |       args:
16 |         - nonstop.sh
17 |
--------------------------------------------------------------------------------
/example/ctr/script/pdserving.yaml:
--------------------------------------------------------------------------------
1 | # paddleserving pod (executes "/bin/bash run.sh" from /serving) and the
2 | # Service that exposes its port 8010.
3 | apiVersion: v1
4 | kind: Pod
5 | metadata:
6 |   name: paddleserving
7 |   labels:
8 |     app: paddleserving
9 | spec:
10 |   containers:
11 |     - name: paddleserving
12 |       image: hub.baidubce.com/ctr/paddleserving:latest
13 |       workingDir: /serving
14 |       command:
15 |         - /bin/bash
16 |       args:
17 |         - run.sh
18 |       ports:
19 |         - name: serving
20 |           containerPort: 8010
21 |
22 | ---
23 | apiVersion: v1
24 | kind: Service
25 | metadata:
26 |   name: paddleserving
27 | spec:
28 |   selector:
29 |     app: paddleserving
30 |   ports:
31 |     - name: serving
32 |       protocol: TCP
33 |       port: 8010
34 |
--------------------------------------------------------------------------------
/example/ctr/script/transfer.yaml:
--------------------------------------------------------------------------------
1 | # cube-transfer pod; its container executes "bash nonstop.sh" from / and
2 | # declares ports 8099 and 8098.
3 | apiVersion: v1
4 | kind: Pod
5 | metadata:
6 |   name: cube-transfer
7 |   labels:
8 |     app: cube-transfer
9 | spec:
10 |   containers:
11 |     - name: cube-transfer
12 |       image: hub.baidubce.com/ctr/cube-transfer:latest
13 |       workingDir: /
14 |       command:
15 |         - bash
16 |       args:
17 |         - nonstop.sh
18 |       env:
19 |         # POD_IP is filled in from the pod's own status.podIP field.
20 |         - name: POD_IP
21 |           valueFrom:
22 |             fieldRef:
23 |               apiVersion: v1
24 |               fieldPath: status.podIP
25 |       ports:
26 |         - name: cube-transfer
27 |           containerPort: 8099
28 |         - name: cube-http
29 |           containerPort: 8098
30 |
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/cluster-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/cluster-info.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/concole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/concole.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/conf-download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/conf-download.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/ctr-models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-models.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/ctr-prediction-end-to-end-deployment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-prediction-end-to-end-deployment.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/ctr-running.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-running.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/eip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/eip.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/file_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/file_server.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/helm-version.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/helm-version.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/kubectl-version.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/kubectl-version.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/load_balancer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/load_balancer.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/pserver-log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/pserver-log.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/tiller.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/tiller.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/trainer-log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/trainer-log.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/volcano.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/volcano.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/wget_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/wget_example.png
--------------------------------------------------------------------------------
/example/ctr/src/baidu_cloud/workload.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/workload.png
--------------------------------------------------------------------------------
/example/ctr/src/create_gpu_machine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_gpu_machine.png
--------------------------------------------------------------------------------
/example/ctr/src/create_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_image.png
--------------------------------------------------------------------------------
/example/ctr/src/create_more_nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_more_nodes.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_kubectl_download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_kubectl_download.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_node.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_pods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_pods.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_pserver_log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_pserver_log.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_trainer_log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_trainer_log.png
--------------------------------------------------------------------------------
/example/ctr/src/ctr_volcano_install.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_volcano_install.png
--------------------------------------------------------------------------------
/example/ctr/src/ctryaml1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml1.png
--------------------------------------------------------------------------------
/example/ctr/src/ctryaml2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml2.png
--------------------------------------------------------------------------------
/example/ctr/src/ctryaml3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml3.png
--------------------------------------------------------------------------------
/example/ctr/src/cube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube.png
--------------------------------------------------------------------------------
/example/ctr/src/cube_config1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube_config1.png
--------------------------------------------------------------------------------
/example/ctr/src/cube_config2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube_config2.png
--------------------------------------------------------------------------------
/example/ctr/src/dist_train_nccl2.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_nccl2.graffle
--------------------------------------------------------------------------------
/example/ctr/src/dist_train_nccl2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_nccl2.png
--------------------------------------------------------------------------------
/example/ctr/src/dist_train_pserver.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_pserver.graffle
--------------------------------------------------------------------------------
/example/ctr/src/dist_train_pserver.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_pserver.png
--------------------------------------------------------------------------------
/example/ctr/src/file_server_pod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/file_server_pod.png
--------------------------------------------------------------------------------
/example/ctr/src/file_server_svc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/file_server_svc.png
--------------------------------------------------------------------------------
/example/ctr/src/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/overview.png
--------------------------------------------------------------------------------
/example/ctr/src/paddleclient.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleclient.png
--------------------------------------------------------------------------------
/example/ctr/src/paddleserving_pod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleserving_pod.png
--------------------------------------------------------------------------------
/example/ctr/src/paddleserving_svc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleserving_svc.png
--------------------------------------------------------------------------------
/example/ctr/src/parallelism.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/parallelism.png
--------------------------------------------------------------------------------
/example/ctr/src/pyreader.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/pyreader.png
--------------------------------------------------------------------------------
/example/ctr/src/release.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/release.png
--------------------------------------------------------------------------------
/example/ctr/src/transfer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/transfer.png
--------------------------------------------------------------------------------
/example/demo/collective/README.md:
--------------------------------------------------------------------------------
1 | # Purpose
2 | This article illustrates how to change the train program to an EDL program, and run on single or multiple nodes.
3 |
4 |
5 | ## How to change from a normal train program to an EDL train program
6 | The main changes are:
7 |
8 | - `load_checkpoint` should be called at the beginning of training, and
9 | - `save_checkpoint` should be called at the end of every epoch.
10 | The checkpoint should be stored on a distributed file system such as HDFS so that all trainers can download it. A complete example is [here](https://github.com/elasticdeeplearning/edl/tree/develop/example/collective/resnet50)
11 |
12 | ```
13 | fs = HDFSClient(args.hdfs_name, args.hdfs_ugi, 20*60*1000, 3 * 1000)
14 |
15 | train_status = TrainStatus()
16 | tmp_s = fleet.load_checkpoint(exe, args.checkpoint, fs=fs, trainer_id=trainer_id)
17 | if tmp_s is not None:
18 |     train_status = tmp_s
19 |
20 | for pass_id in range(train_status.next(), params["num_epochs"]):
21 |     train()
22 |
23 |     if trainer_id == 0:
24 |         saved_status = TrainStatus(pass_id)
25 |         fleet.save_checkpoint(exe, train_status=saved_status,
26 |                               path=args.checkpoint, fs=fs)
27 | ```
28 |
29 | The epoch number is stored in `train_status` and is restored when the checkpoint is loaded.
30 |
31 | ## Start the ResNet50 demo training on multiple nodes
32 |
33 | 1. Start a JobServer on one node which generates changing scripts.
34 |
35 | ```
36 | node_ips="192.168.10.1,192.168.10.2"
37 | python -u -m paddle_edl.demo.collective.job_server_demo \
38 | --node_ips ${node_ips} \
39 | --pod_num_of_node 8 \
40 | --time_interval_to_change 900 \
41 | --gpu_num_of_node 8
42 | ```
43 |
44 | 1. Start a JobClient on every node; it controls the worker processes on that node.
45 |
46 | ```
47 | # set the ImageNet data path
48 | export PADDLE_EDL_IMAGENET_PATH=
49 | # set the checkpoint path
50 | export PADDLE_EDL_FLEET_CHECKPOINT_PATH=
51 | export PADDLE_JOBSERVER="http://192.168.10.1:8180"
52 |
53 | mkdir -p resnet50_pod
54 | unset http_proxy https_proxy
55 |
56 | # running under edl
57 | export PADDLE_RUNING_ENV=PADDLE_EDL
58 | export PADDLE_JOB_ID="test_job_id_1234"
59 | export PADDLE_POD_ID="not set"
60 |
61 | python -u -m paddle_edl.demo.collective.job_client_demo \
62 | --log_level 20 \
63 | --package_sh ./resnet50/package.sh \
64 | --pod_path ./resnet50_pod \
65 | ./train_pretrain.sh
66 | ```
67 |
68 |
69 | ## On Kubernetes
70 |
71 | We have built the docker images for you and you can start a demo on Kubernetes immediately:
72 | TBD
73 |
--------------------------------------------------------------------------------
/example/demo/collective/env.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Path of the ImageNet data directory
16 | export PADDLE_EDL_IMAGENET_PATH=/root/go/dataset/ImageNet
17 | # Directory for `checkpoint`, used to save checkpoints
18 | export PADDLE_EDL_FLEET_CHECKPOINT_PATH=/root/go/checkpoints/resnet50_1
19 |
--------------------------------------------------------------------------------
/example/demo/collective/resnet50/package.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -xe
18 |
19 | while true ; do
20 | case "$1" in
21 | -pod_id) pod_id="$2" ; shift 2 ;;
22 | *)
23 | if [[ ${#1} -gt 0 ]]; then
24 | echo "not supported arugments ${1}" ; exit 1 ;
25 | else
26 | break
27 | fi
28 | ;;
29 | esac
30 | done
31 |
32 |
33 | src_dir=../../../collective/resnet50
34 | dst_dir=resnet50_pod/${pod_id}
35 |
36 | echo "mkdir resnet50_pod/${pod_id}"
37 | mkdir -p "${dst_dir}"
38 |
39 | #copy resnet50 runtime env
40 | cp "${src_dir}"/*.py "${dst_dir}"/
41 | cp "${src_dir}"/*.sh "${dst_dir}"/
42 | cp -r "${src_dir}"/utils "${dst_dir}"/utils
43 | cp -r "${src_dir}"/models "${dst_dir}"/models
44 | cp -r "${src_dir}"/scripts "${dst_dir}"/scripts
45 |
46 | if [[ ! -d "${dst_dir}/ImageNet" ]]; then
47 | ln -s "${PADDLE_EDL_IMAGENET_PATH}" "${dst_dir}"/
48 | fi
49 |
50 | if [[ ! -d "${dst_dir}/fleet_checkpoints" ]]; then
51 | ln -s "${PADDLE_EDL_FLEET_CHECKPOINT_PATH}" "${dst_dir}/fleet_checkpoints"
52 | fi
53 |
--------------------------------------------------------------------------------
/example/demo/collective/start_job_client.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -e
18 | unset http_proxy https_proxy
19 |
20 | # running under edl
21 | export PADDLE_RUNING_ENV=PADDLE_EDL
22 |
23 | # Default to a job server on this host. When PADDLE_TRAINERS (a comma-separated
24 | # list) is set, its first entry is used as the job server host instead.
25 | export PADDLE_JOBSERVER="http://127.0.0.1:8180"
26 | if [[ -n "${PADDLE_TRAINERS}" ]]; then
27 |     # "%%,*" strips everything from the first comma on; unlike an unquoted
28 |     # array assignment it is not subject to word splitting or globbing.
29 |     export PADDLE_JOBSERVER="http://${PADDLE_TRAINERS%%,*}:8180"
30 | fi
31 | export PADDLE_JOB_ID="test_job_id_1234"
32 | export PADDLE_POD_ID="not set"
33 |
34 | BASEDIR=$(dirname "$(readlink -f "$0")")
35 | echo "${BASEDIR}"
36 |
37 | # "-m" makes Python run the dotted name as a module; without it Python looks
38 | # for a script file literally named "paddle_edl.demo.collective.job_client_demo".
39 | # The client runs in the background and logs to job_client.log.
40 | nohup python -u -m paddle_edl.demo.collective.job_client_demo \
41 |     --log_level 20 \
42 |     --package_sh ./resnet50/package.sh \
43 |     --pod_path ./resnet50_pod \
44 |     ./train_pretrain.sh > job_client.log 2>&1 &
45 |
--------------------------------------------------------------------------------
/example/demo/collective/start_job_server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Node list handed to the demo job server: PADDLE_TRAINERS when set,
# otherwise the local host only.
node_ips="127.0.0.1"
if [[ -n "${PADDLE_TRAINERS:-}" ]]; then
    node_ips="${PADDLE_TRAINERS}"
fi
echo "node_ips:${node_ips}"

BASEDIR=$(dirname "$(readlink -f "$0")")
echo "${BASEDIR}"

# `-m` is required to run a dotted module name; without it Python treats the
# argument as a script path and fails with "can't open file".
nohup python -u -m paddle_edl.demo.collective.job_server_demo \
    --node_ips "${node_ips}" \
    --pod_num_of_node 8 \
    --time_interval_to_change 900 \
    --gpu_num_of_node 8 > job_server.log 2>&1 &
31 |
--------------------------------------------------------------------------------
/example/distill/k8s/balance.yaml:
--------------------------------------------------------------------------------
# Deployment for the distill demo "balance" pod.
# apps/v1beta1 Deployments are no longer served by current Kubernetes; apps/v1
# requires an explicit selector matching the pod template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: balance
  labels:
    edl-distill-demo-app: balance
spec:
  replicas: 1
  selector:
    matchLabels:
      app: balance
  template:
    metadata:
      name: balance
      labels:
        app: balance
        # Also label the pod itself: Deployment-level labels are not copied to
        # pods, and edl_k8s selects on "edl-distill-demo-app=<role>".
        edl-distill-demo-app: balance
    spec:
      hostNetwork: true
      containers:
      - name: balance
        image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7
        imagePullPolicy: Always
        workingDir: /
        command: ['/bin/bash', '-c']
        args: ['sleep 3000']
        resources:
          requests:
            memory: 20Gi
            cpu: 2
          limits:
            memory: 20Gi
            cpu: 2
      # Deployments only accept restartPolicy: Always ("Never" is rejected).
      restartPolicy: Always
31 |
--------------------------------------------------------------------------------
/example/distill/k8s/edl_k8s:
--------------------------------------------------------------------------------
#!/bin/bash
# Dispatcher for the EDL distill Kubernetes demo: runs the sub-command named
# by the first argument, or prints usage for anything else.

set -x

# NOTE(review): balance_label selects "student" rather than "balance" and is
# not referenced anywhere below — confirm the intended value before using it.
balance_label="edl-distill-demo-app=student"
etcd_label="edl-distill-demo-app=etcd"

# Wait until one etcd pod is running, then export its IP(s) as etcd_ip.
start_balance() {
    stdbuf -oL python /root/k8s_tools.py wait_pods_running "${etcd_label}" 1
    etcd_ip=$(python /root/k8s_tools.py fetch_ips "${etcd_label}")
    export etcd_ip
}

# The student/teacher commands are advertised in usage but were never defined;
# fail explicitly instead of with "command not found".
start_student() {
    echo "start_student is not implemented yet" >&2
    return 1
}

start_teacher() {
    echo "start_teacher is not implemented yet" >&2
    return 1
}

usage() {
    echo "usage: edl_k8s <command>:"
    echo "  start_balance    Start a balance"
    echo "  start_student    Start a student"
    echo "  start_teacher    Start a teacher"
}

case "${1:-}" in
    start_balance)
        start_balance
        ;;
    # "start_stduent" is the historical misspelling; kept as an alias so
    # existing callers continue to work.
    start_student|start_stduent)
        start_student
        ;;
    start_teacher)
        start_teacher
        ;;
    --help)
        usage
        ;;
    *)
        usage
        ;;
esac
37 |
--------------------------------------------------------------------------------
/example/distill/k8s/etcd.yaml:
--------------------------------------------------------------------------------
# Deployment for the distill demo etcd pod (client port 2379).
# apps/v1beta1 Deployments are no longer served by current Kubernetes; apps/v1
# requires an explicit selector matching the pod template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: etcd
  labels:
    edl-distill-demo-app: etcd
spec:
  replicas: 1
  selector:
    matchLabels:
      app: etcd
  template:
    metadata:
      name: etcd
      labels:
        app: etcd
        # Also label the pod itself: Deployment-level labels are not copied to
        # pods, and edl_k8s selects on "edl-distill-demo-app=etcd".
        edl-distill-demo-app: etcd
    spec:
      hostNetwork: true
      containers:
      - name: etcd
        image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7
        imagePullPolicy: Always
        workingDir: /
        command: ['/bin/bash', '-c']
        args: ['etcd']
        ports:
        - containerPort: 2379
          name: serving
        resources:
          requests:
            memory: 20Gi
            cpu: 2
          limits:
            memory: 20Gi
            cpu: 2
      # Deployments only accept restartPolicy: Always ("Never" is rejected).
      restartPolicy: Always
34 |
--------------------------------------------------------------------------------
/example/distill/k8s/student.yaml:
--------------------------------------------------------------------------------
# Deployment for the distill demo "student" pod.
# apps/v1beta1 Deployments are no longer served by current Kubernetes; apps/v1
# requires an explicit selector matching the pod template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: student
  labels:
    edl-distill-demo-app: student
spec:
  replicas: 1
  selector:
    matchLabels:
      app: student
  template:
    metadata:
      name: student
      labels:
        app: student
        # Also label the pod itself: Deployment-level labels are not copied to
        # pods, and edl_k8s selects on "edl-distill-demo-app=<role>".
        edl-distill-demo-app: student
    spec:
      hostNetwork: true
      containers:
      - name: student
        image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7
        imagePullPolicy: Always
        workingDir: /
        command: ['/bin/bash', '-c']
        args: ['sleep 3000']
        resources:
          requests:
            memory: 20Gi
            cpu: 2
          limits:
            memory: 20Gi
            cpu: 2
      # Deployments only accept restartPolicy: Always ("Never" is rejected).
      restartPolicy: Always
31 |
--------------------------------------------------------------------------------
/example/distill/k8s/teacher.yaml:
--------------------------------------------------------------------------------
# Deployment for the distill demo "teacher" pod (serving port 7001).
# apps/v1beta1 Deployments are no longer served by current Kubernetes; apps/v1
# requires an explicit selector matching the pod template labels.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: teacher
  labels:
    edl-distill-demo-app: teacher
spec:
  replicas: 1
  selector:
    matchLabels:
      app: teacher
  template:
    metadata:
      name: teacher
      labels:
        app: teacher
        # Also label the pod itself: Deployment-level labels are not copied to
        # pods, and edl_k8s selects on "edl-distill-demo-app=<role>".
        edl-distill-demo-app: teacher
    spec:
      hostNetwork: true
      containers:
      - name: teacher
        image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7
        imagePullPolicy: Always
        workingDir: /
        command: ['/bin/bash', '-c']
        args: ['sleep 3000']
        ports:
        - containerPort: 7001
          name: serving
        resources:
          requests:
            memory: 20Gi
            cpu: 2
          limits:
            memory: 20Gi
            cpu: 2
      # Deployments only accept restartPolicy: Always ("Never" is rejected).
      restartPolicy: Always
34 |
--------------------------------------------------------------------------------
/example/distill/mnist_distill/image/infer_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/distill/mnist_distill/image/infer_3.png
--------------------------------------------------------------------------------
/example/distill/mnist_distill/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -eu

# Fetch the teacher model once; later runs reuse the downloaded archive.
if [ ! -f mnist_cnn_model.tar.gz ]; then
    wget --no-check-certificate https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz
fi
tar -zxf mnist_cnn_model.tar.gz

# at gpu 0, start paddle serving server on port 9292
port=9292
nohup python -m paddle_serving_server_gpu.serve \
    --model mnist_cnn_model \
    --thread 4 \
    --port ${port} \
    --mem_optim True \
    --gpu_ids 0 &
serving_pid=$!

# Kill the serving server's process tree on every exit path. With `set -e`, a
# failing training run would otherwise leave the server running in the
# background. `|| true` keeps the training exit status as the script status.
stop_serving() {
    pstree -p "${serving_pid}" | awk -F"[()]" '{print $2}' | xargs kill -9 || true
}
trap stop_serving EXIT

# start distill train
export CUDA_VISIBLE_DEVICES=0
python train_with_fleet.py \
    --use_distill_service True \
    --distill_teachers 127.0.0.1:${port}
42 |
--------------------------------------------------------------------------------
/example/distill/nlp/README.md:
--------------------------------------------------------------------------------
1 | # ERNIE distillation
2 | This example shows how to distill knowledge from ERNIE into a small student model (currently BOW; other models will follow) on a Chinese sentiment classification task.
3 |
4 | ## Quick start
5 | ### Download dataset
6 | ```
7 | wget https://paddle-edl.bj.bcebos.com/distillation/chnsenticorp/data.tgz
8 | tar -xzvf data.tgz
9 | ```
10 |
11 | ### Get the teacher model
12 | ```
13 | nohup python -u ./fine_tune.py > finetune.log 2>&1 &
14 | ```
15 |
16 | When the job completes, it generates the two directories needed for distillation: `ernie_senti_server` and `ernie_senti_client`.
17 |
18 | ### Or download the teacher model directly
19 | Alternatively, download the teacher model directly so that you do not need to generate it yourself.
20 |
21 | ```
22 | wget https://paddle-edl.bj.bcebos.com/distillation/chnsenticorp/ernie_senti.tgz
23 | tar -xzvf ernie_senti.tgz
24 | ```
25 |
26 | ### Start a local teacher
27 | ```
28 | nohup python -m paddle_serving_server_gpu.serve \
29 | --model ./ernie_senti_server/ \
30 | --port 19290 \
31 | --thread 8 \
32 | --mem_optim \
33 | --gpu_ids 0 > teacher.log 2>&1 &
34 | ```
35 |
36 | ### Start a student
37 | The student is currently a BOW model; CNN, LSTM, and tiny ERNIE students will be added later.
38 |
39 | ```
40 | python -u distill.py --fixed_teacher 127.0.0.1:19290
41 | ```
42 |
43 | ### Result
44 | | model | dev dataset(acc) | test dataset(acc) |
45 | | :----: | :-----: | :----: |
46 | | BOW | 0.901 | 0.908 |
47 | | BOW + distillation | 0.905 | 0.915 |
48 |
--------------------------------------------------------------------------------
/example/distill/nlp/test_distill.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -e
export LD_LIBRARY_PATH=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/
export CUDA_VISIBLE_DEVICES=7

fixed_teacher="127.0.0.1:19290,127.0.0.1:19291,127.0.0.1:19292,127.0.0.1:19293,127.0.0.1:19294,127.0.0.1:19295,127.0.0.1:19296,127.0.0.1:19297"

# Every run below redirects into log/. Create it up front: a missing directory
# makes the first redirection fail and `set -e` would abort the whole sweep.
mkdir -p log

# Grid sweep over student weight (w/10) and temperature T, one run at a time.
for w in {1..10}
do
    # The weight only depends on the outer loop, so compute it once per w.
    wf=$( (echo scale=1 ; echo $w / 10 ) | bc )
    for T in {1..20}
    do
        Tf=$( (echo scale=1 ; echo $T ) | bc )
        python3.6 -u distill.py \
            --fixed_teacher "$fixed_teacher" \
            --opt=AdamW \
            --s_weight "$wf" \
            --train_range 10 \
            --LR 1e-4 \
            --kl 0 \
            --T "$Tf" \
            --epoch_num 20 > log/"d_w${wf}_T${Tf}".log 2>&1
    done
done

exit 0

# NOTE: everything below is unreachable because of the `exit 0` above. The
# commands are kept as reference invocations for single background runs.
nohup python3.6 -u distill.py \
    --fixed_teacher "$fixed_teacher" \
    --s_weight 0.05 \
    --epoch_num 20 > d_2.log 2>&1 &

nohup python3.6 -u distill.py \
    --fixed_teacher "$fixed_teacher" \
    --opt=Adam \
    --LR=5e-5 \
    --s_weight 0.05 \
    --epoch_num 20 > d_3.log 2>&1 &
54 |
--------------------------------------------------------------------------------
/example/distill/nlp/test_train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Library search path: CUDA/cuDNN directories first, then whatever the caller
# already had, then the system library directories.
cuda_libs=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64
system_libs=/usr/lib64/:/usr/local/lib/
export LD_LIBRARY_PATH=${cuda_libs}:$LD_LIBRARY_PATH:${system_libs}

# Train on GPU 7 in the background; all output goes to train_with_test.log.
export CUDA_VISIBLE_DEVICES=7
nohup python3.6 -u train.py > train_with_test.log 2>&1 &
20 |
--------------------------------------------------------------------------------
/example/distill/qps_tools/distill_reader_qps.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 | import datetime
17 | import time
18 |
19 | from paddle_edl.distill.distill_reader import DistillReader
20 | from parse_config import get_ins_predicts
21 |
22 |
def sample_reader(shapes, dtypes, sample_num=1 << 12):
    """Build a reader that yields randomly generated samples.

    Args:
        shapes: iterable of array shapes, one per field of a sample.
        dtypes: iterable of numpy dtypes, paired positionally with `shapes`.
        sample_num: number of samples the reader yields (default 4096).

    Returns:
        A zero-argument callable; each call returns a generator of
        `sample_num` tuples, one `np.random.random(shape).astype(dtype)`
        array per (shape, dtype) pair.
    """

    def __reader_impl__():
        for _ in range(sample_num):
            # One random array per field, in the order the fields were given.
            yield tuple(
                np.random.random(shape).astype(dtype)
                for shape, dtype in zip(shapes, dtypes))

    return __reader_impl__
32 |
33 |
def qps(reader, log_every=1000):
    """Consume `reader()` and print the throughput every `log_every` steps.

    Args:
        reader: zero-argument callable returning an iterable of samples.
        log_every: number of steps between two throughput reports
            (default 1000, the previously hard-coded interval).

    Raises:
        ValueError: if `log_every` is not positive.
    """
    if log_every <= 0:
        raise ValueError('log_every must be positive, got {}'.format(
            log_every))

    pre_t = time.time()
    for step, _ in enumerate(reader(), 1):
        if step % log_every == 0:
            now = datetime.datetime.now()
            t = time.time()
            elapsed = t - pre_t
            # Two clock reads can be equal on coarse timers; report an
            # infinite rate instead of raising ZeroDivisionError.
            rate = log_every / elapsed if elapsed > 0 else float('inf')
            print('{}, step={}, qps={} step/s'.format(now, step, rate))
            pre_t = t
43 |
44 |
def main(args):
    """Measure DistillReader throughput on randomly generated samples.

    Feed names, shapes, dtypes and fetch names come from
    `get_ins_predicts()`; `args.teacher_bs` sets the teacher batch size.
    """
    feed_names, feed_shapes, feed_dtypes, fetch_names = get_ins_predicts()
    print('{}, {}, {}, {}'.format(feed_names, feed_shapes, feed_dtypes,
                                  fetch_names))

    # 4096 random samples matching the teacher's feed signature.
    random_samples = sample_reader(feed_shapes, feed_dtypes, 1 << 12)

    distill = DistillReader(ins=feed_names, predicts=fetch_names)
    distill.set_teacher_batch_size(args.teacher_bs)
    #distill.set_fixed_teacher(['10.255.100.13:9494'])

    qps(distill.set_sample_generator(random_samples))
57 |
58 |
if __name__ == '__main__':
    import argparse

    # Single option: the batch size used when querying the teacher.
    cli = argparse.ArgumentParser(description='qps test')
    cli.add_argument(
        '--teacher_bs',
        type=int,
        default=1,
        help='teacher batch_size [default: %(default)s]')
    main(cli.parse_args())
69 |
--------------------------------------------------------------------------------
/example/distill/qps_tools/parse_config.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from paddle_serving_client import Client as ServingClient
17 |
18 |
def get_ins_predicts(conf_file=None):
    """Read feed/fetch metadata from a serving client config file.

    This helper may be deprecated in the future.

    The config file is resolved in this order:
      1. `conf_file`, when given and it is an existing file;
      2. './serving_conf/serving_client_conf.prototxt', when it exists;
      3. the path in the PADDLE_DISTILL_CONF_FILE environment variable.

    Args:
        conf_file: optional path to the serving client config.

    Returns:
        (feeds, feeds_shapes, feeds_dtype, predicts): feed names, their
        shapes as tuples, their dtype names, and the fetch names.

    Raises:
        AssertionError: if no existing config file can be resolved.
    """
    # Indexed by the integer type codes stored in client.feed_types_.
    client_types = ['int64', 'float32']
    default_conf = './serving_conf/serving_client_conf.prototxt'

    if conf_file is None or not os.path.isfile(conf_file):
        if os.path.isfile(default_conf):
            conf_file = default_conf
        else:
            conf_file = os.getenv('PADDLE_DISTILL_CONF_FILE')
            assert conf_file is not None, \
                'no serving client config: pass conf_file or set ' \
                'PADDLE_DISTILL_CONF_FILE'
            assert os.path.isfile(conf_file), \
                'serving client config not found: {}'.format(conf_file)

    client = ServingClient()
    client.load_client_config(conf_file)

    feeds = client.get_feed_names()
    feeds_shapes = [tuple(client.feed_shapes_[name]) for name in feeds]
    feeds_dtype = [client_types[client.feed_types_[name]] for name in feeds]

    predicts = client.get_fetch_names()
    return feeds, feeds_shapes, feeds_dtype, predicts
45 |
--------------------------------------------------------------------------------
/example/distill/qps_tools/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # local test qps
# Environment consumed by the distill reader during the local QPS test.
export PADDLE_DISTILL_BALANCE_SERVER='10.255.100.13:9379'
export PADDLE_DISTILL_SERVICE_NAME=MnistDistill
export PADDLE_DISTILL_MAX_TEACHER=1
export PADDLE_DISTILL_CONF_FILE="$PWD/../reader_demo/serving_conf/serving_client_conf.prototxt"

# Run the benchmark once per teacher batch size, with a blank line between runs.
for x in 1 2 4 8 16 24 32; do
    echo "-------- batch_size=$x ---------"
    python distill_reader_qps.py --teacher_bs $x
    echo
done
29 |
--------------------------------------------------------------------------------
/example/distill/reader_demo/run_demo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -eu

# Fetch the teacher model once; later runs reuse the downloaded archive.
if [ ! -f mnist_cnn_model.tar.gz ]; then
    wget --no-check-certificate https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz
fi
tar -zxf mnist_cnn_model.tar.gz

# at gpu 0, start paddle serving server on port 9292
port=9292
nohup python -m paddle_serving_server_gpu.serve \
    --model mnist_cnn_model \
    --thread 4 \
    --port ${port} \
    --mem_optim True \
    --gpu_ids 0 &
serving_pid=$!

# Kill the serving server's process tree on every exit path. With `set -e`, a
# failing demo run would otherwise leave the server running in the
# background. `|| true` keeps the demo's exit status as the script status.
stop_serving() {
    pstree -p "${serving_pid}" | awk -F"[()]" '{print $2}' | xargs kill -9 || true
}
trap stop_serving EXIT

python distill_reader_demo.py --distill_teachers 127.0.0.1:${port}
38 |
--------------------------------------------------------------------------------
/example/distill/resnet/README.md:
--------------------------------------------------------------------------------
1 | # ResNeXt101_32x16d_wsl distill ResNet50_vd
2 |
3 | ## Local test
4 | ### Start a local teacher
5 | Start the ResNeXt101_32x16d_wsl teacher on GPU 1:
6 | ``` bash
7 | bash ./scripts/start_local_teacher.sh
8 | ```
9 | ### Train the student with the local teacher
10 | In another terminal, train the ResNet50_vd student on GPU 0:
11 | ``` bash
12 | bash ./scripts/train_student.sh
13 | ```
14 |
--------------------------------------------------------------------------------
/example/distill/resnet/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .vgg import VGG11, VGG13, VGG16, VGG19
16 | from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152
17 | from .resnet_vd import ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd, ResNet200_vd
18 |
--------------------------------------------------------------------------------
/example/distill/resnet/scripts/start_local_teacher.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -eu

# Download the teacher model archive only when it is not already present,
# then unpack it into the current directory.
model_name=ResNeXt101_32x16d_wsl_model
archive="${model_name}.tar.gz"
[ -f "${archive}" ] || wget --no-check-certificate "https://paddle-edl.bj.bcebos.com/distill_teacher_model/${archive}"
tar -zxf "${archive}"

# Serve the teacher in the foreground on GPU 1, port 9898.
port=9898
python -m paddle_serving_server_gpu.serve \
    --model "${model_name}" \
    --thread 4 \
    --port ${port} \
    --mem_optim True \
    --gpu_ids 1
31 |
--------------------------------------------------------------------------------
/example/distill/resnet/scripts/train_student.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Unset proxy
unset https_proxy http_proxy

# Logging and runtime flags exported to the training processes.
export GLOG_v=1 GLOG_logtostderr=1
export FLAGS_eager_delete_tensor_gb=0
export NCCL_DEBUG=INFO

# Arguments for train_with_fleet.py; the teacher address matches the port
# used by scripts/start_local_teacher.sh.
train_args=(
    --model=ResNet50_vd
    --data_dir=./ImageNet
    --lr_strategy=cosine_warmup_decay
    --use_distill_service=True
    --distill_teachers=127.0.0.1:9898
)

python -m paddle.distributed.launch --selected_gpus 0 \
    ./train_with_fleet.py "${train_args[@]}"
32 |
--------------------------------------------------------------------------------
/example/distill/resnet/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/example/fit_a_line/Dockerfile:
--------------------------------------------------------------------------------
FROM paddlepaddle/paddlecloud-job:0.11.0

# Download the imikolov dataset at build time and convert the training split
# into RecordIO files under /data/recordio/imikolov.
RUN mkdir -p /data/recordio/imikolov && \
    python -c "import paddle; import paddle.v2.dataset as dataset; word_dict = dataset.imikolov.build_dict(); \
dataset.imikolov.train(word_dict, 5); dataset.imikolov.test(word_dict, 5); \
dataset.common.convert('/data/recordio/imikolov/', dataset.imikolov.train(word_dict, 5), 5000, 'imikolov-train')"

RUN mkdir -p /workspace
# COPY is preferred over ADD for a plain local file: ADD additionally unpacks
# archives and fetches URLs, neither of which is wanted here.
COPY train_ft.py /workspace/
9 |
--------------------------------------------------------------------------------
/example/fit_a_line/collector.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/collector.pyc
--------------------------------------------------------------------------------
/example/fit_a_line/del_jobs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Delete one training job and the Kubernetes objects created for it.
# $1: job name. Prints usage and exits non-zero when the name is missing.
function delete_job() {
    local jobname=${1:-}
    if [[ -z "$jobname" ]]; then
        echo "Usage: sh del_jobs.sh [all|<jobname>]"
        exit 1
    fi
    kubectl delete trainingjob "$jobname"
    kubectl delete job "$jobname-trainer"
    kubectl delete rs "$jobname-master" "$jobname-pserver"
}

# Delete every training job listed by `kubectl get trainingjob`.
function delete_all() {
    local job
    # tail skips the header row; awk keeps the NAME column.
    for job in $(kubectl get trainingjob | tail -n +2 | awk '{print $1}')
    do
        delete_job "$job"
    done
}

case "${1:-}" in
    all)
        delete_all
        ;;
    *)
        delete_job "${1:-}"
        ;;
esac
44 |
--------------------------------------------------------------------------------
/example/fit_a_line/examplejob.yaml:
--------------------------------------------------------------------------------
# Example TrainingJob: fault-tolerant job running /workspace/train_ft.py with
# 2-10 trainers, a fixed pair of parameter servers and one master.
apiVersion: paddlepaddle.org/v1
kind: TrainingJob
metadata:
  name: example
spec:
  image: "paddlepaddle/edl-example"
  port: 7164
  ports_num: 1
  ports_num_for_sparse: 1
  fault_tolerant: true
  trainer:
    entrypoint: "python /workspace/train_ft.py"
    workspace: "/workspace"
    passes: 50
    # Trainer count may vary between these two bounds.
    min-instance: 2
    max-instance: 10
    resources:
      requests:
        cpu: "200m"
        memory: "200Mi"
      limits:
        #alpha.kubernetes.io/nvidia-gpu: 1
        cpu: "200m"
        memory: "200Mi"
  pserver:
    # min == max: the parameter-server count is fixed at 2.
    min-instance: 2
    max-instance: 2
    resources:
      requests:
        cpu: "500m"
        memory: "600Mi"
      limits:
        cpu: "800m"
        memory: "1Gi"
  master:
    resources:
      requests:
        cpu: "500m"
        memory: "600Mi"
      limits:
        cpu: "1"
        memory: "1Gi"
43 |
--------------------------------------------------------------------------------
/example/fit_a_line/fluid/common.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import cPickle
17 | import paddle
18 | import glob
19 |
20 |
def prepare_dataset(output_path, name_prefix, reader_func, sample_count=128):
    """Split the samples of a reader into numbered pickle files.

    Files are written as '<output_path>/<name_prefix>-NNNNN.pickle', each
    holding a list of at most `sample_count` samples; the last file holds
    the remainder.

    Args:
        output_path: directory for the files; created when missing.
        name_prefix: file name prefix.
        reader_func: zero-argument callable returning an iterable of samples.
        sample_count: maximum number of samples per file (default 128).

    Raises:
        ValueError: if `sample_count` is not positive.
    """
    # Local import so the function runs on both Python 2 and Python 3.
    try:
        import cPickle as pickle_impl
    except ImportError:
        import pickle as pickle_impl

    if sample_count <= 0:
        raise ValueError("sample_count must be positive, got %r" %
                         (sample_count, ))

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    name_pattern = "%s/%s-%%05d.pickle" % (output_path, name_prefix)

    def _dump(samples, index):
        # Pickle data is binary: text mode corrupts it on some platforms and
        # is rejected outright on Python 3.
        with open(name_pattern % index, "wb") as f:
            pickle_impl.dump(samples, f)

    chunk = []
    file_index = 0
    for sample in reader_func():
        chunk.append(sample)
        # Flush as soon as the chunk is full, so every file (including the
        # first) holds exactly sample_count samples except possibly the last.
        if len(chunk) >= sample_count:
            _dump(chunk, file_index)
            chunk = []
            file_index += 1
    if chunk:
        _dump(chunk, file_index)
36 | with open(suffix % indx_f, "w") as f:
37 | cPickle.dump(lines, f)
38 |
39 |
def cluster_reader(files_path, trainers, trainer_id):
    """Build a reader over this trainer's share of the pickled data files.

    Files matching the glob pattern `files_path` are sorted by name; the
    file at sorted position `idx` belongs to this trainer when
    `idx % trainers == trainer_id`.

    Args:
        files_path: glob pattern of the pickle files to read.
        trainers: total number of trainers.
        trainer_id: index of the current trainer.

    Returns:
        A zero-argument callable returning a generator that yields every
        item of every list stored in the selected files, in file order.
    """
    # Local import so the function runs on both Python 2 and Python 3.
    try:
        import cPickle as pickle_impl
    except ImportError:
        import pickle as pickle_impl

    def reader():
        flist = sorted(glob.glob(files_path))
        my_file_list = []
        for idx, fn in enumerate(flist):
            if idx % trainers == trainer_id:
                print("append file for current trainer: %s" % fn)
                my_file_list.append(fn)

        for fn in my_file_list:
            print("processing file: ", fn)
            # Pickle data is binary, so read it in "rb" mode.
            # SECURITY: unpickling can execute arbitrary code; only point
            # this reader at files produced by a trusted source.
            with open(fn, "rb") as f:
                lines = pickle_impl.load(f)
            # The file is closed before yielding, so no handle stays open
            # while the consumer is paused.
            for line in lines:
                yield line

    return reader
56 |
57 | return reader
58 |
--------------------------------------------------------------------------------
/example/fit_a_line/fluid/image/infer_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/fluid/image/infer_3.png
--------------------------------------------------------------------------------
/example/fit_a_line/fluid/image/ranges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/fluid/image/ranges.png
--------------------------------------------------------------------------------
/example/fit_a_line/nginx.yaml:
--------------------------------------------------------------------------------
# nginx Deployment with five replicas, each pod carrying fixed CPU/memory
# requests and limits.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  replicas: 5  # run 5 pods matching the template
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.7.9
        ports:
        - containerPort: 80
        resources:
          requests:
            cpu: "400m"
            memory: "100Mi"
          limits:
            cpu: "600m"
            memory: "200Mi"
27 |
--------------------------------------------------------------------------------
/k8s/edl_controller.yaml:
--------------------------------------------------------------------------------
# Single-replica Deployment of the EDL controller in the "paddlecloud"
# namespace.
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: training-job-controller
  namespace: paddlecloud
spec:
  replicas: 1
  template:
    metadata:
      labels:
        name: training-job-controller
    spec:
      containers:
      - name: training-job-controller
        image: yancey1989/edl-controller
        # Both proxy variables are set to the empty string for the controller.
        env:
        - name: https_proxy
          value: ""
        - name: http_proxy
          value: ""
        command:
        - "/usr/local/bin/edl"
        - "-logtostderr"
        - "-log_level"
        - "debug"
        - "-max_load_desired"
        - "0.9"
22 |
--------------------------------------------------------------------------------
/k8s/rbac_admin.yaml:
--------------------------------------------------------------------------------
# Grants the cluster-admin ClusterRole to the "default" service account of
# the "default" namespace.
apiVersion: rbac.authorization.k8s.io/v1alpha1
kind: ClusterRoleBinding
metadata:
  name: cluster-admin--default-system:default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: default
  namespace: default
---
# Same grant for the "default" service account of the "paddlecloud" namespace.
apiVersion: rbac.authorization.k8s.io/v1alpha1
kind: ClusterRoleBinding
metadata:
  name: cluster-admin--paddlecloud-system:default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
- kind: ServiceAccount
  name: default
  namespace: paddlecloud
26 |
--------------------------------------------------------------------------------
/k8s/thirdpartyresource.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: extensions/v1beta1
2 | kind: ThirdPartyResource
3 | metadata:
4 | name: training-job.paddlepaddle.org
5 | description: "PaddlePaddle TrainingJob operator"
6 | versions:
7 | - name: v1
8 |
--------------------------------------------------------------------------------
/logo/edl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/logo/edl.png
--------------------------------------------------------------------------------
/logo/paddle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/logo/paddle.png
--------------------------------------------------------------------------------
/python/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh discovery/*.py distill/*.py distill/redis/*.py setup.py)
2 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
3 | add_custom_command(
4 | OUTPUT ${EDL_BINARY_DIR}/.timestamp
5 | COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
6 | COMMAND python3.6 ./setup.py bdist_wheel --universal
7 | DEPENDS ${EDL_FILES})
8 | add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
9 | add_subdirectory(edl/tests/unittests)
10 |
--------------------------------------------------------------------------------
/python/edl/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/collective/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/collective/dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
class FileSplitter(object):
    """
    Interface that user-defined file splitters should inherit from.

    A splitter is bound to a single data file and, when iterated, produces
    the records of that file, which lets the framework know which data file
    is being processed. ``TxtFileSplitter`` is an example implementation.
    """

    def __init__(self, data_file):
        # Path of the data file this splitter iterates over.
        self._data_file = data_file

    def __iter__(self):
        """
        Subclasses must yield ``(idx, record_data)`` pairs.
        """
        raise NotImplementedError()


class TxtFileSplitter(FileSplitter):
    """
    Splitter for plain text files: every non-blank line is one record.
    """

    def __init__(self, data_file):
        super(TxtFileSplitter, self).__init__(data_file)

    def __iter__(self):
        """
        Yield ``(idx, line)`` for each non-blank line of the file.

        Lines are stripped of surrounding whitespace; lines that are empty
        after stripping are skipped and do not consume an index. Indices
        start at 1.
        """
        with open(self._data_file, "r") as text_file:
            stripped_lines = (raw.strip() for raw in text_file)
            records = (rec for rec in stripped_lines if len(rec) > 0)
            for record_no, rec in enumerate(records, 1):
                yield record_no, rec
46 |
--------------------------------------------------------------------------------
/python/edl/collective/launch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | paddle.distributed.launch is a module that spawns multiple distributed
16 | processes on each training node for GPU training.
17 | """
18 |
19 | from __future__ import print_function
20 |
21 | import sys
22 | from edl.utils import args_utils
23 | from edl.utils import env as edl_env
24 | from edl.utils import etcd_db
25 | from edl.utils import launcher as edl_launcher
26 | from edl.utils import log_utils
27 | from edl.utils import status as edl_status
28 | from edl.utils.log_utils import logger
29 | from edl.utils import pod as edl_pod
30 |
31 |
def main():
    """
    Entry point of the collective launcher.

    Parses the command line, builds the job environment, connects to the
    job's etcd store and exits immediately with status 0 when the stored job
    status is already SUCCEED. Otherwise it builds the local pod description
    and hands control to the launcher.
    """
    # NOTE(review): 10 is the numeric value of stdlib logging.DEBUG;
    # presumably get_logger takes stdlib levels -- confirm.
    log_utils.get_logger(log_level=10)

    args = args_utils.parse_args()

    # Job environment, built from the parsed arguments.
    job_env = edl_env.JobEnv(args_utils.convert_args_to_dict(args))
    logger.info("get job env:{}".format(str(job_env)))

    # Global etcd handle for this job.
    etcd = etcd_db.get_global_etcd(job_env.etcd_endpoints, job_env.job_id)

    # Nothing to do when a previous run already finished the job.
    previous_status = edl_status.load_job_status_from_etcd(etcd, timeout=30)
    if previous_status == edl_status.Status.SUCCEED:
        logger.info("job:{} has completed! Need't try!".format(job_env.job_id))
        sys.exit(0)

    # Local pod; its id does not change.
    pod = edl_pod.Pod()
    pod.from_env(job_env)

    job_launcher = edl_launcher.Launcher(
        job_env=job_env, pod=pod, etcd=etcd, args=args
    )
    job_launcher.init()
    job_launcher.launch()
56 |
57 |
58 | if __name__ == "__main__":
59 | main()
60 |
--------------------------------------------------------------------------------
/python/edl/collective/serializable.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from edl.utils.json_serializable import SerializableBase
16 |
17 | __all__ = ["SerializableBase"]
18 |
--------------------------------------------------------------------------------
/python/edl/discovery/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/discovery/server_alive.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import socket
16 | from contextlib import closing
17 |
18 |
def is_server_alive(server):
    """
    Probe a TCP server with a short-lived IPv4 connection.

    Args:
        server: endpoint string of the form ``"ip:port"``.

    Returns:
        A tuple ``(alive, client_addr)``. ``alive`` is False when any socket
        error occurs during the probe. ``client_addr`` is the local address
        of the probing socket as returned by ``getsockname()``, or None when
        the connection was not established.
    """
    host, port_text = server.split(":")
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    reachable = True
    local_addr = None
    try:
        # Bound the probe so an unresponsive host cannot block the caller.
        probe.settimeout(1.5)
        probe.connect((host, int(port_text)))
        local_addr = probe.getsockname()
        probe.shutdown(socket.SHUT_RDWR)
    except socket.error:
        reachable = False
    finally:
        probe.close()
    return reachable, local_addr
35 |
--------------------------------------------------------------------------------
/python/edl/distill/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/distill/redis/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/distill/redis/redis_store.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import redis
16 |
17 |
class RedisStore(object):
    """
    Small service registry stored in Redis.

    Each server of a service is stored as a hash under the key
    ``/service/<service_name>/nodes/<server>`` with the fields ``server``
    and ``info``. Entries are given a time-to-live so they disappear unless
    they are refreshed.
    """

    def __init__(self, ip="127.0.0.1", port=6379, passwd=None):
        """
        Create the Redis client.

        Args:
            ip: Redis host.
            port: Redis port.
            passwd: Redis password, or None.
        """
        self._ip = ip
        self._port = port
        self._passwd = passwd
        # decode_responses=True makes the client return str instead of bytes.
        self._redis = redis.Redis(
            host=ip, port=port, password=passwd, decode_responses=True
        )
        print("connected to redis ip:{} port:{}".format(ip, port))

    def get_service(self, service_name):
        """
        Return the stored hash of every server registered for a service.

        Args:
            service_name: name of the service.

        Returns:
            A list with one ``hgetall`` result per matching node key.
        """
        pattern = "/service/{}/nodes/*".format(service_name)
        return [self._redis.hgetall(key) for key in self._redis.scan_iter(pattern)]

    def remove_service(self, service_name):
        """Delete every key stored under ``/service/<service_name>/``."""
        for key in self._redis.scan_iter("/service/{}/*".format(service_name)):
            self._redis.delete(key)

    def set_server(self, service_name, server, info, ttl=6):
        """
        Register (or overwrite) a server entry and set its expiry.

        Args:
            service_name: name of the service.
            server: server identifier, used as the last key component.
            info: value stored in the ``info`` field.
            ttl: value passed to ``expire`` for the entry.
        """
        server_info = {"server": server, "info": info}
        key = "/service/{}/nodes/{}".format(service_name, server)
        # The write and the expiry are issued as two separate commands.
        self._redis.hmset(key, server_info)
        self._redis.expire(key, ttl)

    def remove_server(self, service_name, server):
        """Delete the entry of a single server."""
        self._redis.delete("/service/{}/nodes/{}".format(service_name, server))

    def refresh(self, service_name, server, info=None, ttl=6):
        """
        Keep a server entry alive.

        Args:
            service_name: name of the service.
            server: server identifier.
            info: when not None, the entry is rewritten with this info.
            ttl: new expiry passed to ``expire``.

        Returns:
            True when the entry was written or its expiry was extended;
            False when the client reports a negative (or missing) TTL for
            the key, in which case nothing is changed.
        """
        if info is not None:
            # Call through the bound method. The previous code passed
            # ``self`` a second time, which raised TypeError on every
            # refresh that carried new info.
            self.set_server(service_name, server, info, ttl)
            return True
        key = "/service/{}/nodes/{}".format(service_name, server)
        remaining = self._redis.ttl(key)
        # NOTE(review): some redis-py versions reportedly return None instead
        # of a negative number here -- treated the same; confirm.
        if remaining is None or remaining < 0:
            return False
        self._redis.expire(key, ttl)
        return True

    def get_client(self, client):
        # Todo
        pass

    def set_client(self, client, service_name):
        # Todo
        pass
66 |
67 | if __name__ == "__main__":
68 | service_name = "TestService"
69 | store = RedisStore("127.0.0.1", 6379)
70 | print(store.get_service(service_name))
71 | store.set_server(service_name, "127.0.0.1:5454", "{cpu: 10%, gpu: 20%}")
72 | print(store.get_service(service_name))
73 |
--------------------------------------------------------------------------------
/python/edl/distill/timeline.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import sys
17 | import time
18 |
19 |
20 | class _NopTimeLine(object):
21 | def record(self, name):
22 | pass
23 |
24 | def reset(self):
25 | pass
26 |
27 |
28 | class _RealTimeLine(object):
29 | def __init__(self):
30 | self.pid = os.getpid()
31 | self.time = time.time()
32 |
33 | def record(self, name):
34 | new_time = time.time()
35 | sys.stderr.write(
36 | "pid={} op={} time={}ms\n".format(
37 | self.pid, name, (new_time - self.time) * 1000
38 | )
39 | )
40 | self.time = new_time
41 |
42 | def reset(self):
43 | self.time = time.time()
44 |
45 |
46 | _is_profile = int(os.environ.get("DISTILL_READER_PROFILE", 0))
47 | _TimeLine = _RealTimeLine if _is_profile else _NopTimeLine
48 |
--------------------------------------------------------------------------------
/python/edl/distill/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from paddle.distributed.fs_wrapper import BDFS
17 |
18 |
def download_hdfs_file(model_name, dst_path):
    """
    Download the serving configuration of a teacher model from HDFS.

    The HDFS connection settings are read from the environment variables
    PADDLE_DISTILL_HDFS_NAME, PADDLE_DISTILL_HDFS_UGI and
    PADDLE_DISTILL_HDFS_PATH; each of them must be set to a non-empty value.

    Args:
        model_name: teacher model name; used as a sub-directory of the
            configured HDFS path.
        dst_path: destination directory name.

    Raises:
        AssertionError: when one of the required variables is missing.
    """
    required = (
        ("PADDLE_DISTILL_HDFS_NAME", "hdfs_name must be set"),
        ("PADDLE_DISTILL_HDFS_UGI", "hdfs_ugi must be set"),
        ("PADDLE_DISTILL_HDFS_PATH", "hdfs_path must be set"),
    )
    # Read everything first, then validate in the declared order.
    values = [os.getenv(variable) for variable, _ in required]
    for value, (_, message) in zip(values, required):
        assert value, message
    hdfs_name, hdfs_ugi, hdfs_path = values

    remote_fs = BDFS(hdfs_name, hdfs_ugi)

    proto_path = "/".join((hdfs_path, model_name, "serving_server_conf.prototxt"))
    remote_fs.download(proto_path, dst_path)
35 |
--------------------------------------------------------------------------------
/python/edl/liveft/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/liveft/launch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import sys
16 | import signal
17 |
18 | from edl.liveft.elastic import ElasticManager
19 | from edl.liveft.elastic import LauncherInterface
20 | from edl.liveft.elastic import ElasticStatus
21 | from edl.liveft.elastic import ELASTIC_EXIT_CODE
22 |
23 |
def launch():
    """
    Run the elastic launch loop of this node.

    Installs the manager's signal handler for SIGTERM, SIGABRT and SIGINT,
    then repeatedly waits, runs the launcher and watches until the watch
    result ends the loop or the process:

    * COMPLETED or EXIT leave the loop;
    * HOLD (and any result not listed here) starts another round;
    * ERROR exits the process with status 3;
    * RESTART exits the process with ELASTIC_EXIT_CODE.

    After the loop the process exits with ``128 + elastic.sigint`` when that
    value is positive, otherwise with 0.
    """
    # (The pserver entry point, launch_ps(), is currently not used here.)
    elastic = ElasticManager()

    for signum in (signal.SIGTERM, signal.SIGABRT, signal.SIGINT):
        signal.signal(signum, elastic.signal_handler)

    while True:
        # Wait for all nodes to be ready to run.
        elastic.wait()

        # Run self with the specified launcher.
        elastic.run(LauncherInterface)

        # Watch our own health and get notified of other nodes' failures.
        status = elastic.watch()
        if status == ElasticStatus.COMPLETED:
            break
        elif status == ElasticStatus.HOLD:
            continue
        elif status == ElasticStatus.EXIT:
            break
        elif status == ElasticStatus.ERROR:
            sys.exit(3)
        elif status == ElasticStatus.RESTART:
            sys.exit(ELASTIC_EXIT_CODE)

    exit_code = 0
    if int(elastic.sigint) > 0:
        exit_code = 128 + int(elastic.sigint)
    sys.exit(exit_code)
60 |
61 |
62 | if __name__ == "__main__":
63 | launch()
64 |
--------------------------------------------------------------------------------
/python/edl/protos/common.proto:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | syntax = "proto3";
16 |
17 | option go_package = ".;masterpb";
18 | package common;
19 |
20 | message Status {
21 | string type = 1;
22 | string detail = 2;
23 | }
24 |
25 | message EmptyRet { Status status = 1; }
26 |
--------------------------------------------------------------------------------
/python/edl/protos/data_server.proto:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | syntax = "proto3";
16 |
17 | import "common.proto";
18 |
19 | option go_package = ".;masterpb";
20 | package data_server;
21 |
22 | message ShutDownRequest {}
23 | message EmptyRequest {}
24 |
25 | message FileListElement {
26 | int64 idx = 1;
27 | string path = 2;
28 | }
29 |
30 | message FileListRequest {
31 | string pod_id = 1;
32 | string reader_name = 2;
33 | // used for verification
34 | repeated FileListElement file_list = 3;
35 | }
36 |
37 | message FileListResponse {
38 | common.Status status = 1;
39 | repeated FileListElement file_list = 2;
40 | }
41 |
42 | message Record {
43 | int64 record_no = 1;
44 | repeated bytes field_data = 2;
45 | }
46 |
47 | message BatchData {
48 | string batch_data_id = 1;
49 | repeated Record records = 2;
50 | }
51 |
52 | message BatchDataMeta {
53 | string reader_name = 1;
54 | string producer_pod_id = 2;
55 | string consumer_pod_id = 3;
56 | string data_server_endpoint = 4;
57 |
58 | repeated string batch_data_ids = 5;
59 | }
60 |
61 | message ReportBatchDataMetaRequest {
62 | string reader_name = 1;
63 | string pod_id = 2;
64 | string data_server_endpoint = 3;
65 |
66 | repeated string batch_data_ids = 4;
67 | }
68 |
69 | message GetBatchDataMetaRequest {
70 | string reader_name = 1;
71 | string pod_id = 2;
72 | }
73 |
74 | message ReachDataEndRequest {
75 | string reader_name = 1;
76 | string pod_id = 2;
77 | }
78 |
79 | message BatchDataMetaResponse {
80 | common.Status status = 1;
81 | repeated BatchDataMeta data = 2;
82 | }
83 |
84 | message BatchDataResponse {
85 | common.Status status = 1;
86 | repeated BatchData data = 2;
87 | }
88 |
89 | service DataServer {
90 | // only leader can do this
91 | rpc ReportBatchDataMeta(ReportBatchDataMetaRequest)
92 | returns (common.EmptyRet) {}
93 | rpc ReachDataEnd(ReachDataEndRequest) returns (common.EmptyRet) {}
94 | rpc GetBatchDataMeta(GetBatchDataMetaRequest)
95 | returns (BatchDataMetaResponse) {}
96 |
97 | // all data servers can do this
98 | rpc GetFileList(FileListRequest) returns (FileListResponse) {}
99 | rpc GetBatchData(BatchDataMeta) returns (BatchDataResponse) {}
100 | }
101 |
--------------------------------------------------------------------------------
/python/edl/protos/generate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | set -xe
18 |
19 | #TODO(gongwb): reopen them when async training
20 | #pushd /tmp/
21 | #go get -u -v github.com/golang/protobuf/protoc-gen-go@v1.3.0
22 | #popd
23 |
24 | #protoc --go_out=plugins=grpc:./ master.proto
25 | #protoc --go_out=plugins=grpc:./ common.proto
26 |
27 | #mkdir -p ../../../pkg/masterpb
28 | #mv *.go ../../../pkg/masterpb
29 |
30 | # see build.sh to get the package version
31 | which python
32 | python ./run_codegen.py
33 |
34 | # generate Python-compatible import paths
35 | sed -i -r 's/^import (.+_pb2.*)/from . import \1/g' ./*_pb2*.py
36 |
37 | # import os
38 | mv pod_server*.py data_server*.py common*.py ../utils/
39 | mv distill_discovery*.py ../distill/
40 |
--------------------------------------------------------------------------------
/python/edl/protos/pod_server.proto:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | syntax = "proto3";
16 |
17 | import "common.proto";
18 |
19 | option go_package = ".;masterpb";
20 | package pod_server;
21 |
22 | message BarrierRequest {
23 | string job_id = 1;
24 | string pod_id = 2;
25 | }
26 |
27 | message BarrierResponse {
28 | common.Status status = 1;
29 | string cluster_json = 2;
30 | }
31 |
32 | message ScaleInRequest { int32 num = 1; }
33 |
34 | message ScaleOutRequest {}
35 |
36 | service PodServer {
37 | rpc Barrier(BarrierRequest) returns (BarrierResponse) {}
38 |
39 | // Cluster controller -> master
40 | rpc ScaleOut(ScaleOutRequest) returns (common.Status) {}
41 | rpc ScaleIn(ScaleInRequest) returns (common.Status) {}
42 | }
43 |
--------------------------------------------------------------------------------
/python/edl/protos/run_codegen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Copyright 2015 gRPC authors.
16 | #
17 | # Licensed under the Apache License, Version 2.0 (the "License");
18 | # you may not use this file except in compliance with the License.
19 | # You may obtain a copy of the License at
20 | #
21 | # http://www.apache.org/licenses/LICENSE-2.0
22 | #
23 | # Unless required by applicable law or agreed to in writing, software
24 | # distributed under the License is distributed on an "AS IS" BASIS,
25 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 | # See the License for the specific language governing permissions and
27 | # limitations under the License.
28 | """Runs protoc with the gRPC plugin to generate messages and gRPC stubs."""
29 |
30 | from grpc_tools import protoc
31 | import pkg_resources
32 | import sys
33 |
print("run code gen python verion:", sys.version_info)

# Each call below is the in-process equivalent of:
#   python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. <file>.proto
_OUTPUT_FLAGS = ("--python_out=.", "--grpc_python_out=.")

# Protos that only need the current directory on the include path.
for _proto_file in ("common.proto", "pod_server.proto", "data_server.proto"):
    protoc.main(("", "-I.") + _OUTPUT_FLAGS + (_proto_file,))

# distill_discovery.proto additionally gets the proto directory bundled with
# grpc_tools on its include path.
proto_include = pkg_resources.resource_filename("grpc_tools", "_proto")
protoc.main(
    ("", "-I.", "-I{}".format(proto_include))
    + _OUTPUT_FLAGS
    + ("distill_discovery.proto",)
)
54 |
--------------------------------------------------------------------------------
/python/edl/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/data_server/a.txt:
--------------------------------------------------------------------------------
1 | a0
2 | a1
3 | a2
4 | a3
5 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/data_server/b.txt:
--------------------------------------------------------------------------------
1 | b0
2 | b1
3 | b2
4 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/del_from_etcd.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | from edl.utils import constants
17 | from edl.discovery import etcd_client
18 |
# Fallback endpoint used when PADDLE_ETCD_ENDPOINTS is unset or empty.
g_etcd_endpoints = "127.0.0.1:2379"

# The job id is mandatory: it is passed as the etcd root whose keys get cleaned.
job_id = os.environ["PADDLE_JOB_ID"]

# Honour the endpoints supplied by the environment instead of reading them and
# then ignoring them in favour of the hard-coded default.
# NOTE(review): assumes several endpoints are comma-separated -- confirm.
etcd_endpoints = os.environ.get("PADDLE_ETCD_ENDPOINTS", g_etcd_endpoints)
endpoint_list = [e.strip() for e in etcd_endpoints.split(",") if e.strip()]
if not endpoint_list:
    endpoint_list = [g_etcd_endpoints]

etcd = etcd_client.EtcdClient(endpoint_list, root=job_id)
etcd.init()
constants.clean_etcd(etcd)
26 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/distill_reader_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import edl.distill.distill_reader as distill_reader
16 | import numpy as np
17 | import unittest
18 |
19 |
class TestDistillReader(unittest.TestCase):
    """Drives a DistillReader over a synthetic 1x28x28 image feed."""

    def test_distill_reader(self):
        # temp local test
        distill_reader.distill_worker._NOP_PREDICT_TEST = True

        # test mnist distill reader
        def _reader():
            """Yield 24 lists of 8 samples, then one final list of 2 samples."""
            pixels = [(col + 1) / 28.0 for col in range(28)] * 28
            img = np.array(pixels, dtype=np.float32).reshape((1, 28, 28))
            label = np.array([100], dtype=np.int64)
            for _ in range(24):
                yield 8 * [(img, label)]
            yield 2 * [(img, label)]

        dr = distill_reader.DistillReader(ins=["img", "label"], predicts=["prediction"])
        dr.set_teacher_batch_size(4)
        dr.set_fixed_teacher(["127.0.0.1:9292", "127.0.0.1:9293"])
        # dr.set_dynamic_teacher(['127.0.0.1:7001'], 'DistillReaderTest', 3)

        train_reader = dr.set_sample_list_generator(_reader)
        dr.print_config()

        for epoch in range(300):
            for step, batch in enumerate(train_reader()):
                # Dump the configuration once more after the very first batch.
                if (epoch, step) == (0, 0):
                    dr.print_config()
            # Every tenth epoch, show a value from the last batch of the epoch.
            if epoch % 10 != 0:
                continue
            message = "^^^^^^^^^^^^^ epoch={} predict[0][0]={}^^^^^^^^^^^^^^".format(
                epoch, batch[-1][-1][0]
            )
            print(message)


if __name__ == "__main__":
    unittest.main()
57 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/edl_demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import os
15 |
# Print the pod id taken from the environment (empty string when unset),
# followed by a fixed marker.
pod_id = os.environ.get("PADDLE_POD_ID", "")
print("{}__edl_demo__".format(pod_id))
18 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/etcd_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
name=${TEST_TARGET_NAME}
TEST_TIMEOUT=${TEST_TIMEOUT}

# Both variables are required. Name them literally in the messages: expanding
# them here would only print the empty value that triggered the error.
if [[ -z "${name}" ]]; then
    echo "can't find TEST_TARGET_NAME, please set TEST_TARGET_NAME first"
    exit 1
fi

if [[ -z "${TEST_TIMEOUT}" ]]; then
    echo "can't find TEST_TIMEOUT, please set TEST_TIMEOUT first"
    exit 1
fi

# rm flag file
rm -f "${name}"_*.log

# start the unit test; it gets 10 seconds less than TEST_TIMEOUT
run_time=$(( TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"

timeout -s SIGKILL "${run_time}" "${PYTHON_EXECUTABLE}" -u "${name}.py" > "${name}_run.log" 2>&1
exit_code=$?

if [[ ${exit_code} -eq 0 ]]; then
    exit 0
fi

# Only report a failure when there actually was one.
echo "${name} failed with ${exit_code}"

echo "${name} log"
for log in ./"${name}"_*.log
do
    # No comma between the format and its argument: printf would print it.
    printf "\ncat %s\n" "${log}"
    cat -n "${log}"
done

exit 1
53 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/etcd_test_base.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import copy
16 | import edl.utils.constants as constants
17 | import edl.utils.log_utils as log_utils
18 | import os
19 | import unittest
20 | from edl.discovery.etcd_client import EtcdClient
21 | from edl.utils import env as edl_env
22 |
g_etcd_endpoints = "127.0.0.1:2379"


class EtcdTestBase(unittest.TestCase):
    """Base fixture: connects to etcd and installs a fixed job environment."""

    def setUp(self, job_id):
        """Prepare etcd and ``os.environ`` for a test running as ``job_id``.

        Subclasses call this with the job id, which is used both as the etcd
        root and as ``PADDLE_JOB_ID``.
        """
        # 10 is presumably logging.DEBUG -- confirm against log_utils.
        log_utils.get_logger(log_level=10)

        client = EtcdClient([g_etcd_endpoints], root=job_id)
        self._etcd = client
        client.init()

        # Snapshot of the environment, restored in tearDown.
        self._old_environ = dict(os.environ)

        proc_env = dict(
            PADDLE_TRAINER_ID="0",
            PADDLE_RUNNING_PLATFORM="PADDLE_CLOUD",
            PADDLE_JOB_ID=job_id,
            PADDLE_EDL_HDFS_HOME="/usr/local/hadoop-2.7.7",
            PADDLE_EDL_HDFS_NAME="",
            PADDLE_EDL_HDFS_UGI="",
            PADDLE_EDL_HDFS_PATH="test_register_path",
            PADDLE_EDL_ONLY_FOR_CE_TEST="1",
            PADDLE_EDL_FS_CACHE=".test_register_cache",
            PADDLE_EDL_SAVE_CHECKPOINT_INTER="0",
            PADDLE_EDL_NODES_RANGE="1:4",
            PADDLE_EDL_NPROC_PERNODE="1",
            PADDLE_ETCD_ENDPOINTS="127.0.0.1:2379",
            PADDLE_EDLNODES_RANAGE="2:2",
            CUDA_VISIBLE_DEVICES="0",
            PADDLE_TRAINER_PORTS="6670",
        )
        # Drop proxy settings before installing the job environment.
        for proxy_var in ("https_proxy", "http_proxy"):
            os.environ.pop(proxy_var, None)
        os.environ.update(proc_env)

        self._job_env = edl_env.JobEnv(None)
        constants.clean_etcd(client)

    def tearDown(self):
        """Restore the saved environment and clean the etcd keys again."""
        saved = self._old_environ
        os.environ.clear()
        os.environ.update(saved)
        constants.clean_etcd(self._etcd)
62 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/launch_demo.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import sys
17 |
# Terminate with the status code given in PADDLE_DEMO_EXIT_CODE.
exit_code = int(os.environ.get("PADDLE_DEMO_EXIT_CODE"))
print("exit code:", exit_code)
raise SystemExit(exit_code)
21 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/master_client_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 | import paddle_edl.utils.master_pb2 as master_pb2
17 | import unittest
18 | from edl.utils.master_client import Client
19 | from edl.utils.utils import get_file_list, get_logger
20 |
21 | os.environ["https_proxy"] = ""
22 | os.environ["http_proxy"] = ""
23 |
24 |
class TestMasterClient(unittest.TestCase):
    """Exercises Client.add_dataset against a master at 127.0.0.1:8080."""

    def setUp(self):
        self._client = Client("127.0.0.1:8080")

    def test_add_dataset(self):
        """Adding a dataset succeeds once and is rejected the second time."""
        dataset = master_pb2.DataSet()
        dataset.name = "train"
        for t in get_file_list("./test_file_list.txt"):
            dataset.file_list.append(t[0])

        # unittest assertions are used instead of bare `assert` so the checks
        # still run under `python -O` and report the offending values.
        res = self._client.add_dataset(dataset)
        self.assertTrue(res is None or res.type == "", "must not any error")

        res = self._client.add_dataset(dataset)
        # Guard against a None result before reading `.type`.
        self.assertIsNotNone(res, "must error")
        self.assertEqual(res.type, "DuplicateInitDataSet", "must error")


if __name__ == "__main__":
    logger = get_logger(10)
    unittest.main()
45 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/serving_conf/serving_client_conf.prototxt:
--------------------------------------------------------------------------------
1 | feed_var {
2 | name: "img"
3 | alias_name: "img"
4 | is_lod_tensor: false
5 | feed_type: 1
6 | shape: 1
7 | shape: 28
8 | shape: 28
9 | }
10 | fetch_var {
11 | name: "fc_0.tmp_2"
12 | alias_name: "prediction"
13 | is_lod_tensor: false
14 | fetch_type: 1
15 | shape: 10
16 | }
17 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_cluster.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 | from edl.tests.unittests import etcd_test_base
17 | from edl.utils import cluster as edl_cluster
18 |
19 |
class TestCluster(etcd_test_base.EtcdTestBase):
    """Round-trips a Cluster through its JSON form."""

    def setUp(self):
        super(TestCluster, self).setUp("test_cluster")

    def test_cluster_basic(self):
        """A Cluster rebuilt from another's JSON compares equal to it."""
        source = edl_cluster.Cluster()
        rebuilt = edl_cluster.Cluster()

        serialized = source.to_json()
        rebuilt.from_json(serialized)

        self.assertEqual(source, rebuilt)


if __name__ == "__main__":
    unittest.main()
34 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_consistent_hash.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import copy
16 | import six
17 | import unittest
18 | from edl.discovery.consistent_hash import ConsistentHash
19 |
20 |
class TestConsistentHash(unittest.TestCase):
    """Checks balance and monotonicity of ConsistentHash node assignment."""

    def test_consistent_hash(self):
        nodes = ["127.0.0.1:1234", "127.0.0.1:2345", "127.0.0.1:3456"]
        sample_count = 10000
        node_to_count = dict.fromkeys(nodes, 0)
        sample_to_node = {}

        cs_hash = ConsistentHash(nodes)

        def hash_test(ip):
            """Re-hash every sample key of ``ip`` and update the per-node tallies."""
            for idx in range(sample_count):
                key = "{}:{}".format(ip, idx)
                node = cs_hash.get_node(key)
                # A key seen before first gives up its slot on the old node.
                if key in sample_to_node:
                    node_to_count[sample_to_node[key]] -= 1
                sample_to_node[key] = node
                node_to_count[node] += 1

            for node, count in six.iteritems(node_to_count):
                print("node={}, count={}".format(node, count))

        hash_test("1.1.1.1")
        old_node_to_count = copy.deepcopy(node_to_count)
        # test Balance
        for count in node_to_count.values():
            assert count > 3000

        # remove node
        removed = nodes[1]
        print("\nremove node={}".format(removed))

        cs_hash.remove_node(removed)
        hash_test("1.1.1.1")
        # test Monotonicity, remove
        assert 0 == node_to_count[removed]

        # recover node
        print("\nrecover node={}".format(removed))
        cs_hash.add_new_node(removed)
        hash_test("1.1.1.1")
        # test Monotonicity, recover
        assert node_to_count == old_node_to_count

        # add new node
        new_node = "8.8.8.8:8888"
        print("\nadd new node={}".format(new_node))
        nodes.append(new_node)
        node_to_count[new_node] = 0
        cs_hash.add_new_node(new_node)

        hash_test("8.8.8.8")
        # test Balance, Monotonicity
        assert node_to_count[new_node] < 3000


if __name__ == "__main__":
    unittest.main()
82 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_data_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 | from edl.collective.data_reader import DistributedDataReader
17 | from edl.collective.dataset import TxtFileSplitter
18 |
19 |
class TestDataReader(unittest.TestCase):
    """Reads the sample text files through two DistributedDataReader objects."""

    def setUp(self):
        """Index the expected records of every input file by file position."""
        self._file_list = ["./data_server/a.txt", "./data_server/b.txt"]
        # file index -> [(path, (rec_no, splitted_fields)), ...]
        self._data = {}
        for idx, p in enumerate(self._file_list):
            for r in TxtFileSplitter(p):
                self._data.setdefault(idx, []).append((p, (r[0], r[1:])))

    def _make_reader(self):
        """Build a reader over the test file list with a batch size of 1."""
        return DistributedDataReader(
            file_list=self._file_list,
            file_splitter_cls=TxtFileSplitter,
            splitted_data_field=["line"],
            batch_size=1,
        )

    def _check_and_count(self, reader):
        """Validate every batch of ``reader`` and return the number of records.

        ``assertEqual`` is used on purpose: ``assertTrue(a, b)`` treats ``b``
        as the failure message, so it never compared the two values.
        """
        size = 0
        for meta, batch in reader:
            self.assertEqual(meta._size, 1)
            for k, v in meta._batch:
                c = self._data[k._idx]
                self.assertEqual(c[0][0], k._path)
                size += 1
        return size

    def test_data_reader(self):
        """Two readers over the same files must yield the same record count."""
        reader1 = self._make_reader()
        reader2 = self._make_reader()

        size1 = self._check_and_count(reader1)
        size2 = self._check_and_count(reader2)

        self.assertEqual(size1, size2)


if __name__ == "__main__":
    unittest.main()
68 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_distill_reader.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
unset https_proxy http_proxy

version_str=$(python --version 2>&1)
if [[ ${version_str} > "Python 3" ]]; then
    echo "fix me under Python 3"
    exit 0
fi

# Background helpers started below; all of them are killed when the script exits.
pids=()
cleanup() {
    if [[ ${#pids[@]} -gt 0 ]]; then
        kill -9 "${pids[@]}" 2>/dev/null
    fi
}
trap cleanup EXIT

nohup etcd > test_distill_reader_etcd.log 2>&1 &
etcd_pid=$!
pids+=("${etcd_pid}")

# wait etcd start
sleep 10

nohup python -m edl.discovery.register --service_name DistillReaderTest --server 127.0.0.1:2379 > run_discovery_register.log 2>&1 &
register_pid=$!
pids+=("${register_pid}")

nohup python -m edl.distill.discovery_server > run_discovery_server.log 2>&1 &
discovery_pid=$!
pids+=("${discovery_pid}")

# wait discovery start
sleep 5

export PADDLE_DISTILL_BALANCE_TYPE=etcd

export PADDLE_DISTILL_BALANCE_SERVER=127.0.0.1:7001
export PADDLE_DISTILL_SERVICE_NAME=DistillReaderTest
export PADDLE_DISTILL_MAX_TEACHER=4
python distill_reader_test.py
test_status=$?

# Report the test's own status; previously the script returned the status of
# the final `kill`, which hid test failures.
exit ${test_status}
48 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_etcd_client.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -e

nohup etcd > test_etcd_client_etcd.log 2>&1 &
etcd_pid=$!

# Stop etcd on every exit path. Without the trap, `set -e` aborted the script
# on a failing test before the kill was reached and etcd kept running.
cleanup() {
    kill -9 "${etcd_pid}" 2>/dev/null || true
    echo "${etcd_pid}"
}
trap cleanup EXIT

unset https_proxy http_proxy
python -u ./etcd_client_test.py
28 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_file_list.txt:
--------------------------------------------------------------------------------
1 | data_server/a.txt
2 | data_server/b.txt
3 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_launch.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from __future__ import print_function
16 |
17 | from edl.tests.unittests import etcd_test_base
18 | from edl.utils import status as edl_status
19 | from edl.utils.log_utils import logger
20 | from edl.utils import launcher as edl_launcher
21 |
22 |
class TestLauncher(etcd_test_base.EtcdTestBase):
    """Launches a job and expects its status in etcd to be SUCCEED."""

    def setUp(self):
        super(TestLauncher, self).setUp("test_launcher")

    def test_normal_exit(self):
        # NOTE(review): `edl_launcher` is bound by `from edl.utils import
        # launcher as edl_launcher`, i.e. it looks like a module being called
        # as a function -- confirm the intended class inside that module.
        # NOTE(review): `self._pod` is not assigned by this class, nor by the
        # EtcdTestBase.setUp shown in etcd_test_base.py -- confirm its origin.
        launcher = edl_launcher(self._job_env, self._pod, self._etcd, None)
        launcher.init()
        launcher.launch()

        last_status = edl_status.load_job_status_from_etcd(self._etcd)
        # Compare the status directly so a failure reports the actual value
        # rather than the uninformative "True is not false".
        self.assertEqual(
            last_status,
            edl_status.Status.SUCCEED,
            "job:{} did not complete successfully".format(self._job_env.job_id),
        )
        logger.info("job:{} has completed! Need't try!".format(self._job_env.job_id))
39 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_launch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -e
unset https_proxy http_proxy

name=${TEST_TARGET_NAME}
TEST_TIMEOUT=${TEST_TIMEOUT}

# Validate before etcd is started so a configuration error cannot leave a
# stray etcd behind. The variable is named literally: expanding it would only
# print the empty value that triggered the error.
if [[ -z "${TEST_TIMEOUT}" ]]; then
    echo "can't find TEST_TIMEOUT, please set TEST_TIMEOUT first"
    exit 1
fi

# rm flag file
rm -f "${name}"_*.log

nohup etcd > "${name}_etcd.log" 2>&1 &
etcd_pid=$!

echo "etcd_pid:${etcd_pid} ${name}_etcd.log"

# Stop etcd on every exit path (success, failure, or abort by `set -e`).
cleanup() {
    kill -9 "${etcd_pid}" 2>/dev/null || true
    echo "${etcd_pid}"
}
trap cleanup EXIT

# start the unit test; it gets 10 seconds less than TEST_TIMEOUT
run_time=$(( TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"

export PADDLE_JOB_ID="test_success_job"
export PADDLE_ETCD_ENDPOINTS="127.0.0.1:2379"
export PADDLE_EDLNODES_RANAGE="2:2"
export PADDLE_EDL_ONLY_FOR_CE_TEST="1"
export PADDLE_EDL_HDFS_PATH="./success_job"
export PADDLE_EDL_HDFS_HOME="./hadoop"

#clean keys
python del_from_etcd.py

# all success----
export CUDA_VISIBLE_DEVICES=0
export PADDLE_DEMO_EXIT_CODE=0
timeout -s SIGKILL "${run_time}" python -m edl.collective.launch --log_dir 00 launch_demo.py > "${name}_run_00.log" 2>&1 &
pid_00=$!

export CUDA_VISIBLE_DEVICES=1
export PADDLE_DEMO_EXIT_CODE=0
timeout -s SIGKILL "${run_time}" python -m edl.collective.launch --log_dir 01 launch_demo.py > "${name}_run_01.log" 2>&1 &
pid_01=$!

job_flag=True
for pid in $pid_00 $pid_01; do
    echo "wait ${pid}"
    if ! wait "${pid}" ; then
        job_flag=False
    fi
done
#----

# Read the flag only after both launchers have finished, and keep it purely
# informational: a failing etcdctl must not abort the script under `set -e`.
key="/${PADDLE_JOB_ID}/job_flag/nodes/job_status"
value="$(etcdctl get "${key}" 2>&1)" || true
echo "job complete flag:${value}"

if [[ ${job_flag} == "True" ]]; then
    exit 0
fi

echo "cat ${name}_run_00.log"
cat "${name}_run_00.log"

echo "cat ${name}_run_01.log"
cat "${name}_run_01.log"

exit 1
89 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_leader_pod.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | import unittest
17 | from edl.tests.unittests import etcd_test_base
18 | from edl.utils import constants
19 | from edl.utils import leader_pod
20 | from edl.utils import pod as edl_pod
21 | from edl.utils import resource_pods
22 | from edl.utils import cluster_generator
23 |
24 |
class TestLeaderPod(etcd_test_base.EtcdTestBase):
    """Checks that the leader id moves to the second pod once the first stops."""

    def setUp(self):
        super(TestLeaderPod, self).setUp("test_leader_pod")

    def _add_pod(self):
        """Create a pod from the job env together with its two registers.

        Returns:
            tuple: ``(pod, leader_register, resource_register)``.
        """
        new_pod = edl_pod.Pod()
        new_pod.from_env(self._job_env)

        res_reg = resource_pods.Register(
            self._job_env,
            pod_id=new_pod.pod_id,
            pod_json=new_pod.to_json(),
            ttl=constants.ETCD_TTL,
        )
        lead_reg = leader_pod.Register(
            self._job_env,
            new_pod.pod_id,
            cluster_generator=cluster_generator.Generator(
                self._job_env, new_pod.pod_id
            ),
        )
        return new_pod, lead_reg, res_reg

    def _current_leader(self):
        """Fetch the current leader pod id from etcd (15 second timeout)."""
        return leader_pod.get_pod_leader_id(self._etcd, timeout=15)

    def test_seize_leader(self):
        first_pod, first_leader_reg, first_res_reg = self._add_pod()
        time.sleep(constants.ETCD_TTL)
        second_pod, second_leader_reg, second_res_reg = self._add_pod()

        # The pod registered first is expected to hold the leadership.
        self.assertEqual(first_pod.pod_id, self._current_leader())

        # After the first leader register stops, the second pod takes over.
        first_leader_reg.stop()
        time.sleep(constants.ETCD_TTL)
        self.assertEqual(second_pod.pod_id, self._current_leader())

        second_leader_reg.stop()
        for register in (first_res_reg, second_res_reg):
            register.stop()


if __name__ == "__main__":
    unittest.main()
66 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_pod.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import unittest
15 |
16 | from edl.tests.unittests import etcd_test_base
17 | from edl.utils import pod as edl_pod
18 |
19 |
class TestPod(etcd_test_base.EtcdTestBase):
    """Round-trips a Pod through its JSON form."""

    def setUp(self):
        super(TestPod, self).setUp("test_pod")

    def test_pod(self):
        """A Pod rebuilt from another's JSON compares equal to it."""
        source = edl_pod.Pod()
        source.from_env(self._job_env)

        rebuilt = edl_pod.Pod()
        serialized = source.to_json()
        rebuilt.from_json(serialized)

        self.assertEqual(source, rebuilt)


if __name__ == "__main__":
    unittest.main()
35 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_redis_distill_reader.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Proxies must not intercept the loopback connections used below.
unset https_proxy http_proxy

# Start a local redis instance; it is the --db_endpoints target of the
# helper processes started below.
nohup redis-server --port 3456 2>&1 &
redis_pid=$!

# wait for redis to start
sleep 10

# Register the DistillReaderTest service in redis.
nohup python -m edl.distill.redis.server_register \
    --db_endpoints 127.0.0.1:3456 \
    --service_name DistillReaderTest \
    --server 127.0.0.1:3456 > test_redist_distill_reader.1.log 2>&1 &
register_pid=$!

nohup python -m edl.distill.redis.balance_server --db_endpoints 127.0.0.1:3456 > test_redist_distill_reader.2.log 2>&1 &
discovery_pid=$!
# wait for the balance server to start
sleep 10

# Environment read by the reader test.
export PADDLE_DISTILL_BALANCE_SERVER=127.0.0.1:7001
export PADDLE_DISTILL_SERVICE_NAME=DistillReaderTest
export PADDLE_DISTILL_MAX_TEACHER=4
python distill_reader_test.py
# Remember the test result before running any other command: without this
# the script would exit with the status of `kill` and hide test failures.
test_status=$?

# Tear down the helper processes regardless of the test outcome.
kill -9 "$discovery_pid" "$register_pid" "$redis_pid"

exit $test_status
42 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_resource_pods.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import six
15 | import time
16 | import unittest
17 | from edl.tests.unittests import etcd_test_base
18 | from edl.utils import constants
19 | from edl.utils import pod as edl_pod
20 | from edl.utils import resource_pods
21 |
22 |
class TestRegister(etcd_test_base.EtcdTestBase):
    """Checks that resource-pod registrations appear in etcd and later vanish."""

    def setUp(self):
        super(TestRegister, self).setUp("test_register")

    def test_register_resource_pod(self):
        """Register two pods, read them back, stop them, expect an empty table."""
        pod0 = edl_pod.Pod()
        pod0._id = "0"

        pod1 = edl_pod.Pod()
        pod1._id = "1"

        expected = {"0": pod0, "1": pod1}
        ttl = constants.ETCD_TTL

        # Registers are collected as they are created so that the cleanup in
        # ``finally`` never touches a name that was not bound yet (which would
        # replace the real error with a NameError).
        registers = []
        try:
            for pod_id in ("0", "1"):
                registers.append(
                    resource_pods.Register(
                        self._job_env,
                        pod_id=pod_id,
                        pod_json=expected[pod_id].to_json(),
                        ttl=ttl,
                    )
                )

            # check if the ttl is valid: the entries must still be present
            # after more than one ttl period has elapsed
            time.sleep(ttl + 2)

            pods = resource_pods.load_from_etcd(self._etcd, timeout=15)
            self.assertEqual(len(pods), 2)
            for pod_id, pod in six.iteritems(pods):
                self.assertIn(
                    pod_id, expected, "not supported pod_id:{}".format(pod_id)
                )
                self.assertEqual(pod, expected[pod_id])
        finally:
            for reg in registers:
                reg.stop()

        # After the registers stopped, wait past the ttl and expect no entries.
        time.sleep(ttl + 1)
        pods_dict = resource_pods.load_from_etcd(self._etcd, timeout=15)
        self.assertEqual(len(pods_dict), 0)
64 |
65 |
# Run the test cases of this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
68 |
--------------------------------------------------------------------------------
/python/edl/tests/unittests/test_train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import unittest
16 | import edl
17 | from edl.collective.data_reader import DistributedDataReader, FileMeta
18 | from edl.collective.dataset import TxtFileSplitter
19 | from paddle.fluid.incubate.fleet.collective import fleet
20 |
# Module-level values used by the test below: ``learning_rate`` is read by
# adjust() and printed in _train(); the remaining three are handed to
# edl.PaddleState unchanged (all None here).
learning_rate = 1.0
start_program = None
main_program = None
exe = None
25 |
26 |
def adjust():
    """Adjust hook registered on the training state by ``test_data_reader``.

    Multiplies the module-level ``learning_rate`` by ``edl.size()`` and
    discards the product.
    """
    # NOTE(review): the product is only bound to a local, so the module-level
    # ``learning_rate`` is left untouched -- confirm whether a ``global``
    # update was intended.
    scaled_rate = learning_rate * edl.size()  # noqa: F841
29 |
30 |
class TestDataReader(unittest.TestCase):
    """Drives ``DistributedDataReader`` through five epochs with an EDL state."""

    def setUp(self):
        self._file_list = ["./data_server/a.txt", "./data_server/b.txt"]
        # Maps the index of a file in ``_file_list`` to the records read from it.
        self._data = {}
        for idx, p in enumerate(self._file_list):
            s = TxtFileSplitter(p)
            for r in s:
                # The per-file list must be created once and then appended to.
                # The membership test has to look at ``self._data``; testing
                # against an unrelated object either fails or resets the list
                # on every record.
                record = ((p), (r[0], r[1:]))
                # [(path),(rec_no, splitted_fiels)]...
                self._data.setdefault(idx, []).append(record)

    def _train(self, state):
        """Iterate the reader for the remaining epochs and notify EDL."""
        print("learning_rate:", learning_rate)
        reader = DistributedDataReader(
            file_list=self._file_list,
            file_splitter_cls=TxtFileSplitter,
            splitted_data_field=["line"],
            batch_size=1,
            trainer_rank=0,
        )

        # Start from ``state.epoch`` so that a restored state resumes there.
        for _epoch in range(state.epoch, 5):
            for meta, _batch in reader:
                edl.notify_end_one_batch(meta, state)
            edl.notify_end_one_epoch(state)

    def test_data_reader(self):
        fleet.init()
        state = edl.PaddleState(
            exe, start_program, main_program, optimizer=None, batch=0, epoch=0
        )
        state.register_adjust_function([adjust])
        self._train(state)
66 |
67 |
# Run the test cases of this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
70 |
--------------------------------------------------------------------------------
/python/edl/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/python/edl/utils/client.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import grpc
16 |
17 |
class Client(object):
    """Holds an insecure gRPC channel to a single endpoint.

    Args:
        endpoint: target string handed to ``grpc.insecure_channel``.
    """

    def __init__(self, endpoint):
        self._endpoint = endpoint
        # Defined up front so shutdown() is safe before connect().
        self._channel = None

    def connect(self):
        """Create a new channel to the endpoint and remember it."""
        self._channel = grpc.insecure_channel(self._endpoint)

    def shutdown(self):
        """Close the current channel (if any) and forget it.

        Dropping the reference alone leaves the channel's resources alive
        until garbage collection, so the channel is closed explicitly.
        Calling this repeatedly, or before ``connect()``, is a no-op.
        """
        channel = getattr(self, "_channel", None)
        self._channel = None
        if channel is None:
            return

        # Old grpcio channels may lack close(); skip it quietly in that case.
        close = getattr(channel, "close", None)
        if close is not None:
            close()
27 |
--------------------------------------------------------------------------------
/python/edl/utils/constants.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
# Service names; every one of them is passed to ``etcd.remove_service`` by
# clean_etcd() below.
ETCD_POD_RESOURCE = "resource"
ETCD_POD_RANK = "rank"
ETCD_POD_STATUS = "pod_status"
ETCD_JOB_STATUS = "job_status"
ETCD_TRAIN_STATUS = "train_status"
ETCD_CLUSTER = "cluster"
ETCD_READER = "reader"
ETCD_STATE = "state"
ETCD_POD_LEADER = "0"

# Numeric settings for etcd access.
# NOTE(review): presumably all three are in seconds -- confirm against the
# etcd client that consumes them.
ETCD_CONN_TIMEOUT = 6
ETCD_TTL = 15
ETCD_OPERATION_TIMEOUT = 60
28 |
29 |
def clean_etcd(etcd):
    """Call ``etcd.remove_service`` once for each service name of this module.

    Args:
        etcd: object exposing ``remove_service(name)``.
    """
    # Same names and the same order as the module-level constants.
    service_names = (
        ETCD_POD_RESOURCE,
        ETCD_POD_RANK,
        ETCD_POD_STATUS,
        ETCD_JOB_STATUS,
        ETCD_TRAIN_STATUS,
        ETCD_CLUSTER,
        ETCD_READER,
        ETCD_STATE,
        ETCD_POD_LEADER,
    )
    for service_name in service_names:
        etcd.remove_service(service_name)
40 |
--------------------------------------------------------------------------------
/python/edl/utils/data_filter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
class DatatFilter(object):
    """Placeholder filter; none of its methods is implemented yet."""

    def __init__(self, checkpoint):
        """Accept ``checkpoint`` and ignore it (no state is stored)."""

    def is_processed(self, idx, path, record_idx):
        """Not implemented: always returns ``None``."""

    def add_processed(self, idx, path, record_idx):
        """Not implemented: does nothing and returns ``None``."""
25 |
--------------------------------------------------------------------------------
/python/edl/utils/error_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import functools
16 | import time
17 |
18 | from edl.utils import exceptions
19 | from edl.utils.log_utils import logger
20 |
21 |
def handle_errors_until_timeout(f):
    """Decorator that retries ``f`` on ``EdlException`` until a timeout expires.

    The timeout is taken from the ``timeout`` argument of the call. It is
    resolved the same way Python binds the call, so a keyword argument, a
    positional argument and the default value declared by ``f`` all work.
    If ``f`` has no usable ``timeout`` the first error is re-raised.

    Behavior of the wrapper:
        * the result of ``f`` is returned as soon as a call succeeds;
        * ``EdlDataEndError`` is never retried and propagates unchanged;
        * any other ``EdlException`` is retried every 3 seconds and re-raised
          once ``timeout`` seconds have elapsed since the first attempt.
    """
    # Local import keeps the new dependency inside this block.
    import inspect

    @functools.wraps(f)
    def handler(*args, **kwargs):
        if "timeout" in kwargs:
            timeout = kwargs["timeout"]
        else:
            # Covers positional use and the default declared by ``f``.
            timeout = inspect.getcallargs(f, *args, **kwargs).get("timeout")
        if timeout is None:
            timeout = 0

        begin = time.time()
        while True:
            try:
                return f(*args, **kwargs)
            except exceptions.EdlDataEndError:
                # Bare raise keeps the original message and traceback.
                raise
            except exceptions.EdlException:
                if time.time() - begin >= timeout:
                    logger.warning("{} execute timeout:{}".format(f.__name__, timeout))
                    raise

                time.sleep(3)

    return handler
40 |
--------------------------------------------------------------------------------
/python/edl/utils/etcd_db.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from ..discovery.etcd_client import EtcdClient
15 |
# Process-wide etcd client created lazily by get_global_etcd().
g_etcd = None


def get_global_etcd(etcd_endpoints=None, job_id=None):
    """Return the shared ``EtcdClient``, creating it on first use.

    Args:
        etcd_endpoints: endpoints for the client; required only while no
            client exists yet.
        job_id: used as the ``root`` of the client; required only while no
            client exists yet.

    Returns:
        The shared client. Once it exists, the arguments are ignored.

    Raises:
        AssertionError: no client exists and an argument is missing.
    """
    global g_etcd
    if g_etcd is not None:
        return g_etcd

    assert (
        etcd_endpoints is not None and job_id is not None
    ), "etcd_endpoints and job_id are required to create the global etcd client"

    client = EtcdClient(endpoints=etcd_endpoints, root=job_id, timeout=6)
    client.init()
    # Publish only after init() succeeded; otherwise a failed init would
    # leave an unusable client cached for every later caller.
    g_etcd = client
    return g_etcd
28 |
--------------------------------------------------------------------------------
/python/edl/utils/etcd_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from edl.utils import constants
16 |
17 |
def get_train_status_table_key(self, server_name):
    """Return the full etcd path of ``server_name`` in the train-status service.

    Args:
        self: object whose ``_etcd`` attribute provides ``get_full_path``.
        server_name: second argument forwarded to ``get_full_path``.
    """
    etcd = self._etcd
    service = constants.ETCD_TRAIN_STATUS
    return etcd.get_full_path(service, server_name)
20 |
21 |
def get_cluster_table_key(self):
    """Return the full etcd path of the cluster entry.

    ``constants.ETCD_CLUSTER`` is used both as the service and as the server
    name passed to ``self._etcd.get_full_path``.
    """
    cluster = constants.ETCD_CLUSTER
    etcd = self._etcd
    return etcd.get_full_path(cluster, cluster)
24 |
25 |
def get_rank_table_key(self):
    """Return the full etcd path of the leader entry in the rank service.

    Forwards ``constants.ETCD_POD_RANK`` and ``constants.ETCD_POD_LEADER`` to
    ``self._etcd.get_full_path``.
    """
    service = constants.ETCD_POD_RANK
    leader = constants.ETCD_POD_LEADER
    return self._etcd.get_full_path(service, leader)
28 |
--------------------------------------------------------------------------------
/python/edl/utils/exceptions.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import sys
16 |
17 | from edl.utils import common_pb2
18 |
19 |
class EdlException(Exception):
    """Base class of every exception type declared in this module."""


class EdlStopIteration(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlRegisterError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlBarrierError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlUnkownError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlRankError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlInternalError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlWaitFollowersReleaseError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlLeaderError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlGenerateClusterError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlTableError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlEtcdIOError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlDataEndError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlPodIDNotExistError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlReaderNameError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlFileListNotMatchError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlDataGenerateError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""


class EdlNotLeaderError(EdlException):
    """Marker subclass of ``EdlException``; adds no behavior."""
90 |
91 |
def deserialize(pb_status):
    """Raise the exception described by ``pb_status``; never returns normally.

    Args:
        pb_status: object with ``type`` (name of an exception class of this
            module) and ``detail`` (message passed to its constructor).

    Raises:
        EdlException subclass: the class named by ``pb_status.type``, built
            from ``pb_status.detail``.
        Exception: ``pb_status.type`` does not name an ``EdlException``
            subclass of this module.
    """
    thismodule = sys.modules[__name__]
    exc_type = getattr(thismodule, pb_status.type, None)

    # Only exception classes of this module may be instantiated. Without this
    # check any module attribute named by the peer would be called, and a
    # callable that is not an exception class ends in an unrelated TypeError.
    if not (isinstance(exc_type, type) and issubclass(exc_type, EdlException)):
        raise Exception(
            "type:{} detail:{} meets error:{}".format(
                pb_status.type,
                pb_status.detail,
                "not an EdlException type of this module",
            )
        )

    raise exc_type(pb_status.detail)
103 |
104 |
def serialize_to_pb_status(exception):
    """Build a ``common_pb2.Status`` describing ``exception``.

    Returns:
        A new status whose ``type`` is the exception's class name and whose
        ``detail`` is ``str(exception)``.
    """
    status = common_pb2.Status()
    status.type = type(exception).__name__
    status.detail = str(exception)
    return status
110 |
111 |
def serialize(pb_response, exception, stack_info=None):
    """Write ``exception`` into ``pb_response.status`` in place.

    Args:
        pb_response: object with a ``status`` attribute that accepts ``type``
            and ``detail`` assignments.
        exception: its class name becomes ``type``, ``str()`` of it ``detail``.
        stack_info: optional text appended to ``detail`` without a separator.
    """
    pb_response.status.type = exception.__class__.__name__

    suffix = "" if stack_info is None else stack_info
    pb_response.status.detail = str(exception) + suffix
118 |
--------------------------------------------------------------------------------
/python/edl/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
def read_txt_lines(file_list):
    """Read the non-blank lines of the text file at path ``file_list``.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped and do not consume a number.

    Returns:
        ``[(line, line_no), ...]`` where ``line_no`` counts the kept lines
        starting at 0.
    """
    with open(file_list, "r") as f:
        stripped_lines = (raw.strip() for raw in f)
        kept_lines = (text for text in stripped_lines if len(text) > 0)
        return [(text, number) for number, text in enumerate(kept_lines)]
31 |
--------------------------------------------------------------------------------
/python/edl/utils/json_serializable.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import json
15 | import six
16 |
17 |
class SerializableBase(object):
    """Interface of objects that convert to and from a JSON string."""

    def to_json(self):
        raise NotImplementedError

    def from_json(self, json_str=None):
        # Accepts the JSON text like every implementation does, so that a
        # missing override is reported as NotImplementedError instead of a
        # TypeError about the argument count.
        raise NotImplementedError


class Serializable(SerializableBase):
    """Serializes the instance ``__dict__`` to JSON and restores it again.

    Only attributes that already exist on the instance are restored, so a
    subclass has to create its attributes before calling ``from_json``.
    """

    def to_json(self, filter_names=None):
        """Return the instance attributes as a JSON object string.

        Args:
            filter_names: optional container of attribute names to leave out.

        Attributes that are themselves ``SerializableBase`` objects are
        stored as the JSON string returned by their own ``to_json``.
        """
        skipped = () if filter_names is None else filter_names

        d = {}
        for k, v in six.iteritems(self.__dict__):
            if k in skipped:
                continue

            d[k] = v.to_json() if isinstance(v, SerializableBase) else v

        return json.dumps(d)

    def from_json(self, json_str):
        """Update the existing attributes from ``json_str`` and return ``self``.

        Keys of the JSON object that are not attributes of the instance are
        ignored; attributes missing from the JSON keep their value.
        """
        d = json.loads(json_str)
        # Iterate over a snapshot of the names: values are replaced below.
        for k in list(self.__dict__):
            if k not in d:
                continue

            current = self.__dict__[k]
            if isinstance(current, SerializableBase) and isinstance(
                d[k], six.string_types
            ):
                # Mirror of to_json(): a nested serializable was written as
                # its own JSON string, so it is restored through its own
                # from_json() instead of being replaced by that string.
                current.from_json(d[k])
                continue

            self.__dict__[k] = d[k]

        return self

    def __eq__(self, other):
        # Objects without attributes (None, numbers, ...) are never equal.
        if other is None or not hasattr(other, "__dict__"):
            return False

        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return self.to_json()
62 |
--------------------------------------------------------------------------------
/python/edl/utils/log_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
# Shared logger object imported by other modules; it is looked up under the
# name "root". Propagation is switched off so its records are handled only by
# handlers attached to this logger itself.
logger = logging.getLogger("root")
logger.propagate = False
19 |
20 |
def get_logger(log_level, name="root"):
    """Return the logger ``name`` set to ``log_level`` with a stream handler.

    Args:
        log_level: level passed to ``Logger.setLevel``.
        name: logger name passed to ``logging.getLogger``.

    The formatted stream handler is attached only once per logger. Repeated
    calls still update the level, but they no longer stack additional
    handlers, which would emit every record several times.
    """
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    # The marker attribute tells our handler apart from handlers that other
    # code may have attached to the same logger.
    for existing in logger.handlers:
        if getattr(existing, "_edl_stream_handler", False):
            return logger

    log_handler = logging.StreamHandler()
    log_format = logging.Formatter(
        "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
    )
    log_handler.setFormatter(log_format)
    log_handler._edl_stream_handler = True
    logger.addHandler(log_handler)

    return logger
33 |
--------------------------------------------------------------------------------
/python/edl/utils/network_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import socket
16 |
17 | from contextlib import closing
18 |
19 |
def get_extern_ip():
    """Return the address that the local host name resolves to."""
    local_name = socket.gethostname()
    return socket.gethostbyname(local_name)
22 |
23 |
def get_host_name_ip():
    """Return ``(host_name, host_ip)`` of the local machine.

    ``host_ip`` is whatever ``socket.gethostbyname`` resolves the host name to.
    """
    name = socket.gethostname()
    return name, socket.gethostbyname(name)
28 |
29 |
def find_free_ports(num):
    """Find ``num`` distinct TCP ports that are currently free.

    Args:
        num: number of ports wanted.

    Returns:
        A set with ``num`` port numbers, or ``None`` when ``num <= 0`` or when
        too many probes returned an already collected port.

    Each port is found by binding a socket to port 0 and closing it again, so
    a returned port is only known to have been free at probing time.
    """
    if num <= 0:
        return None

    def _probe_free_port():
        # Binding to port 0 lets the OS pick an unused port.
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("", 0))
            return s.getsockname()[1]

    port_set = set()
    # Only probes that yield a duplicate count against the budget. Counting
    # every probe would make any request for more than ~100 ports fail even
    # though each single probe succeeded.
    duplicates = 0
    while len(port_set) < num:
        port = _probe_free_port()
        if port not in port_set:
            port_set.add(port)
            continue

        duplicates += 1
        if duplicates > 100:
            print("can't find avilable port and use the specified static port now!")
            return None

    return port_set
55 |
--------------------------------------------------------------------------------
/python/edl/utils/pb_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 |
17 |
def record_to_string(rec):
    """Summarize a record as its ``record_no`` and the length of ``field_data``."""
    template = "record_no:{} fields_len:{}"
    field_count = len(rec.field_data)
    return template.format(rec.record_no, field_count)
20 |
21 |
def batch_data_response_to_string(res):
    """Render every batch in ``res.data`` as JSON, joined by ``";"``.

    Each batch becomes a JSON object with the keys ``batch_data_id``,
    ``records_num`` and ``records``; the latter is the comma-joined
    ``record_to_string`` summary of the batch's records.

    Returns:
        The joined string; empty when ``res.data`` is empty.
    """
    r = []
    for data in res.data:
        records_str = [record_to_string(rec) for rec in data.records]

        s = {}
        s["batch_data_id"] = data.batch_data_id
        s["records_num"] = len(data.records)
        s["records"] = ",".join(records_str)
        r.append(json.dumps(s))

    # The method is ``join``; the misspelled call raised AttributeError on
    # every invocation.
    return ";".join(r)
37 |
38 |
def batch_data_meta_response_to_string(res):
    """Return ``str()`` of every item in ``res.data``, joined by commas."""
    return ",".join([str(item) for item in res.data])
45 |
--------------------------------------------------------------------------------
/python/edl/utils/pod_server_client.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | from edl.utils import client as client_base
17 | from edl.utils import cluster as edl_cluster
18 | from edl.utils import exceptions
19 | from edl.utils import pod_server_pb2
20 | from edl.utils import pod_server_pb2_grpc
21 | from edl.utils.log_utils import logger
22 |
23 |
class Client(client_base.Client):
    """Client of the pod server built on the base channel holder."""

    def __init__(self, endpoint):
        super(Client, self).__init__(endpoint)

    def connect(self):
        """Open the channel, create the ``PodServerStub`` and return both."""
        super(Client, self).connect()
        self._stub = pod_server_pb2_grpc.PodServerStub(self._channel)
        return self._channel, self._stub

    def shutdown(self):
        """Shut the base client down and drop the stub."""
        super(Client, self).shutdown()
        self._stub = None

    def barrier(self, job_id, pod_id, timeout=15):
        """
        Call ``Barrier`` once per second until it succeeds or ``timeout``
        seconds have passed.

        Returns the cluster parsed from the response of the first call whose
        status type is empty. After the timeout, the status of the last
        response is handed to ``exceptions.deserialize``.
        """
        request = pod_server_pb2.BarrierRequest()
        request.job_id = job_id
        request.pod_id = pod_id

        _, stub = self.connect()
        started_at = time.time()
        cluster = edl_cluster.Cluster()
        while True:
            response = stub.Barrier(request)

            # An empty status type marks a successful barrier.
            succeeded = response.status.type == ""
            if succeeded:
                cluster.from_json(response.cluster_json)
                logger.debug("pod client get cluster:{}".format(cluster))
                logger.info("barrier ok!")
                return cluster

            elapsed = time.time() - started_at
            if elapsed > timeout:
                logger.info(
                    "job_id:{} pod_id:{} barrier time out".format(job_id, pod_id)
                )
                exceptions.deserialize(response.status)

            time.sleep(1)
61 |
--------------------------------------------------------------------------------
/python/edl/utils/process.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import multiprocessing
16 | import threading
17 |
18 | from edl.utils.log_utils import logger
19 |
20 |
class ProcessWrapper(object):
    """Base class that runs ``_worker_func`` in a ``multiprocessing.Process``.

    Subclasses implement ``_worker_func``. ``stop()`` sets the stop event and
    joins the worker. Instances can be used as context managers; leaving the
    ``with`` block calls ``stop()``.
    """

    def __init__(self):
        self._stop = multiprocessing.Event()
        self._lock = threading.Lock()
        self._worker = multiprocessing.Process(target=self._worker_func)

    def _worker_func(self):
        raise NotImplementedError

    def start(self):
        self._worker.start()

    def stop(self):
        """Set the stop event, join the worker if it was started, drop it."""
        self._stop.set()
        with self._lock:
            if self._worker is not None:
                # A process that was never started has no pid and must not be
                # joined: join() rejects processes that were not started.
                if self._worker.pid is not None:
                    self._worker.join()
                self._worker = None

        logger.info("{} exit".format(self.__class__.__name__))

    def is_stopped(self):
        """Return True once ``stop()`` has dropped the worker."""
        with self._lock:
            return self._worker is None

    def __enter__(self):
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        # The context-manager protocol passes three arguments; the defaults
        # keep a direct call without arguments working.
        self.stop()
52 |
--------------------------------------------------------------------------------
/python/edl/utils/resource_pods.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from edl.utils import constants
16 | from edl.utils import error_utils
17 | from edl.utils import pod
18 | from edl.utils import register
19 | from edl.utils import string_utils
20 | from edl.utils import exceptions
21 | from edl.utils.log_utils import logger
22 |
23 |
class Register(register.Register):
    """Registers one pod under the ``ETCD_POD_RESOURCE`` service.

    Args:
        job_env: provides ``etcd_endpoints`` and ``job_id``.
        pod_id: used (as a string) for the server name of the entry.
        pod_json: value stored for the entry.
        ttl: forwarded to the base register.
    """

    def __init__(self, job_env, pod_id, pod_json, ttl=constants.ETCD_TTL):
        # Kept only for the log message written by stop().
        self._resource_pod_id = pod_id

        service = constants.ETCD_POD_RESOURCE
        server = "{}".format(pod_id)
        value = pod_json

        super(Register, self).__init__(
            etcd_endpoints=job_env.etcd_endpoints,
            job_id=job_env.job_id,
            service=service,
            server=server,
            info=value,
            ttl=ttl,
        )

    def stop(self):
        super(Register, self).stop()
        # The placeholder was previously logged literally as "{}".
        logger.info("pod:{} resource_register stopped".format(self._resource_pod_id))
42 |
43 |
@error_utils.handle_errors_until_timeout
def load_from_etcd(etcd, timeout=15):
    """Load every pod stored under the ``ETCD_POD_RESOURCE`` service.

    Args:
        etcd: object whose ``get_service`` returns entries with an ``info``
            attribute holding the pod JSON.
        timeout: not used by the body; it is read by the retry decorator.

    Returns:
        dict mapping each pod's ``get_id()`` to the loaded ``pod.Pod``.
    """
    pods = {}
    for server in etcd.get_service(constants.ETCD_POD_RESOURCE):
        pod_json = string_utils.bytes_to_string(server.info)
        loaded = pod.Pod()
        loaded.from_json(pod_json)
        pods[loaded.get_id()] = loaded

    return pods
55 |
56 |
@error_utils.handle_errors_until_timeout
def wait_resource(etcd, pod_id, timeout=15):
    """Check that no pod other than ``pod_id`` is registered as a resource.

    Args:
        etcd: client handed to ``load_from_etcd``.
        pod_id: id of the pod that is allowed to remain registered.
        timeout: forwarded to ``load_from_etcd``; also read by the retry
            decorator wrapped around this function.

    Returns:
        True when no pod is registered, or only ``pod_id`` is.

    Raises:
        EdlWaitFollowersReleaseError: other pods are still registered.
    """
    pods = load_from_etcd(etcd, timeout=timeout)
    if len(pods) == 0:
        return True

    if len(pods) == 1 and pod_id in pods:
        return True

    # list() keeps the message free of the ``dict_keys(...)`` wrapper.
    raise exceptions.EdlWaitFollowersReleaseError(
        "can't wait all resource exit:{}".format(list(pods.keys()))
    )
72 |
--------------------------------------------------------------------------------
/python/edl/utils/string_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
def dataset_to_string(o):
    """
    Render a FileMeta-like object as text.

    The result holds ``o.idx_in_list``, ``o.file_path`` and one
    ``(record_no:N)`` entry for every record number in each range of
    ``o.records`` (``begin`` to ``end``, both inclusive).
    """
    header = "idx_in_list:{}, file_path:{}".format(o.idx_in_list, o.file_path)

    # Flatten every inclusive [begin, end] range into its record numbers.
    records = "".join(
        "(record_no:{})".format(rec_no)
        for rs in o.records
        for rec_no in range(rs.begin, rs.end + 1)
    )

    return header + " record:[" + records + "]"
29 |
30 |
def data_request_to_string(o):
    """
    Render a DataMeta-like object as text.

    The result holds ``o.idx_in_list`` and ``o.file_path`` followed by one
    `` chunk:[...]`` section per entry of ``o.chunks``, each formatted by
    ``chunk_to_string``.
    """
    pieces = ["idx_in_list:{} file_path:{}".format(o.idx_in_list, o.file_path)]
    for chunk in o.chunks:
        pieces.append(" chunk:[" + chunk_to_string(chunk) + "]")

    return "".join(pieces)
42 |
43 |
def chunk_to_string(rs):
    """
    Render a chunk as its status followed by one ``(record_no:N) `` entry
    for every record number from ``rs.meta.begin`` to ``rs.meta.end``
    (both inclusive).
    """
    parts = ["status:{} ".format(rs.status)]
    parts.extend(
        "(record_no:{}) ".format(rec_no)
        for rec_no in range(rs.meta.begin, rs.meta.end + 1)
    )

    return "".join(parts)
50 |
51 |
def bytes_to_string(o, codec="utf-8"):
    """
    Return ``o`` as text.

    ``None`` and ``str`` values are returned unchanged; anything else is
    decoded with ``o.decode(codec)``.
    """
    if o is None or isinstance(o, str):
        return o

    return o.decode(codec)
60 |
--------------------------------------------------------------------------------
/python/edl/utils/train_status.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import enum
16 | import json
17 | from edl.utils import constants
18 | from edl.utils import error_utils
19 |
20 |
@enum.unique
class TrainStatus(enum.IntEnum):
    """Training states, stored in etcd as their integer value.

    Every member must have a distinct value. ``NEARTHEEND`` used to share
    the value 3 with ``SUCCEED``, which made ``SUCCEED`` a mere alias of
    ``NEARTHEEND`` and the two states indistinguishable. ``enum.unique``
    now rejects any such duplicate at import time.
    """

    INITIAL = 0
    RUNNING = 1
    # Was 3 (a duplicate of SUCCEED). A stored 3 still reads back as the same
    # member as SUCCEED, exactly as it did before this change.
    NEARTHEEND = 2
    SUCCEED = 3
    FAILED = 4
27 |
28 |
@error_utils.handle_errors_until_timeout
def save_to_etcd(etcd, pod_id, status, timeout=30):
    """Store a pod's train status in etcd as JSON ``{"status": <int>}``.

    Args:
        etcd: client object exposing ``set_server_permanent``.
        pod_id: used as the server name under ``constants.ETCD_TRAIN_STATUS``.
        status: value convertible with ``int()``.
        timeout: not read in this body; presumably consumed by the
            ``handle_errors_until_timeout`` decorator -- TODO confirm.
    """
    etcd.set_server_permanent(
        constants.ETCD_TRAIN_STATUS,
        pod_id,
        json.dumps({"status": int(status)}),
    )
35 |
36 |
@error_utils.handle_errors_until_timeout
def load_from_etcd(etcd, pod_id, timeout=30):
    """Read a pod's train status from etcd.

    Args:
        etcd: client object exposing ``get_value``.
        pod_id: server name looked up under ``constants.ETCD_TRAIN_STATUS``.
        timeout: not read in this body; presumably consumed by the
            ``handle_errors_until_timeout`` decorator -- TODO confirm.

    Returns:
        The ``"status"`` field of the stored JSON document, or None when
        nothing is stored for ``pod_id``.
    """
    value = etcd.get_value(constants.ETCD_TRAIN_STATUS, pod_id)

    if value is None:
        return None

    # json.loads parses an in-memory str/bytes document; json.load (used
    # before) requires a file-like object with .read() and fails here.
    d = json.loads(value)
    return d["status"]
46 |
--------------------------------------------------------------------------------
/python/edl/utils/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import uuid
16 | from edl.utils import json_serializable
17 |
18 |
class Trainer(json_serializable.Serializable):
    """Description of one trainer: id, rank within its pod, GPUs, endpoint
    and global rank."""

    def __init__(self):
        self._id = None
        self._rank_in_pod = None
        self._gpus = []
        self._endpoint = None
        self._global_rank = None

    def __str__(self):
        # Reads self._id; the previous code read the non-existent self._ids
        # and raised AttributeError whenever a Trainer was printed.
        s = "id:{} rank_in_pod:{} gpus:{} endpoint:{} global_rank:{}".format(
            self._id, self._rank_in_pod, self._gpus, self._endpoint, self._global_rank
        )

        return s

    @property
    def global_rank(self):
        """Global rank of the trainer; None until it is assigned."""
        return self._global_rank

    @property
    def rank_in_pod(self):
        """Rank of the trainer inside its pod."""
        return self._rank_in_pod

    @property
    def gpus(self):
        """GPUs assigned to the trainer."""
        return self._gpus

    @property
    def endpoint(self):
        """Endpoint of the trainer."""
        return self._endpoint

    def from_pod(self, endpoint, rank_in_pod, gpus):
        """Initialize the trainer from values supplied by its pod.

        A fresh id is generated with ``uuid.uuid1()`` and the global rank is
        reset to None.

        Args:
            endpoint: stored as the trainer's endpoint.
            rank_in_pod: stored as the trainer's rank inside the pod.
            gpus: stored as the trainer's GPUs.
        """
        self._id = str(uuid.uuid1())
        self._global_rank = None
        self._rank_in_pod = rank_in_pod
        self._endpoint = endpoint
        self._gpus = gpus
56 |
--------------------------------------------------------------------------------
/python/edl/utils/unique_name.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import collections
16 |
17 |
class UniqueNameGenerator(object):
    """
    Produce unique names by appending a per-key counter to the key.

    Args:
        prefix(str): Text placed in front of every generated name. ``None``
            is treated as the empty string.
    """

    def __init__(self, prefix=None):
        self.prefix = "" if prefix is None else prefix
        # Next counter value per key; unseen keys start at 0.
        self.ids = collections.defaultdict(int)

    def __call__(self, key):
        """
        Return the next unique name for ``key``.

        Args:
            key(str): Base of the generated name.

        Returns(str): ``prefix`` followed by ``key``, an underscore and the
            number of names previously generated for ``key``.
        """
        index = self.ids[key]
        self.ids[key] = index + 1
        suffix = "_".join((key, str(index)))
        return self.prefix + suffix
45 |
46 |
# Module-level generator shared by every call to generate(); it is created
# without a prefix, so generated names start with the key itself.
generator = UniqueNameGenerator()


def generate(key):
    """Return the next unique name for ``key`` from the module-level generator."""
    return generator(key)
52 |
--------------------------------------------------------------------------------
/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# On exit, kill every background job of this shell (the script starts etcd
# in the background further down).
trap 'kill -9 $(jobs -p)' EXIT
unset https_proxy http_proxy

set -e
# Exactly one argument is required: the Python version to build with.
if [[ $# != 1 ]] ; then
    echo "must set python version" >&2
    # A usage error must not report success to the caller.
    exit 1
fi

unset GREP_OPTIONS
# Work from the parent of the directory that contains this script.
BASEDIR="$(dirname "$(readlink -f "${0}")")"
cd "${BASEDIR}"/..
29 |
# gen_env PY_VERSION
# Put a directory holding a `python` symlink to `pythonPY_VERSION` at the
# front of PATH, so that plain `python` resolves to the requested version.
gen_env() {
    py_version=$1
    old_path=${PATH}
    python_path="$(which "python${py_version}")"

    tmp_path="/tmp/edl-build/python${py_version}/bin"
    mkdir -p "${tmp_path}"

    # Recreate the symlink from scratch on every run.
    rm -f "${tmp_path}/python"
    ln -s "${python_path}" "${tmp_path}/python"

    export PATH="${tmp_path}:${old_path}"
    echo "current path:${PATH}"
}
43 |
# Point `python` at the requested version for the rest of the build.
py_version=$1
gen_env "$py_version"

# check python version
# The reported version must start with "Python <py_version>".
which python
version_str=$(python --version 2>&1)
echo "python version:${version_str}"
if [[ ${version_str} != "Python ${py_version}"* ]]; then
    echo "${version_str} not valid for argument:${py_version}"
    exit 1
fi


# Run the generator script in the protos directory.
pushd python/edl/protos/
bash generate.sh
popd

# Start from an empty build directory.
build_dir=build
rm -rf ${build_dir}
mkdir -p ${build_dir}/cmd/master/
# TODO(gongwb): add them on async training
# go
#go build -o build/cmd/master/master cmd/master/master.go

# Start etcd in the background, logging to build_etcd.log; the EXIT trap set
# at the top of this script kills it when the script ends.
nohup etcd > "build_etcd.log" 2>&1 &

#build python
pushd ${build_dir}
cmake .. -DPY_VERSION="${py_version}"
make clean && make -j
# NOTE(review): -R is given no pattern here; confirm that the ctest version
# in use accepts this and still runs every test.
ctest -V -R
popd

#test all go test
#go test --cover ./...
79 |
--------------------------------------------------------------------------------
/scripts/custom-boilerplate.go.txt:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright (c) 2016 PaddlePaddle Authors All Rights Reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
--------------------------------------------------------------------------------
/scripts/download_etcd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -e
ETCD_VER=v3.4.7

# Location the etcd release tarball is downloaded from.
DOWNLOAD_URL=https://paddle-edl.bj.bcebos.com/etcd-${ETCD_VER}-linux-amd64.tar.gz

# Scratch paths used while downloading and unpacking.
archive=/tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz
unpack_dir=/tmp/etcd-download-test

# Remove leftovers of an earlier run.
rm -f "${archive}"
rm -rf "${unpack_dir}" && mkdir -p "${unpack_dir}"

wget -q "${DOWNLOAD_URL}" -O "${archive}"
tar xzvf "${archive}" -C "${unpack_dir}" --strip-components=1
rm -f "${archive}"

# Check that both binaries run before installing them.
"${unpack_dir}/etcd" --version
"${unpack_dir}/etcdctl" version

mv "${unpack_dir}/etcd" /usr/local/bin/
mv "${unpack_dir}/etcdctl" /usr/local/bin/

rm -rf "${unpack_dir}"
37 |
--------------------------------------------------------------------------------
/scripts/run_build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
# Stop at the first failing command.
set -e
unset GREP_OPTIONS
# Directory containing this script, resolved with readlink -f.
BASEDIR=$(dirname "$(readlink -f "${0}")")


echo "base_dir:${BASEDIR}"
cd "${BASEDIR}"

# 2.7 is deprecated
# ./build.sh 2.7
27 |
# Print the code-style failure hint on stderr and exit with status 1.
abort() {
    {
        echo "Your change doesn't follow Edl's code style."
        echo "Please use pre-commit to check what is wrong."
    } >&2
    exit 1
}
33 |
34 |
# Run pre-commit on every file that differs from upstream/develop.
# While the check runs, an EXIT trap calls abort, so any command that makes
# the script exit here (not only a style failure) prints the code-style hint.
# The trap is replaced with a no-op once everything has passed.
check_style() {
    trap abort EXIT

    # Recreate the `upstream` remote; failures of these two git commands
    # are tolerated.
    set +e
    upstream_url='https://github.com/elasticdeeplearning/edl'
    git remote remove upstream
    git remote add upstream "${upstream_url}"
    set -e
    git fetch upstream develop

    pre-commit install
    changed_files="$(git diff --name-only upstream/develop)"
    echo "${changed_files}" | xargs pre-commit run --files

    trap : EXIT
}
51 |
52 | pushd "${BASEDIR}/../"
53 | check_style
54 | popd
55 |
56 |
57 | ./build.sh 3.7
58 |
--------------------------------------------------------------------------------