├── .dockerignore ├── .github ├── issue_template.md └── pull_request_template.md ├── .gitignore ├── .pre-commit-config.yaml ├── .tools ├── codestyle │ ├── .gitignore │ ├── clang_format.hook │ ├── copyright.py │ └── docstring_checker.py └── test_runner.py ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── FAQ.md ├── LICENSE ├── OWNERS.md ├── README.md ├── RELEASE.md ├── cmake ├── python.cmake └── python_module.cmake ├── doc ├── Elastic Deep Learning Survey.pdf ├── ROADMAP.md ├── SUPPORT.md ├── boss_tutorial.md ├── build.md ├── checkpoint_based_edl.gif ├── distill.gif ├── edl_collective_design_doc.md ├── edl_collective_design_doc_cn.md ├── edl_design_doc.md ├── edl_design_doc_cn.md ├── edl_distill_design_doc.md ├── edl_live_fault_tolerance.md ├── experiment │ └── distill_resnet50.md ├── fault_tolerance.md ├── fault_tolerance_cn.md ├── images │ ├── edl-arch.png │ ├── launcher.png │ └── trainer.png ├── install.md └── usage.md ├── docker ├── Dockerfile ├── Dockerfile.runtime ├── README.md ├── build-devel.sh ├── build-runtime.sh ├── dev_requirements.txt ├── release-run-time.sh └── requirements.txt ├── example ├── collective │ └── resnet50 │ │ ├── dali.py │ │ ├── models │ │ ├── __init__.py │ │ ├── resnet.py │ │ └── vgg.py │ │ ├── scripts │ │ └── train_gpu.sh │ │ ├── train_pretrain.sh │ │ ├── train_with_fleet.py │ │ └── utils │ │ ├── __init__.py │ │ ├── fp16_utils.py │ │ ├── img_tool.py │ │ ├── learning_rate.py │ │ ├── reader_cv2.py │ │ └── utility.py ├── ctr │ ├── Dockerfile │ ├── README │ ├── ctr │ │ ├── dumper.py │ │ ├── kvtool.py │ │ ├── save_program.py │ │ └── train.py │ ├── deploy_ctr_on_baidu_cloud_cn.rst │ ├── k8s │ │ ├── ctr.yaml │ │ ├── cube.yaml │ │ ├── ftp.yaml │ │ ├── paddle-suite.sh │ │ ├── paddle-suite.yaml │ │ ├── pdclient.yaml │ │ ├── pdserving.yaml │ │ └── transfer.yaml │ ├── ps-train │ │ ├── pserver.yaml │ │ └── trainer.yaml │ ├── script │ │ ├── README │ │ ├── ctr.yaml │ │ ├── cube.yaml │ │ ├── defaultserviceaccountclusterrole.yaml │ │ ├── 
fileserver.yaml │ │ ├── ftp.yaml │ │ ├── paddle-suite.sh │ │ ├── paddle-suite.yaml │ │ ├── pdclient.yaml │ │ ├── pdserving.yaml │ │ └── transfer.yaml │ └── src │ │ ├── baidu_cloud │ │ ├── cluster-info.png │ │ ├── concole.png │ │ ├── conf-download.png │ │ ├── ctr-models.png │ │ ├── ctr-prediction-end-to-end-deployment.png │ │ ├── ctr-running.png │ │ ├── eip.png │ │ ├── file_server.png │ │ ├── helm-version.png │ │ ├── kubectl-version.png │ │ ├── load_balancer.png │ │ ├── pserver-log.png │ │ ├── tiller.png │ │ ├── trainer-log.png │ │ ├── volcano.png │ │ ├── wget_example.png │ │ └── workload.png │ │ ├── create_gpu_machine.png │ │ ├── create_image.png │ │ ├── create_more_nodes.png │ │ ├── ctr.png │ │ ├── ctr_kubectl_download.png │ │ ├── ctr_node.png │ │ ├── ctr_pods.png │ │ ├── ctr_pserver_log.png │ │ ├── ctr_trainer_log.png │ │ ├── ctr_volcano_install.png │ │ ├── ctryaml1.png │ │ ├── ctryaml2.png │ │ ├── ctryaml3.png │ │ ├── cube.png │ │ ├── cube_config1.png │ │ ├── cube_config2.png │ │ ├── dist_train_demo.py │ │ ├── dist_train_nccl2.graffle │ │ ├── dist_train_nccl2.png │ │ ├── dist_train_pserver.graffle │ │ ├── dist_train_pserver.png │ │ ├── file_server_pod.png │ │ ├── file_server_svc.png │ │ ├── overview.png │ │ ├── paddleclient.png │ │ ├── paddleserving_pod.png │ │ ├── paddleserving_svc.png │ │ ├── parallelism.png │ │ ├── pyreader.png │ │ ├── release.png │ │ └── transfer.png ├── demo │ └── collective │ │ ├── README.md │ │ ├── env.sh │ │ ├── resnet50 │ │ └── package.sh │ │ ├── start_job_client.sh │ │ └── start_job_server.sh ├── distill │ ├── README.md │ ├── k8s │ │ ├── balance.yaml │ │ ├── edl_k8s │ │ ├── etcd.yaml │ │ ├── student.yaml │ │ └── teacher.yaml │ ├── mnist_distill │ │ ├── README_CN.md │ │ ├── image │ │ │ └── infer_3.png │ │ ├── run.sh │ │ └── train_with_fleet.py │ ├── nlp │ │ ├── README.md │ │ ├── distill.py │ │ ├── fine_tune.py │ │ ├── model.py │ │ ├── reader.py │ │ ├── test_distill.sh │ │ ├── test_train.sh │ │ └── train.py │ ├── qps_tools │ │ ├── 
distill_reader_qps.py │ │ ├── parse_config.py │ │ └── run.sh │ ├── reader_demo │ │ ├── distill_reader_demo.py │ │ └── run_demo.sh │ └── resnet │ │ ├── README.md │ │ ├── dali.py │ │ ├── models │ │ ├── __init__.py │ │ ├── resnet.py │ │ ├── resnet_vd.py │ │ └── vgg.py │ │ ├── scripts │ │ ├── start_local_teacher.sh │ │ └── train_student.sh │ │ ├── train_with_fleet.py │ │ └── utils │ │ ├── __init__.py │ │ ├── fp16_utils.py │ │ ├── img_tool.py │ │ ├── learning_rate.py │ │ ├── reader_cv2.py │ │ └── utility.py └── fit_a_line │ ├── Dockerfile │ ├── collector.py │ ├── collector.pyc │ ├── del_jobs.sh │ ├── examplejob.yaml │ ├── fluid │ ├── common.py │ ├── fit_a_line.py │ ├── image │ │ ├── infer_3.png │ │ └── ranges.png │ └── recognize_digits.py │ ├── nginx.yaml │ ├── train_ft.py │ └── train_local.py ├── k8s ├── edl_controller.yaml ├── k8s_tools.py ├── rbac_admin.yaml └── thirdpartyresource.yaml ├── logo ├── edl.png └── paddle.png ├── python ├── CMakeLists.txt ├── edl │ ├── __init__.py │ ├── collective │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── distribute_reader.py │ │ ├── launch.py │ │ └── serializable.py │ ├── discovery │ │ ├── __init__.py │ │ ├── consistent_hash.py │ │ ├── etcd_client.py │ │ ├── register.py │ │ └── server_alive.py │ ├── distill │ │ ├── __init__.py │ │ ├── balance_table.py │ │ ├── discovery_client.py │ │ ├── discovery_server.py │ │ ├── distill_reader.py │ │ ├── distill_worker.py │ │ ├── redis │ │ │ ├── __init__.py │ │ │ ├── balance_server.py │ │ │ ├── client.py │ │ │ ├── redis_store.py │ │ │ ├── server_register.py │ │ │ └── service_table.py │ │ ├── timeline.py │ │ └── utils.py │ ├── liveft │ │ ├── __init__.py │ │ ├── elastic.py │ │ └── launch.py │ ├── protos │ │ ├── common.proto │ │ ├── data_server.proto │ │ ├── distill_discovery.proto │ │ ├── generate.sh │ │ ├── pod_server.proto │ │ └── run_codegen.py │ ├── tests │ │ ├── __init__.py │ │ └── unittests │ │ │ ├── CMakeLists.txt │ │ │ ├── __init__.py │ │ │ ├── data_server │ │ │ ├── a.txt │ │ │ └── b.txt │ │ 
│ ├── data_server_tmp.py │ │ │ ├── del_from_etcd.py │ │ │ ├── distill_reader_test.py │ │ │ ├── edl_demo.py │ │ │ ├── etcd_client_test.py │ │ │ ├── etcd_test.sh │ │ │ ├── etcd_test_base.py │ │ │ ├── launch_demo.py │ │ │ ├── master_client_test.py │ │ │ ├── serving_conf │ │ │ └── serving_client_conf.prototxt │ │ │ ├── test_cluster.py │ │ │ ├── test_cluster_generator.py │ │ │ ├── test_cluster_watcher.py │ │ │ ├── test_consistent_hash.py │ │ │ ├── test_data_reader.py │ │ │ ├── test_data_server.py │ │ │ ├── test_distill_reader.sh │ │ │ ├── test_etcd_client.sh │ │ │ ├── test_file_list.txt │ │ │ ├── test_launch.py │ │ │ ├── test_launch.sh │ │ │ ├── test_leader_pod.py │ │ │ ├── test_pod.py │ │ │ ├── test_redis_distill_reader.sh │ │ │ ├── test_resource_pods.py │ │ │ ├── test_state.py │ │ │ └── test_train.py │ └── utils │ │ ├── __init__.py │ │ ├── args_utils.py │ │ ├── client.py │ │ ├── cluster.py │ │ ├── cluster_generator.py │ │ ├── cluster_watcher.py │ │ ├── constants.py │ │ ├── data_filter.py │ │ ├── data_server.py │ │ ├── data_server_client.py │ │ ├── env.py │ │ ├── error_utils.py │ │ ├── etcd_db.py │ │ ├── etcd_utils.py │ │ ├── exceptions.py │ │ ├── file_utils.py │ │ ├── json_serializable.py │ │ ├── launcher.py │ │ ├── leader_pod.py │ │ ├── log_utils.py │ │ ├── network_utils.py │ │ ├── pb_utils.py │ │ ├── pod.py │ │ ├── pod_server.py │ │ ├── pod_server_client.py │ │ ├── process.py │ │ ├── reader.py │ │ ├── register.py │ │ ├── resource_pods.py │ │ ├── state.py │ │ ├── status.py │ │ ├── string_utils.py │ │ ├── train_process.py │ │ ├── train_status.py │ │ ├── trainer.py │ │ └── unique_name.py └── setup.py.in └── scripts ├── build.sh ├── custom-boilerplate.go.txt ├── download_etcd.sh └── run_build.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | *~ 2 | vendor/ 3 | .glide/ 4 | -------------------------------------------------------------------------------- /.github/issue_template.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: 训练(Training issue) 3 | about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting a training 4 | issue. 5 | 6 | --- 7 | 8 | 为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 9 | 10 | 如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: 11 | - 版本、环境信息: 12 | - PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.8或CommitID 13 | - EDL版本:请提供您的EDL版本号,例如0.3或CommitID 14 | - CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 15 | - GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 16 | - 系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本 17 | - 复现信息:如为报错,请给出复现环境、复现步骤 18 | - 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 19 | 20 | Thank you for contributing to EDL. 21 | Before submitting the issue, please search the existing GitHub issues in case a similar issue has already been submitted or resolved. 22 | If there is no solution, please make sure that this is a training issue and include the following details: 23 | 24 | - System information: 25 | - PaddlePaddle version (eg.1.8)or commit 26 | - EDL version (eg.0.3)or commit 27 | - CPU: including CPU model and MKL/OpenBlas/MKLDNN version 28 | - GPU: including CUDA/CUDNN version 29 | - OS Platform (eg.Mac OS 10.14) 30 | - To Reproduce: 31 | - steps to reproduce the behavior 32 | - Describe your current behavior 33 | - Code to reproduce the issue 34 | 35 | - Other info/logs 36 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 非常感谢您给EDL项目提交PR! 2 | 在提交PR之前,请帮忙回答一下问题以帮助我们判断PR的意图: 3 | - 这个PR要解决什么问题? 4 | - 有没有对应的ISSUE? 5 | Fix 6 | - 这个PR里边有没有要注意的地方? 7 | - 这个PR改变了用户接口? 8 | - 其他的说明? 9 | 10 | 11 | You are welcome to submit a PR for EDL. 12 | Before this, would you like to answer some questions to help others understand what the PR does? 13 | - What this PR does / why we need it? 
14 | - Which issue(s) this PR fixes? 
15 | Fix 16 | - Special notes for your reviewer: 17 | - Does this PR introduce a user-facing change? 18 | - Additional documentation? 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | vendor/ 3 | .glide/ 4 | .vscode/ 5 | *.pyc 6 | build/ 7 | *.log 8 | resnet50_pod/ 9 | .*.swp 10 | *_pb2.py 11 | *_pb2_grpc.py 12 | *.pb.go 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 19.10b0 4 | hooks: 5 | - id: black 6 | exclude: ^(\.tools\/|example\/|k8s\/) 7 | - repo: https://github.com/pre-commit/pygrep-hooks 8 | rev: v1.5.1 9 | hooks: 10 | - id: python-use-type-annotations 11 | exclude: ^(\.tools\/|example\/|k8s\/) 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v3.2.0 14 | hooks: 15 | - id: trailing-whitespace 16 | files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ 17 | - id: check-docstring-first 18 | - id: check-json 19 | - id: check-added-large-files 20 | - id: debug-statements 21 | exclude: ^(\.tools\/|example\/|k8s\/) 22 | - id: requirements-txt-fixer 23 | - id: check-merge-conflict 24 | - id: check-symlinks 25 | - id: detect-private-key 26 | - id: end-of-file-fixer 27 | - repo: local 28 | hooks: 29 | - id: copyright_checker 30 | name: copyright_checker 31 | entry: python .tools/codestyle/copyright.py 32 | language: system 33 | files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ 34 | - repo: https://gitlab.com/pycqa/flake8 35 | rev: 3.8.3 36 | hooks: 37 | - id: flake8 38 | exclude: ^(\.tools\/|example\/|k8s\/) 39 | args: ['--max-line-length=100', '--extend-ignore=E203'] 40 | - repo: local 41 | hooks: 42 | - id: shellcheck 43 | name: shellcheck 44 | entry: shellcheck 45 | language: system 46 | files: .sh$ 47 | 
exclude: ^(\.tools\/|example\/|k8s\/) 48 | -------------------------------------------------------------------------------- /.tools/codestyle/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /.tools/codestyle/clang_format.hook: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | readonly VERSION="3.8" 5 | 6 | version=$(clang-format -version) 7 | 8 | if ! [[ $version == *"$VERSION"* ]]; then 9 | echo "clang-format version check failed." 10 | echo "a version contains '$VERSION' is needed, but get '$version'" 11 | echo "you can install the right version, and make an soft-link to '\$PATH' env" 12 | exit -1 13 | fi 14 | 15 | clang-format $@ 16 | -------------------------------------------------------------------------------- /.tools/test_runner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from __future__ import print_function 16 | 17 | import unittest 18 | import os 19 | import sys 20 | import paddle.fluid as fluid 21 | import importlib 22 | from six.moves import cStringIO 23 | 24 | 25 | def main(): 26 | sys.path.append(os.getcwd()) 27 | some_test_failed = False 28 | for module_name in sys.argv[1:]: 29 | buffer = cStringIO() 30 | main = fluid.Program() 31 | startup = fluid.Program() 32 | scope = fluid.core.Scope() 33 | with fluid.program_guard(main, startup): 34 | with fluid.scope_guard(scope): 35 | with fluid.unique_name.guard(): 36 | test_loader = unittest.TestLoader() 37 | module = importlib.import_module(module_name) 38 | tests = test_loader.loadTestsFromModule(module) 39 | res = unittest.TextTestRunner(stream=buffer).run(tests) 40 | if not res.wasSuccessful(): 41 | some_test_failed = True 42 | print( 43 | module_name, 44 | 'failed\n', 45 | buffer.getvalue(), 46 | file=sys.stderr) 47 | 48 | if some_test_failed: 49 | exit(1) 50 | 51 | 52 | if __name__ == '__main__': 53 | main() 54 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 3 | set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) 4 | set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) 5 | SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output) 6 | SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}") 7 | project(edl) 8 | 9 | option(WITH_TESTING "Compile EDL with unit testing" ON) 10 | option(WITH_COVERAGE "Compile EDL with code coverage" OFF) 11 | option(PY_VERSION "Compile EDL with python3 support" ${PY_VERSION}) 12 | 13 | # PY_VERSION 14 | if(NOT PY_VERSION) 15 | set(PY_VERSION 2.7) 16 | endif() 17 | 18 | include(python) 19 | 20 | IF(WITH_TESTING) 21 | ENABLE_TESTING() 22 | ENDIF() 23 | 24 | add_subdirectory(python) 25 | 
-------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | ## Frequently asked questions 2 | - What is EDL? 3 | - Computing resources on clouds such as Amazon AWS and Baidu Cloud are multi-tenant. Deep learning model training and inference with elastic resources will be common on the cloud. We propose Elastic Deep Learning (EDL), which makes training and inference of deep learning models on the cloud easier and more efficient. 4 | -------------------------------------------------------------------------------- /OWNERS.md: -------------------------------------------------------------------------------- 1 | ## Owner: 2 | The EDL project aims to support PaddlePaddle's distributed training. The current owner and contributors are as follows: 3 | - Owner: [guru4elephant](https://github.com/guru4elephant) 4 | 5 | ## Contributors: 6 | - [Yancey1989](https://github.com/Yancey1989) 7 | - [gongweibao](https://github.com/gongweibao) 8 | - [helinwang](https://github.com/helinwang) 9 | - [typhoonzero](https://github.com/typhoonzero) 10 | - [putcn](https://github.com/putcn) 11 | - [m3ngyang](https://github.com/m3ngyang) 12 | - [wangkuiyi](https://github.com/wangkuiyi) 13 | - [qizheng09](https://github.com/qizheng09) 14 | - [wangjiawei04](https://github.com/wangjiawei04) 15 | - [wopeizl](https://github.com/wopeizl) 16 | - [drinktee](https://github.com/drinktee) 17 | - [wanghaoshuang](https://github.com/wanghaoshuang) 18 | - [denkensk](https://github.com/denkensk) 19 | - [tizhou86](https://github.com/tizhou86) 20 | - [luotao1](https://github.com/luotao1) 21 | - [gangliao](https://github.com/gangliao) 22 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Note 2 | 3 | Please see [here](https://github.com/PaddlePaddle/edl/releases) for the release notes. 
4 | -------------------------------------------------------------------------------- /cmake/python.cmake: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | INCLUDE(python_module) 16 | 17 | FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) 18 | FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) 19 | 20 | if(WIN32) 21 | execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" 22 | "from distutils import sysconfig as s;import sys;import struct; 23 | print(sys.prefix); 24 | print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); 25 | " 26 | RESULT_VARIABLE _PYTHON_SUCCESS 27 | OUTPUT_VARIABLE _PYTHON_VALUES 28 | ERROR_VARIABLE _PYTHON_ERROR_VALUE) 29 | 30 | if(NOT _PYTHON_SUCCESS MATCHES 0) 31 | set(PYTHONLIBS_FOUND FALSE) 32 | return() 33 | endif() 34 | 35 | # Convert the process output into a list 36 | string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) 37 | string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) 38 | list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) 39 | list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) 40 | 41 | # Make sure all directory separators are '/' 42 | string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) 43 | 44 | set(PYTHON_LIBRARY 45 | "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") 46 | 47 | # when run in a 
venv, PYTHON_PREFIX points to it. But the libraries remain in the 48 | # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. 49 | if(NOT EXISTS "${PYTHON_LIBRARY}") 50 | get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) 51 | set(PYTHON_LIBRARY 52 | "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") 53 | endif() 54 | 55 | # raise an error if the python libs are still not found. 56 | if(NOT EXISTS "${PYTHON_LIBRARY}") 57 | message(FATAL_ERROR "Python libraries not found") 58 | endif() 59 | SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") 60 | endif(WIN32) 61 | 62 | # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. 63 | ADD_LIBRARY(python SHARED IMPORTED GLOBAL) 64 | SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) 65 | 66 | SET(py_env "") 67 | IF(PYTHONINTERP_FOUND) 68 | find_python_module(pip REQUIRED) 69 | find_python_module(wheel REQUIRED) 70 | ENDIF(PYTHONINTERP_FOUND) 71 | INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) 72 | INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) 73 | -------------------------------------------------------------------------------- /cmake/python_module.cmake: -------------------------------------------------------------------------------- 1 | # Find if a Python module is installed 2 | # Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html 3 | # To use do: find_python_module(PyQt4 REQUIRED) 4 | function(find_python_module module) 5 | string(TOUPPER ${module} module_upper) 6 | if(NOT PY_${module_upper}) 7 | if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED") 8 | set(${module}_FIND_REQUIRED TRUE) 9 | else() 10 | set(${module}_FIND_REQUIRED FALSE) 11 | endif() 12 | # A module's location is usually a directory, but for binary modules 13 | # it's a .so file. 
14 | execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" 15 | "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))" 16 | RESULT_VARIABLE _${module}_status 17 | OUTPUT_VARIABLE _${module}_location 18 | ERROR_QUIET 19 | OUTPUT_STRIP_TRAILING_WHITESPACE) 20 | if(NOT _${module}_status) 21 | set(PY_${module_upper} ${_${module}_location} CACHE STRING 22 | "Location of Python module ${module}") 23 | endif(NOT _${module}_status) 24 | endif(NOT PY_${module_upper}) 25 | find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) 26 | if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) 27 | message(FATAL_ERROR "python module ${module} is not found") 28 | endif() 29 | 30 | execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" 31 | "import sys, ${module}; sys.stdout.write(${module}.__version__)" 32 | OUTPUT_VARIABLE _${module}_version 33 | RESULT_VARIABLE _${module}_status 34 | ERROR_QUIET 35 | OUTPUT_STRIP_TRAILING_WHITESPACE) 36 | if(NOT _${module}_status) 37 | set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING 38 | "Version of Python module ${module}") 39 | endif(NOT _${module}_status) 40 | 41 | set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE) 42 | set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) 43 | endfunction(find_python_module) 44 | -------------------------------------------------------------------------------- /doc/Elastic Deep Learning Survey.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/Elastic Deep Learning Survey.pdf -------------------------------------------------------------------------------- /doc/ROADMAP.md: -------------------------------------------------------------------------------- 1 | ## 2020-03 Release 0.2.0 2 | 3 | ### Elastic Inference 4 | - Release features for inference with paddle 
serving on EDL 5 | 6 | ### Elastic Training with Checkpoint 7 | - Release features for training with paddlepaddle based on checkpoint 8 | - Verify the correctness of at least one training task. 9 | 10 | ### Release 0.2.0 11 | - Official release 0.2.0 with features of Elastic Inference and Checkpoint based Elastic Training 12 | 13 | ## 2020-06 Release 0.3.0 14 | 15 | ### EDL High Level API Design 16 | - Support User defined Training Data and Parameter Adaptation API 17 | - A user can define what to adapt when computing resources are adjusted 18 | 19 | ### Release Verified Model Training Scripts based on EDL API 20 | - NLP models and CV models trained based on GPU will be verified. 21 | Bert/Ernie and Resnet50 for classification will be considered currently. 22 | 23 | ### Release 0.3.0 24 | - Features above will be released 25 | 26 | ## 2020-09 Release 0.4.0 27 | 28 | ### Online Training 29 | - Support Elastic Online Training Solution with resources allocated dynamically along the training timeline 30 | - Recommendation scenarios should be considered on high priority, verified model will be released 31 | 32 | ### EDL API upgraded 33 | Update EDL API to support online elastic deep learning 34 | 35 | ### Release 0.4.0 36 | - Features above will be released 37 | 38 | ## 2020-12 39 | - More Application on EDL 40 | - More machine learning tools integrated. 
41 | -------------------------------------------------------------------------------- /doc/SUPPORT.md: -------------------------------------------------------------------------------- 1 | To get support for EDL and participate in the discussions, please join one or more of the appropriate mailing lists below: 2 | 3 | * [EDL Announce Mailing List](https://lists.lfai.foundation/g/edl-announce) 4 | * [EDL Technical Discussion](https://lists.lfai.foundation/g/edl-technical-discuss) 5 | * [EDL Technical Steering Committee](https://lists.lfai.foundation/g/edl-tsc) 6 | -------------------------------------------------------------------------------- /doc/build.md: -------------------------------------------------------------------------------- 1 | # How to Build EDL Component 2 | 3 | This article contains instructions for building EDL and packing it into 4 | a Docker image so that the EDL component can run in the Kubernetes cluster. 5 | 6 | ## Build EDL Controller 7 | 8 | ```bash 9 | glide install --strip-vendor 10 | go build github.com/paddlepaddle/edl/cmd/edl 11 | ``` 12 | 13 | The above step will generate a binary file named `edl` which should 14 | run as a daemon process on the Kubernetes cluster. 15 | 16 | ## Build EDL Controller Image 17 | 18 | To build your own docker images, run the following command: 19 | 20 | ```bash 21 | docker build -t yourRepoName/edl-controller . 22 | ``` 23 | 24 | This command will take the `Dockerfile`, build the EDL docker image and tag it as `yourRepoName/edl-controller`. 25 | 26 | Now push it to your Docker Hub repository so that the Kubernetes cluster is able to pull and deploy it. 
27 | 28 | ``` bash 29 | docker push yourRepoName/edl-controller 30 | ``` 31 | -------------------------------------------------------------------------------- /doc/checkpoint_based_edl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/checkpoint_based_edl.gif -------------------------------------------------------------------------------- /doc/distill.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/distill.gif -------------------------------------------------------------------------------- /doc/edl_collective_design_doc_cn.md: -------------------------------------------------------------------------------- 1 | # 概述 2 | Collective通信(同步)模式的训练因为其精度稳定、好复现的特点广泛用于图像、文本、语音等深度学习领域。 3 | 本文将阐述Collective通信模式下的EDL的设计思路和方法 4 | 5 | # 难点 6 | 当用户把自己的单机程序改成多机程序,他的程序需要增加的唯一的超参是节点的个数,由此带来的程序的改动可能会有几个,如:数据如何切分;batchsize、learning rate类的调整。 7 | 当用户把多机程序改成可以适应EDL的程序,需要在节点个数的基础上增加另外一个考虑:数据的一致性的问题。需要保证各处的跟节点数目相关的参数在节点变化的时候多个节点间数据是一致的。我们需要在框架端把这个考虑带来的影响减少到最小。 8 | 9 | 这带来几个难点问题: 10 | 11 | 1. 如何保存Python端的用户逻辑. 12 | 如数据如何切分、 文件的位置、及其他的Paddle框架之外的参数等。 13 | 这些参数是比较自由的、用户自定义的,我们在训练引擎端无法控制的。所以我们采用stop-resume的方式解决,用户程序面对新的超参只有节点个数一个。 14 | 15 | 2. 如何尽可能的保证精度、结果可复现。 16 | 训练的任务提交之前,用户需要指定自己的训练节点的最小和最大的节点的个数,同时需要指定batchsize是保持不变还是随着节点数目线性增长,因为batchsize是精度相关的超参,有些模型超过了一定阈值就需要做额外的调整,如Resnet50 total batchsize 超过8K的时候需要对学习率做额外的调整。 17 | 但是,保持总的batchsize不变也会带来扩展的效率问题:单卡batchsize减少,训练的性能可能会降低。 18 | 考虑到上述两个问题,这个地方需要用户自己根据节点的个数和自己的模型的特点做决定。 19 | 20 | 3. 如何让用户的程序改动少。 21 | stop-resume的方式是需要`save_checkpoint`和`load_checkpoint`的时机。因为需要用户在Python端的显示调用,这部分很难隐藏到接口里边去。 22 | 除了这个之外,其他无改动。 23 | 24 | 4. 如何对接多个集群。 25 | Kubernetes虽然用的越来越多,但是实际生产中会有多种类型的在线、离线集群。为了和这些集群对接,我们提出了一个中间层:Jobserver。 26 | 用这个模块来对接各种各样的集群接口。 27 | 28 | 5. 
如何防止没有意义的调度。 29 | - 当一个训练任务临近结束的时候其实是没有必要进行伸缩的,这个时候的伸缩反而会降低效率 30 | - 某些场景下,需要优先scale资源利用率高的作业而不是利用率低的,这有利于整体吞吐量的提升。 31 | 考虑到上述的原因,Paddle需要把作业的性能统计信息传递给调度端以便调度进行决策. 32 | 33 | 可能会有多种需要考虑的场景,而不仅仅是上述的两个。Paddle(计算引擎)需要把训练节点的信息汇报给调度端,以便于调度端做调度的决策。 34 | 35 | 6. 如何做数据的切分。 36 | 节点的变化一般会带来数据切分方式的变化。这就需要用户对数据有全局观。要么用户把数据全部下载下来,要么采用mount一个分布式的文件系统(如Ceph等)的方式。 37 | 38 | # 方案设计 39 | ## 架构图 40 | 41 | 42 | ## Launcher module 43 | 44 | Launcher模块主要负责多个trainer端的协调 45 | 46 | ## Trainer module 47 | . 48 | Trainer模块主要负责EDL功能里边的`save_checkpoint` `load_checkpoint` 49 | -------------------------------------------------------------------------------- /doc/edl_design_doc.md: -------------------------------------------------------------------------------- 1 | # Design Doc: Elastic Deep Learning 2 | 3 | TBD 4 | -------------------------------------------------------------------------------- /doc/edl_design_doc_cn.md: -------------------------------------------------------------------------------- 1 | # 2 | -------------------------------------------------------------------------------- /doc/edl_distill_design_doc.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | Distilling the Knowledge in a Neural Network[1](#r_1) is a different type of training used to transfer the knowledge from the cumbersome models(teachers) to a small model(student) that is more suitable for deployment. 3 | 4 | EDL Distillation is a large scale and universal solution for knowledge distillation. 5 | 6 | - Decouple the teacher and student models 7 | - They can run in the same or different nodes and transfer knowledge via network even on heterogeneous machines. 8 | Use Distillation on resnet50 as an example: The teachers(Resnet101 for example) can be deployed on P4 GPU cards since they compute forward network generally and the student can be deployed on v100 GPU cards since they need more GPU memory. 9 | 10 | - It's flexible and efficient. 
11 | - Teachers and students can be adjusted elastically during training according to resource utilization. 12 | - Easier to use and deploy. 13 | - Only a few lines of code need to change. 14 | - End-to-end use. We release a Kubernetes deployment solution for you. 15 | 16 | # Design 17 | ## Architecture 18 | ## Student 19 | ## Teacher 20 | ## Reader 21 | ## Balancer 22 | 23 | ## Reference 24 | 1.[Distilling the Knowledge in a Neural Network](https://arxiv.org/pdf/1503.02531.pdf) 25 | -------------------------------------------------------------------------------- /doc/experiment/distill_resnet50.md: -------------------------------------------------------------------------------- 1 | # Distill experiment on resnet50 2 | TBD 3 | -------------------------------------------------------------------------------- /doc/fault_tolerance.md: -------------------------------------------------------------------------------- 1 | # Fault tolerance for sync training 2 | ## Design 3 | During training, one or more trainers may crash. We use checkpoints to continue training. 4 | 5 | There are several design questions to consider: 6 | 7 | 1. How does Paddle itself save a checkpoint? 8 | Paddle implements `save_persistables` to save all persistable variables. 9 | 10 | 2. How to save the user's Python frontend logic? 11 | Such as the current epoch number, the step number in an epoch, the data slice and offset, and so on. 12 | 13 | 3. How to save checkpoints? 14 | - Which trainer saves the checkpoint? 15 | If there are many trainers, the trainer whose `rank` is 0 will do it. 16 | 17 | - Where do we save the checkpoint? 18 | It can be saved to the local file system, but eventually, it should be saved to a file system that can be seen by all trainers, such as a distributed HDFS. 19 | 20 | - How to guarantee the checkpoint's integrity and correctness? 21 | Saving a file is a multi-step process and is not an atomic action, whereas `rm`, `rename`, `mv` and similar operations should be. 
22 | We can take advantage of this atomicity: a checkpoint is never modified once it has been written with a version number. All checkpoints are saved to the file system with an incrementing version number. The interface first writes a temporary checkpoint file and then `rename`s it to a valid checkpoint name once writing has finished. 23 | 24 | - When is the checkpoint saved? 25 | Currently the trainer saves a checkpoint every epoch, so it does not need to save the data offset, which keeps things simple. Of course, this method is not friendly when an epoch takes too long. We will implement a step-level (time-limited) checkpoint interface in the next version. 26 | 27 | ## Interface 28 | There are two interfaces, `save_check_point` and `load_check_point`, to save and load a checkpoint. 29 | Two arguments deserve attention: 30 | 31 | 1. fs: 32 | It's an abstract interface to the file system, and there are two implementations: the local file system and HDFS. 33 | You can implement the member functions of this class yourself to use the checkpoint interface with another file system. 34 | 35 | 2. train_status: 36 | Currently it has only one member variable, `epoch_no`; more variables will be added after version 0.2. 
37 | 38 | ## Example 39 | 1.save_check_point: 40 | 41 | ``` 42 | if trainer_id == 0: 43 | saved_status = TrainStatus(pass_id) 44 | if args.checkpoint: 45 | if not os.path.isdir(args.checkpoint): 46 | os.makedirs(args.checkpoint) 47 | 48 | print("save_check_point:{}".format(args.checkpoint)) 49 | fleet.save_check_point(executor=exe, train_status=saved_status, 50 | path=args.checkpoint, fs=fs)#, main_program=fleet._origin_program) 51 | ``` 52 | 53 | 2.load_check_point: 54 | 55 | ``` 56 | if args.checkpoint is not None: 57 | tmp_s = fleet.load_check_point(exe, args.checkpoint, fs=fs, trainer_id=trainer_id) 58 | if tmp_s is not None: 59 | train_status = tmp_s 60 | 61 | for pass_id in range(train_status.next(), params["num_epochs"]): 62 | train() 63 | ``` 64 | 65 | # Async training 66 | TBD 67 | -------------------------------------------------------------------------------- /doc/fault_tolerance_cn.md: -------------------------------------------------------------------------------- 1 | # 同步训练的FaultTolerance 2 | ## 设计思路 3 | 在训练的过程中我们可能会碰到因为各种的问题造成的训练单个(或者多个)trainer挂掉的问题。我们采用checkpoint的方式记录当前状态,保证重启之后训练任务能够正常运行。 4 | 这里边可能有几个地方需要考虑: 5 | 6 | 1. Paddle本身的checkpoint 7 | Paddle本身提供`save_persistables `保存所有持久的变量。 8 | 9 | 2. 用户python端逻辑的checkpoint问题 10 | 主要是当前epoch number,数据切分方法和位置等。 11 | 12 | 3. 
checkpoint保存的问题 13 | - 谁来保存 14 | 如果有多个trainer节点,我们一般会选择rank=0的trainer来负责保存checkpoint 15 | 16 | - 保存的位置 17 | 可以保存到本地,但是最终要保存到重启任务能够看到的文件系统里边,如分布式的HDFS文件系统。 18 | 19 | - 如何确保checkpoint的正确性 20 | 保存文件一个持续性的过程,不是一个原子性的过程,不能保证事务性。但是一般的文件系统的操作`mv` `rename` `rm` 是。 21 | 可以利用这个特点,对已经保存的checkpoint不变,递增当前的 checkpoint的版本号,先写入一个临时文件,完成之后再rename成一个有效文件名的checkpoint。 22 | 23 | - 何时保存 24 | 我们现在推荐的方式是每一个epoch保存一次。因为一个epoch完成之后,可以认为两个epoch数据上没有关系。这样我们只需要保存当前的epoch号就可以了,不用保存当前的文件逻辑切分和位置等。减少了复杂度。当然,这种方式对一个epoch过大的的不友好。我们准备以后的版本开发step级别(时间)的checkpoint 25 | 26 | ## 接口介绍 27 | Paddle提供`save_check_point`和`load_check_point`两种方式来存、读checkpoint。 28 | 其中有两个参数需要注意一下: 29 | 1.fs 30 | 这个是我们对文件系统的抽象,目前的实现有两种:本地和远程HDFS。您可以实现自己的`FS`类来实现保存和读取checkpoint的功能 31 | 32 | 2.train_status 33 | 目前该类只有`epoch_no`的类变量,0.2以后的版本将尝试增加用户自定义的member等更多的值。 34 | 35 | ## 使用样例 36 | 1. save_check_point的样例: 37 | 38 | ``` 39 | if trainer_id == 0: 40 | saved_status = TrainStatus(pass_id) 41 | if args.checkpoint: 42 | if not os.path.isdir(args.checkpoint): 43 | os.makedirs(args.checkpoint) 44 | 45 | print("save_check_point:{}".format(args.checkpoint)) 46 | fleet.save_check_point(executor=exe, train_status=saved_status, 47 | path=args.checkpoint, fs=fs)#, main_program=fleet._origin_program) 48 | ``` 49 | 50 | 2. 
load_check_point的样例: 51 | 52 | ``` 53 | if args.checkpoint is not None: 54 | tmp_s = fleet.load_check_point(exe, args.checkpoint, fs=fs, trainer_id=trainer_id) 55 | if tmp_s is not None: 56 | train_status = tmp_s 57 | ``` 58 | 59 | 60 | # 异步训练的FaultTolerance 61 | TBD 62 | -------------------------------------------------------------------------------- /doc/images/edl-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/edl-arch.png -------------------------------------------------------------------------------- /doc/images/launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/launcher.png -------------------------------------------------------------------------------- /doc/images/trainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/doc/images/trainer.png -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM hub.baidubce.com/paddlepaddle/paddle: 2 | 3 | # gcc 5 4 | RUN ln -sf /usr/bin/gcc-5 /usr/bin/gcc 5 | # python3 default use python3.7 6 | RUN ln -sf /usr/local/bin/python3.7 /usr/local/bin/python3 7 | 8 | # Install Go 9 | RUN rm -rf /usr/local/go && wget -qO- https://dl.google.com/go/go1.13.10.linux-amd64.tar.gz | \ 10 | tar -xz -C /usr/local && \ 11 | mkdir -p /root/gopath && \ 12 | mkdir -p /root/gopath/bin && \ 13 | mkdir -p /root/gopath/src 14 | ENV GOROOT=/usr/local/go GOPATH=/root/gopath 15 | # should not be in the same line with GOROOT definition, otherwise docker build could not find 
GOROOT. 16 | ENV PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin 17 | 18 | # python 19 | ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt 20 | RUN python3.7 -m pip install pip==20.1.1 21 | RUN python3.7 -m pip install --upgrade setuptools 22 | RUN python3.7 -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 23 | 24 | ADD ./docker/dev_requirements.txt /root/paddle_edl/dev_requirements.txt 25 | RUN python3.7 -m pip install -r /root/paddle_edl/dev_requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 26 | 27 | # python 2.7 is deprecated 28 | # RUN python -m pip install pip==20.1.1 29 | # RUN python -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 30 | 31 | #etcd 32 | ENV HOME /root 33 | WORKDIR /root/paddle_edl 34 | ADD ./scripts/download_etcd.sh /root/paddle_edl/download_etcd.sh 35 | RUN bash /root/paddle_edl/download_etcd.sh 36 | 37 | # Install redis 38 | RUN cd /tmp/ && wget -q https://paddle-edl.bj.bcebos.com/redis-6.0.1.tar.gz && \ 39 | tar xzf redis-6.0.1.tar.gz && \ 40 | cd redis-6.0.1 && make -j && \ 41 | mv src/redis-server /usr/local/bin && \ 42 | mv src/redis-cli /usr/local/bin && \ 43 | cd .. 
&& rm -rf redis-6.0.1.tar.gz redis-6.0.1 44 | 45 | 46 | # protoc 47 | RUN mkdir -p /tmp/protoc && cd /tmp/protoc && \ 48 | wget -q -O protoc-3.11.4-linux-x86_64.zip --no-check-certificate https://paddle-edl.bj.bcebos.com/protoc-3.11.4-linux-x86_64.zip && \ 49 | unzip protoc-3.11.4-linux-x86_64.zip && mv bin/protoc /usr/local/bin 50 | 51 | RUN echo "export PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin" >> /root/.bashrc 52 | RUN echo "go env -w GO111MODULE=on && go env -w GOPROXY=https://goproxy.io,direct" >> /root/.bashrc 53 | ENV GO111MODULE=on 54 | ENV GOPROXY=https://goproxy.io,direct 55 | 56 | RUN rm -f /usr/bin/python /usr/bin/pip /usr/local/bin/pip && \ 57 | ln -s /usr/local/bin/python3.7 /usr/bin/python && \ 58 | ln -s /usr/local/bin/pip3.7 /usr/bin/pip && \ 59 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip 60 | 61 | RUN apt-get update && apt-get install -y shellcheck clang-format-3.8 62 | -------------------------------------------------------------------------------- /docker/Dockerfile.runtime: -------------------------------------------------------------------------------- 1 | FROM hub.baidubce.com/paddlepaddle/paddle: 2 | 3 | # gcc 5 4 | RUN ln -sf /usr/bin/gcc-5 /usr/bin/gcc 5 | # python3 default use python3.6 6 | RUN ln -sf /usr/local/bin/python3.6 /usr/local/bin/python3 7 | 8 | # Install Go 9 | RUN rm -rf /usr/local/go && wget -qO- https://dl.google.com/go/go1.13.10.linux-amd64.tar.gz | \ 10 | tar -xz -C /usr/local && \ 11 | mkdir -p /root/gopath && \ 12 | mkdir -p /root/gopath/bin && \ 13 | mkdir -p /root/gopath/src 14 | ENV GOROOT=/usr/local/go GOPATH=/root/gopath 15 | # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
16 | ENV PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin 17 | 18 | ADD ./docker/requirements.txt /root/paddle_edl/requirements.txt 19 | RUN python -m pip install pip==20.1.1 20 | RUN python3.6 -m pip install pip==20.1.1 21 | RUN python3.6 -m pip install --upgrade setuptools 22 | RUN python -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 23 | RUN python3.6 -m pip install -r /root/paddle_edl/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 24 | 25 | # Install etcd via the project's download script 26 | ENV HOME=/root 27 | WORKDIR /root/paddle_edl 28 | ADD ./scripts/download_etcd.sh /root/paddle_edl/download_etcd.sh 29 | RUN bash /root/paddle_edl/download_etcd.sh 30 | 31 | # Install redis (build from source, keep only redis-server and redis-cli) 32 | RUN cd /tmp/ && wget -q https://paddle-edl.bj.bcebos.com/redis-6.0.1.tar.gz && \ 33 | tar xzf redis-6.0.1.tar.gz && \ 34 | cd redis-6.0.1 && make -j && \ 35 | mv src/redis-server /usr/local/bin && \ 36 | mv src/redis-cli /usr/local/bin && \ 37 | cd .. && rm -rf redis-6.0.1.tar.gz redis-6.0.1 38 | 39 | RUN echo "export PATH=$PATH:${GOROOT}/bin:${GOPATH}/bin" >> /root/.bashrc 40 | RUN echo "go env -w GO111MODULE=on && go env -w GOPROXY=https://goproxy.io,direct" >> /root/.bashrc 41 | ENV GO111MODULE=on 42 | ENV GOPROXY=https://goproxy.io,direct 43 | 44 | # Install the edl wheel for both interpreters, then remove the wheel file 45 | ADD ./build/python/dist/paddle_edl-0.0.0-py2.py3-none-any.whl /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl 46 | RUN python -m pip install /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl 47 | RUN python3.6 -m pip install /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl 48 | RUN rm -f /tmp/paddle_edl-0.0.0-py2.py3-none-any.whl 49 | 50 | # Add the examples and the k8s helper files 51 | ADD ./example /root/paddle_edl/example 52 | ADD ./k8s/k8s_tools.py ./example/distill/k8s/edl_k8s /root/paddle_edl/ 53 | 54 | # Add the mnist distill teacher model 55 | RUN cd /root/paddle_edl/example/distill/mnist_distill && \ 56 | wget -q https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz && \ 57 | tar xzf mnist_cnn_model.tar.gz 58 | 
-------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | ## Build Runtime Docker Image for Kubernetes 2 | 3 | For the distributed training job on Kubernetes, we package Paddle binary files and some tools for Kubernetes into a runtime Docker image, the runtime Docker image gets scheduled by Kubernetes to run during training. 4 | 5 | You can build CPU and GPU Docker image which based on different PaddlePaddle product Docker image: 6 | 7 | ```bash 8 | ./build_docker.sh 9 | ``` 10 | 11 | - Build CPU runtime Docker image 12 | 13 | ```bash 14 | ./build_docker.sh paddlepaddle/paddle:0.11.0 paddlepaddle/paddlecloud-job:0.11.0 15 | ``` 16 | 17 | - Build GPU runtime Docker image 18 | 19 | ```bash 20 | ./build_docker.sh paddlepaddle/paddle:0.11.0-gpu paddlepaddle/paddlecloud-job:0.11.0-gpu 21 | ``` 22 | -------------------------------------------------------------------------------- /docker/build-devel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | 19 | unset GREP_OPTIONS 20 | BASEDIR="$(dirname "$(readlink -f "${0}")")" 21 | cd "${BASEDIR}"/.. 
22 | 23 | image=hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7-dev 24 | sed 's//latest-gpu-cuda10.0-cudnn7-dev/g' docker/Dockerfile > docker/Dockerfile.cuda10 25 | docker build --pull --network host . -t "${image}" -f docker/Dockerfile.cuda10 26 | docker push "${image}" 27 | 28 | image=hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda9.0-cudnn7-dev 29 | sed 's//latest-gpu-cuda9.0-cudnn7-dev/g' docker/Dockerfile > docker/Dockerfile.cuda9 30 | docker build --pull --network host . -t "${image}" -f docker/Dockerfile.cuda9 31 | docker push "${image}" 32 | -------------------------------------------------------------------------------- /docker/build-runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | 19 | if [[ $# != 1 ]] ; then 20 | echo "must set version" >&2 21 | exit 1 22 | fi 23 | 24 | unset GREP_OPTIONS 25 | BASEDIR="$(dirname "$(readlink -f "${0}")")" 26 | cd "${BASEDIR}"/.. 27 | 28 | build_image(){ 29 | cuda_version=$1 30 | latest_image="hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda${cuda_version}-cudnn7" 31 | sed 's//1.8.0-gpu-cuda'"${cuda_version}"'-cudnn7/g' docker/Dockerfile.runtime > "docker/Dockerfile.runtime.cuda${cuda_version}" 32 | docker build --pull --network host . 
-t "${latest_image}" -f "docker/Dockerfile.runtime.cuda${cuda_version}" 33 | docker push "${latest_image}" 34 | 35 | version=$2 36 | version_image="hub.baidubce.com/paddle-edl/paddle_edl:${version}-cuda${cuda_version}-cudnn7" 37 | docker tag "${latest_image}" "${version_image}" 38 | docker push "${version_image}" 39 | } 40 | 41 | version=$1 42 | cuda_version="10.0" 43 | echo "build cuda:${cuda_version} edl version:${version}" 44 | build_image "${cuda_version}" "$version" 45 | 46 | cuda_version="9.0" 47 | echo "build cuda:${cuda_version} edl version:${version}" 48 | build_image "${cuda_version}" "$version" 49 | -------------------------------------------------------------------------------- /docker/dev_requirements.txt: -------------------------------------------------------------------------------- 1 | astroid 2 | cpplint 3 | isort 4 | pre-commit 5 | pylint 6 | pytest 7 | -------------------------------------------------------------------------------- /docker/release-run-time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | RED='\033[0;31m' 18 | NC='\033[0m' # No Color 19 | GREEN='\033[0;32m' 20 | 21 | if [[ $# != 1 ]] ; then 22 | echo "must set version" >&2 23 | exit 1 24 | fi 25 | 26 | version=$1 27 | 28 | echo -e "${GREEN} Press 'y' to release ${RED} docker version ${version} ${NC}" 29 | while : ; do 30 | read -r -n 1 k 31 | if [[ $k == y ]] ; then 32 | break 33 | else 34 | echo "exit" 35 | exit 0 36 | fi 37 | done 38 | 39 | echo -e "\n${GREEN} Begin to release ${RED} edl docker ${version} ${NC}\n" 40 | 41 | unset GREP_OPTIONS 42 | BASEDIR="$(dirname "$(readlink -f "${0}")")" 43 | cd "${BASEDIR}" 44 | 45 | bash ./build-runtime.sh "$version" 46 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | etcd3==0.12.0 2 | flask==1.1.2 3 | grpcio==1.28.1 4 | grpcio_tools==1.28.1 5 | kubernetes 6 | paddle-serving-app 7 | paddle-serving-client 8 | paddle-serving-server-gpu 9 | paddlepaddle-gpu==1.8.0.post107 10 | pathlib2==2.3.5 11 | protobuf==3.8.0 12 | psutil 13 | redis 14 | -------------------------------------------------------------------------------- /example/collective/resnet50/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from .vgg import VGG11, VGG13, VGG16, VGG19 16 | from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 17 | -------------------------------------------------------------------------------- /example/collective/resnet50/train_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | export FLAGS_sync_nccl_allreduce=1 18 | export FLAGS_cudnn_exhaustive_search=1 19 | #export FLAGS_conv_workspace_size_limit=4000 #MB 20 | export FLAGS_cudnn_batchnorm_spatial_persistent=1 21 | 22 | export GLOG_v=1 23 | export GLOG_logtostderr=1 24 | export FLAGS_eager_delete_tensor_gb=0 25 | export NCCL_DEBUG=INFO 26 | # Unset proxy 27 | unset https_proxy http_proxy 28 | 29 | FP16=False #whether to use float16 30 | use_dali=False 31 | DATA_FORMAT="NCHW" 32 | if [[ ${use_dali} == "True" ]]; then 33 | export FLAGS_fraction_of_gpu_memory_to_use=0.8 34 | fi 35 | 36 | python -m paddle_edl.collective.launch ${distributed_args} \ 37 | --log_dir log \ 38 | --log_level 20 \ 39 | ./train_with_fleet.py \ 40 | --model=ResNet50 \ 41 | --batch_size=128 \ 42 | --total_images=1281167 \ 43 | --data_dir=./ImageNet \ 44 | --class_dim=1000 \ 45 | --image_shape=3,224,224 \ 46 | --model_save_dir=output/ \ 47 | --with_mem_opt=False \ 48 | --lr_strategy=piecewise_decay \ 49 | --lr=0.1\ 50 | --l2_decay=1e-4 \ 51 | --scale_loss=1.0 \ 52 | --num_epochs=90 \ 53 | --num_threads=2 \ 54 | --nccl_comm_num=1 \ 55 | --fuse=True \ 56 | --use_hierarchical_allreduce=False \ 57 | --fp16=${FP16} \ 58 | --use_dali=${use_dali} \ 59 | --checkpoint=./fleet_checkpoints \ 60 | --do_test=False \ 61 | --data_format=${DATA_FORMAT} 62 | -------------------------------------------------------------------------------- /example/collective/resnet50/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /example/ctr/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER peizhilin@baidu.com 3 | 4 | RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv git curl 5 | 6 | RUN pip install -U pip 7 | RUN pip install -U kubernetes paddlepaddle 8 | RUN mkdir -p /workspace 9 | 10 | RUN mkdir -p /temp && cd /temp && git clone https://github.com/PaddlePaddle/models.git && cd models && git checkout f503908d && mv /temp/models/fluid/PaddleRec/ctr /workspace/ 11 | 12 | ADD script/paddle_k8s /usr/bin 13 | ADD script/k8s_tools.py /root 14 | RUN chmod +x /usr/bin/paddle_k8s 15 | 16 | COPY ctr /workspace/ctr 17 | -------------------------------------------------------------------------------- /example/ctr/README: -------------------------------------------------------------------------------- 1 | 2 | CTR分布式训练 3 | 4 | 这是一个paddlepaddle分布式训练任务的示例和安装教程,在一个标准k8s集群上可以通过脚本直接构建一个分布式训练CTR任务。 5 | 6 | 整个工程分为三部分 7 | 8 | 1。镜像文件 9 | Dockerfile -- docker构建文件 10 | script -- 构建docker用到的脚本, 拷贝 edl/docker/k8s_tools 和 edl/docker/paddle_k8s 到此目录 11 | ctr -- paddlepaddle分布式训练CTR例子 12 | 13 | 2。部署文件 14 | ps-train -- 部署k8s的yaml文件 15 | 16 | 3。其它 17 | image -- 任务图例 18 | 百度云部署ctr分布式训练任务.rst -- 百度云搭建CTR任务说明 19 | -------------------------------------------------------------------------------- /example/ctr/k8s/cube.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cube-0 5 | labels: 6 | app: cube-0 7 | spec: 8 | containers: 9 | - name: cube-0 10 | image: wangjiawei1993/cube:v11 11 | workingDir: /cube 12 | command: ['/bin/bash'] 13 | args: ['start.sh'] 14 | ports: 15 | - containerPort: 8001 16 | name: cube-agent 17 | - containerPort: 8027 18 | name: cube-server 19 | 20 | --- 21 | 22 | apiVersion: v1 23 | kind: Pod 24 | metadata: 25 | name: cube-1 26 | labels: 27 | app: cube-1 28 | spec: 29 | containers: 30 | - name: cube-1 31 | image: wangjiawei1993/cube:v11 32 | workingDir: /cube 33 | command: ['/bin/bash'] 34 | args: ['start.sh'] 35 | ports: 36 | - containerPort: 8001 37 | name: cube-agent 38 | - containerPort: 8027 39 | name: cube-server 40 | 41 | --- 42 | 43 | kind: Service 44 | apiVersion: v1 45 | metadata: 46 | name: cube-0 47 | spec: 48 | ports: 49 | - name: agent 50 | port: 8001 51 | protocol: TCP 52 | - name: server 53 | port: 8027 54 | protocol: TCP 55 | selector: 56 | app: cube-0 57 | 58 | --- 59 | 60 | kind: Service 61 | apiVersion: v1 62 | metadata: 63 | name: cube-1 64 | spec: 65 | ports: 66 | - name: agent 67 | port: 8001 68 | protocol: TCP 69 | - name: server 70 | port: 8027 71 | protocol: TCP 72 | selector: 73 | app: cube-1 74 | -------------------------------------------------------------------------------- /example/ctr/k8s/ftp.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: file-server 5 | labels: 6 | app: file-server 7 | spec: 8 | volumes: 9 | - hostPath: 10 | path: /home/work 11 | type: "" 12 | name: file-home 13 | containers: 14 | - name: file-server 15 | image: halverneus/static-file-server 16 | ports: 17 | - containerPort: 8080 18 | volumeMounts: 19 | - mountPath: /web 20 | name: file-home 21 | nodeSelector: 22 | nodeType: model 23 | --- 24 | kind: Service 25 | apiVersion: 
v1 26 | metadata: 27 | name: file-server 28 | spec: 29 | type: LoadBalancer 30 | ports: 31 | - name: file-server 32 | port: 8080 33 | targetPort: 8080 34 | selector: 35 | app: file-server 36 | -------------------------------------------------------------------------------- /example/ctr/k8s/paddle-suite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml 18 | OUTPUT_NODE=$(kubectl get no | awk '{print $1}' | sed -n '2p') 19 | kubectl label nodes $OUTPUT_NODE nodeType=model --overwrite 20 | kubectl apply -f paddle-suite.yaml 21 | -------------------------------------------------------------------------------- /example/ctr/k8s/pdclient.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pdservingclient 5 | labels: 6 | app: pdservingclient 7 | spec: 8 | containers: 9 | - name: pdservingclient 10 | image: wangjiawei1993/pdservingclient:v4 11 | workingDir: / 12 | command: ['bash'] 13 | args: ['nonstop.sh'] 14 | -------------------------------------------------------------------------------- /example/ctr/k8s/pdserving.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: paddleserving 5 | labels: 6 | app: paddleserving 7 | spec: 8 | containers: 9 | - name: paddleserving 10 | image: wangjiawei1993/paddleserving:v7-debug 11 | workingDir: /serving 12 | command: ['/bin/bash'] 13 | args: ['run.sh'] 14 | ports: 15 | - containerPort: 8010 16 | name: serving 17 | 18 | --- 19 | apiVersion: v1 20 | kind: Service 21 | metadata: 22 | name: paddleserving 23 | spec: 24 | ports: 25 | - name: serving 26 | port: 8010 27 | protocol: TCP 28 | selector: 29 | app: paddleserving 30 | -------------------------------------------------------------------------------- /example/ctr/k8s/transfer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cube-transfer 5 | labels: 6 | app: cube-transfer 7 | spec: 8 | containers: 9 | - name: cube-transfer 10 | image: wangjiawei1993/cube-transfer:v18 11 | workingDir: / 12 | env: 13 | - name: POD_IP 14 | valueFrom: 15 | 
fieldRef: 16 | apiVersion: v1 17 | fieldPath: status.podIP 18 | command: ['bash'] 19 | args: ['nonstop.sh'] 20 | ports: 21 | - containerPort: 8099 22 | name: cube-transfer 23 | - containerPort: 8098 24 | name: cube-http 25 | -------------------------------------------------------------------------------- /example/ctr/ps-train/pserver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: ReplicaSet 3 | metadata: {name: fluid-ctr-pserver} 4 | spec: 5 | replicas: 2 6 | template: 7 | metadata: 8 | labels: {paddle-job-pserver: fluid-ctr} 9 | spec: 10 | containers: 11 | - command: [paddle_k8s, start_fluid] 12 | env: 13 | - {name: GLOG_v, value: '0'} 14 | - {name: GLOG_logtostderr, value: '1'} 15 | - {name: TOPOLOGY, value: ''} 16 | - {name: TRAINER_PACKAGE, value: /workspace} 17 | - {name: PADDLE_INIT_NICS, value: eth2} 18 | - name: NAMESPACE 19 | valueFrom: 20 | fieldRef: {fieldPath: metadata.namespace} 21 | - name: POD_IP 22 | valueFrom: 23 | fieldRef: {fieldPath: status.podIP} 24 | - name: POD_NAME 25 | valueFrom: 26 | fieldRef: {fieldPath: metadata.name} 27 | - name: PADDLE_CURRENT_IP 28 | valueFrom: 29 | fieldRef: {fieldPath: status.podIP} 30 | - {name: PADDLE_JOB_NAME, value: fluid-ctr} 31 | - {name: PADDLE_IS_LOCAL, value: '0'} 32 | - {name: PADDLE_TRAINERS_NUM, value: '2'} 33 | - {name: PADDLE_PSERVERS_NUM, value: '2'} 34 | - {name: FLAGS_rpc_deadline, value: '36000000'} 35 | - {name: ENTRY, value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1} 36 | - {name: PADDLE_PORT, value: '30236'} 37 | - {name: LD_LIBRARY_PATH, value: '/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind'} 38 | - {name: PADDLE_TRAINING_ROLE, value: PSERVER} 39 | - {name: TRAINING_ROLE, value: PSERVER} 40 | image: wopeizl/paddle_ctr_distribute 41 | imagePullPolicy: Always 42 | name: pserver 43 | volumeMounts: 44 | - {mountPath: /mnt/seqdata, name: seqdata} 
45 | resources: 46 | limits: {cpu: '10', memory: 30Gi} 47 | requests: {cpu: '1', memory: 100M} 48 | hostNetwork: true 49 | imagePullSecrets: 50 | - {name: regcred} 51 | volumes: 52 | - hostPath: {path: /home/work/} 53 | name: seqdata 54 | -------------------------------------------------------------------------------- /example/ctr/ps-train/trainer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: {name: fluid-ctr-trainer} 4 | spec: 5 | completions: 2 6 | parallelism: 2 7 | template: 8 | metadata: 9 | labels: {paddle-job: fluid-ctr} 10 | spec: 11 | restartPolicy: "OnFailure" 12 | containers: 13 | - command: [paddle_k8s, start_fluid] 14 | env: 15 | - {name: GLOG_v, value: '0'} 16 | - {name: GLOG_logtostderr, value: '1'} 17 | - {name: TOPOLOGY, value: ''} 18 | - {name: TRAINER_PACKAGE, value: /workspace} 19 | - {name: PADDLE_INIT_NICS, value: eth2} 20 | - name: NAMESPACE 21 | valueFrom: 22 | fieldRef: {fieldPath: metadata.namespace} 23 | - name: POD_IP 24 | valueFrom: 25 | fieldRef: {fieldPath: status.podIP} 26 | - name: POD_NAME 27 | valueFrom: 28 | fieldRef: {fieldPath: metadata.name} 29 | - name: PADDLE_CURRENT_IP 30 | valueFrom: 31 | fieldRef: {fieldPath: status.podIP} 32 | - {name: PADDLE_JOB_NAME, value: fluid-ctr} 33 | - {name: PADDLE_IS_LOCAL, value: '0'} 34 | - {name: FLAGS_rpc_deadline, value: '36000000'} 35 | - {name: PADDLE_PORT, value: '30236'} 36 | - {name: PADDLE_PSERVERS_NUM, value: '2'} 37 | - {name: PADDLE_TRAINERS_NUM, value: '2'} 38 | - {name: PADDLE_TRAINING_ROLE, value: TRAINER} 39 | - {name: TRAINING_ROLE, value: TRAINER} 40 | - {name: LD_LIBRARY_PATH, value: '/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind'} 41 | - {name: ENTRY, value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1} 42 | image: wopeizl/paddle_ctr_distribute 43 | imagePullPolicy: Always 44 | name: trainer 45 | volumeMounts: 46 | 
- {mountPath: /mnt/seqdata, name: seqdata} 47 | resources: 48 | limits: {cpu: '10', memory: 30Gi} 49 | requests: {cpu: '1', memory: 100M} 50 | hostNetwork: true 51 | imagePullSecrets: 52 | - {name: regcred} 53 | volumes: 54 | - hostPath: {path: /home/work/} 55 | name: seqdata 56 | -------------------------------------------------------------------------------- /example/ctr/script/README: -------------------------------------------------------------------------------- 1 | please copy the edl/docker/k8s_tools and edl/docker/paddle_k8s into this folder if want to build the docker image by yourself. 2 | -------------------------------------------------------------------------------- /example/ctr/script/cube.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cube-0 5 | labels: 6 | app: cube-0 7 | spec: 8 | containers: 9 | - name: cube-0 10 | image: hub.baidubce.com/ctr/cube:latest 11 | workingDir: /cube 12 | command: ['/bin/bash'] 13 | args: ['start.sh'] 14 | ports: 15 | - containerPort: 8001 16 | name: cube-agent 17 | - containerPort: 8027 18 | name: cube-server 19 | 20 | --- 21 | 22 | apiVersion: v1 23 | kind: Pod 24 | metadata: 25 | name: cube-1 26 | labels: 27 | app: cube-1 28 | spec: 29 | containers: 30 | - name: cube-1 31 | image: hub.baidubce.com/ctr/cube:latest 32 | workingDir: /cube 33 | command: ['/bin/bash'] 34 | args: ['start.sh'] 35 | ports: 36 | - containerPort: 8001 37 | name: cube-agent 38 | - containerPort: 8027 39 | name: cube-server 40 | 41 | --- 42 | 43 | kind: Service 44 | apiVersion: v1 45 | metadata: 46 | name: cube-0 47 | spec: 48 | ports: 49 | - name: agent 50 | port: 8001 51 | protocol: TCP 52 | - name: server 53 | port: 8027 54 | protocol: TCP 55 | selector: 56 | app: cube-0 57 | 58 | --- 59 | 60 | kind: Service 61 | apiVersion: v1 62 | metadata: 63 | name: cube-1 64 | spec: 65 | ports: 66 | - name: agent 67 | port: 8001 68 | protocol: TCP 69 | - name: 
server 70 | port: 8027 71 | protocol: TCP 72 | selector: 73 | app: cube-1 74 | -------------------------------------------------------------------------------- /example/ctr/script/defaultserviceaccountclusterrole.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: default 5 | namespace: default 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods"] 9 | verbs: ["get", "list", "watch"] 10 | 11 | --- 12 | kind: ClusterRoleBinding 13 | apiVersion: rbac.authorization.k8s.io/v1 14 | metadata: 15 | name: default 16 | namespace: default 17 | subjects: 18 | - kind: ServiceAccount 19 | name: default 20 | namespace: default 21 | roleRef: 22 | kind: ClusterRole 23 | name: default 24 | apiGroup: rbac.authorization.k8s.io 25 | -------------------------------------------------------------------------------- /example/ctr/script/fileserver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: file-server 5 | labels: 6 | app: file-server 7 | spec: 8 | nodeSelector: 9 | nodeType: model 10 | volumes: 11 | - hostPath: 12 | path: /home/work 13 | type: "" 14 | name: file-home 15 | containers: 16 | - name: file-server 17 | image: halverneus/static-file-server 18 | ports: 19 | - containerPort: 8080 20 | volumeMounts: 21 | - mountPath: /web 22 | name: file-home 23 | --- 24 | kind: Service 25 | apiVersion: v1 26 | metadata: 27 | name: loadbalancer 28 | spec: 29 | type: LoadBalancer 30 | ports: 31 | - name: file-server 32 | port: 8080 33 | targetPort: 8080 34 | selector: 35 | app: file-server 36 | -------------------------------------------------------------------------------- /example/ctr/script/ftp.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: file-server 5 | labels: 6 | app: 
file-server 7 | spec: 8 | volumes: 9 | - hostPath: 10 | path: /home/work 11 | type: "" 12 | name: file-home 13 | containers: 14 | - name: file-server 15 | image: hub.baidubce.com/ctr/file-server:latest 16 | ports: 17 | - containerPort: 8080 18 | volumeMounts: 19 | - mountPath: /web 20 | name: file-home 21 | nodeSelector: 22 | nodeType: model 23 | --- 24 | kind: Service 25 | apiVersion: v1 26 | metadata: 27 | name: file-server 28 | spec: 29 | type: LoadBalancer 30 | ports: 31 | - name: file-server 32 | port: 8080 33 | targetPort: 8080 34 | selector: 35 | app: file-server 36 | -------------------------------------------------------------------------------- /example/ctr/script/paddle-suite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/master/installer/volcano-development.yaml 18 | OUTPUT_NODE=$(kubectl get no | awk '{print $1}' | sed -n '2p') 19 | kubectl label nodes $OUTPUT_NODE nodeType=model --overwrite 20 | kubectl apply -f paddle-suite.yaml 21 | -------------------------------------------------------------------------------- /example/ctr/script/pdclient.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: pdservingclient 5 | labels: 6 | app: pdservingclient 7 | spec: 8 | containers: 9 | - name: pdservingclient 10 | image: hub.baidubce.com/ctr/pdservingclient:latest 11 | workingDir: / 12 | command: ['bash'] 13 | args: ['nonstop.sh'] 14 | -------------------------------------------------------------------------------- /example/ctr/script/pdserving.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: paddleserving 5 | labels: 6 | app: paddleserving 7 | spec: 8 | containers: 9 | - name: paddleserving 10 | image: hub.baidubce.com/ctr/paddleserving:latest 11 | workingDir: /serving 12 | command: ['/bin/bash'] 13 | args: ['run.sh'] 14 | ports: 15 | - containerPort: 8010 16 | name: serving 17 | 18 | --- 19 | apiVersion: v1 20 | kind: Service 21 | metadata: 22 | name: paddleserving 23 | spec: 24 | ports: 25 | - name: serving 26 | port: 8010 27 | protocol: TCP 28 | selector: 29 | app: paddleserving 30 | -------------------------------------------------------------------------------- /example/ctr/script/transfer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: cube-transfer 5 | labels: 6 | app: cube-transfer 7 | spec: 8 | containers: 9 | - name: cube-transfer 10 | image: hub.baidubce.com/ctr/cube-transfer:latest 11 | workingDir: / 12 | env: 13 | - 
name: POD_IP 14 | valueFrom: 15 | fieldRef: 16 | apiVersion: v1 17 | fieldPath: status.podIP 18 | command: ['bash'] 19 | args: ['nonstop.sh'] 20 | ports: 21 | - containerPort: 8099 22 | name: cube-transfer 23 | - containerPort: 8098 24 | name: cube-http 25 | -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/cluster-info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/cluster-info.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/concole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/concole.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/conf-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/conf-download.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/ctr-models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-models.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/ctr-prediction-end-to-end-deployment.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-prediction-end-to-end-deployment.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/ctr-running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/ctr-running.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/eip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/eip.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/file_server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/file_server.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/helm-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/helm-version.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/kubectl-version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/kubectl-version.png -------------------------------------------------------------------------------- 
/example/ctr/src/baidu_cloud/load_balancer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/load_balancer.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/pserver-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/pserver-log.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/tiller.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/tiller.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/trainer-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/trainer-log.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/volcano.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/wget_example.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/wget_example.png -------------------------------------------------------------------------------- /example/ctr/src/baidu_cloud/workload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/baidu_cloud/workload.png -------------------------------------------------------------------------------- /example/ctr/src/create_gpu_machine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_gpu_machine.png -------------------------------------------------------------------------------- /example/ctr/src/create_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_image.png -------------------------------------------------------------------------------- /example/ctr/src/create_more_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/create_more_nodes.png -------------------------------------------------------------------------------- /example/ctr/src/ctr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_kubectl_download.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_kubectl_download.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_node.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_node.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_pods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_pods.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_pserver_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_pserver_log.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_trainer_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_trainer_log.png -------------------------------------------------------------------------------- /example/ctr/src/ctr_volcano_install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctr_volcano_install.png -------------------------------------------------------------------------------- /example/ctr/src/ctryaml1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml1.png -------------------------------------------------------------------------------- /example/ctr/src/ctryaml2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml2.png -------------------------------------------------------------------------------- /example/ctr/src/ctryaml3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/ctryaml3.png -------------------------------------------------------------------------------- /example/ctr/src/cube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube.png -------------------------------------------------------------------------------- /example/ctr/src/cube_config1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube_config1.png -------------------------------------------------------------------------------- /example/ctr/src/cube_config2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/cube_config2.png -------------------------------------------------------------------------------- /example/ctr/src/dist_train_nccl2.graffle: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_nccl2.graffle -------------------------------------------------------------------------------- /example/ctr/src/dist_train_nccl2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_nccl2.png -------------------------------------------------------------------------------- /example/ctr/src/dist_train_pserver.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_pserver.graffle -------------------------------------------------------------------------------- /example/ctr/src/dist_train_pserver.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/dist_train_pserver.png -------------------------------------------------------------------------------- /example/ctr/src/file_server_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/file_server_pod.png -------------------------------------------------------------------------------- /example/ctr/src/file_server_svc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/file_server_svc.png -------------------------------------------------------------------------------- 
/example/ctr/src/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/overview.png -------------------------------------------------------------------------------- /example/ctr/src/paddleclient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleclient.png -------------------------------------------------------------------------------- /example/ctr/src/paddleserving_pod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleserving_pod.png -------------------------------------------------------------------------------- /example/ctr/src/paddleserving_svc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/paddleserving_svc.png -------------------------------------------------------------------------------- /example/ctr/src/parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/parallelism.png -------------------------------------------------------------------------------- /example/ctr/src/pyreader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/pyreader.png -------------------------------------------------------------------------------- 
/example/ctr/src/release.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/release.png -------------------------------------------------------------------------------- /example/ctr/src/transfer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/ctr/src/transfer.png -------------------------------------------------------------------------------- /example/demo/collective/README.md: -------------------------------------------------------------------------------- 1 | # Purpose 2 | This article illustrates how to change the train program to an EDL program, and run on single or multiple nodes. 3 | 4 | 5 | ## How to change from a normal train program to an EDL train program 6 | The main changes are: 7 | 8 | - `load_checkpoint` should be added at the beginning of training and 9 | - `save_checkpoint` added at the end of every epoch. 10 | the checkpoint should be on a distributed file system such as HDFS so all trainers can download from it. 
A complete example is [here](https://github.com/elasticdeeplearning/edl/tree/develop/example/collective/resnet50). 11 | 12 | ``` 13 | fs=HDFSClient(args.hdfs_name, args.hdfs_ugi,20*60*1000, 3 * 1000) 14 | 15 | train_status =TrainStatus() 16 | tmp_s = fleet.load_checkpoint(exe, args.checkpoint, fs=fs, trainer_id=trainer_id) 17 | if tmp_s is not None: 18 | train_status = tmp_s 19 | 20 | for pass_id in range(train_status.next(), params["num_epochs"]): 21 | train() 22 | 23 | if trainer_id == 0: 24 | saved_status = TrainStatus(pass_id) 25 | fleet.save_checkpoint(exe, train_status=saved_status, 26 | path=args.checkpoint, fs=fs) 27 | ``` 28 | 29 | The epoch number is stored in `train_status` and is restored when the checkpoint is loaded. 30 | 31 | ## Start the ResNet50 demo training on multiple nodes 32 | 33 | 1. Start a JobServer on one node which generates changing scripts. 34 | 35 | ``` 36 | node_ips="192.168.10.1,192.168.10.2" 37 | python -u paddle_edl.demo.collective.job_server_demo \ 38 | --node_ips ${node_ips} \ 39 | --pod_num_of_node 8 \ 40 | --time_interval_to_change 900 \ 41 | --gpu_num_of_node 8 42 | ``` 43 | 44 | 1. Start a JobClient on every node; it controls the worker processes. 
45 | 46 | ``` 47 | # set the ImageNet data path 48 | export PADDLE_EDL_IMAGENET_PATH= 49 | # set the checkpoint path 50 | export PADDLE_EDL_FLEET_CHECKPOINT_PATH= 51 | export PADDLE_JOBSERVER="http://192.168.10.1:8180" 52 | 53 | mkdir -p resnet50_pod 54 | unset http_proxy https_proxy 55 | 56 | # running under edl 57 | export PADDLE_RUNING_ENV=PADDLE_EDL 58 | export PADDLE_JOB_ID="test_job_id_1234" 59 | export PADDLE_POD_ID="not set" 60 | 61 | python -u paddle_edl.demo.collective.job_client_demo \ 62 | --log_level 20 \ 63 | --package_sh ./resnet50/package.sh \ 64 | --pod_path ./resnet50_pod \ 65 | ./train_pretrain.sh 66 | ``` 67 | 68 | 69 | ## On Kubernetes 70 | 71 | We have built the docker images for you and you can start a demo on Kubernetes immediately: 72 | TBD 73 | -------------------------------------------------------------------------------- /example/demo/collective/env.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Path to the ImageNet dataset directory 16 | export PADDLE_EDL_IMAGENET_PATH=/root/go/dataset/ImageNet 17 | # Directory in which checkpoints are saved 18 | export PADDLE_EDL_FLEET_CHECKPOINT_PATH=/root/go/checkpoints/resnet50_1 19 | -------------------------------------------------------------------------------- /example/demo/collective/resnet50/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # Stages a per-pod copy of the resnet50 example under resnet50_pod/<pod_id> and links the dataset and checkpoint directories into it. 17 | set -xe 18 | # Parse arguments: only "-pod_id <id>" is accepted; any other non-empty argument aborts. 19 | while true ; do 20 | case "$1" in 21 | -pod_id) pod_id="$2" ; shift 2 ;; 22 | *) 23 | if [[ ${#1} -gt 0 ]]; then 24 | echo "not supported arugments ${1}" ; exit 1 ; 25 | else 26 | break 27 | fi 28 | ;; 29 | esac 30 | done 31 | 32 | 33 | src_dir=../../../collective/resnet50 34 | dst_dir=resnet50_pod/${pod_id} 35 | 36 | echo "mkdir resnet50_pod/${pod_id}" 37 | mkdir -p "${dst_dir}" 38 | 39 | # Copy the resnet50 runtime environment (scripts and helper packages) into the pod directory 40 | cp "${src_dir}"/*.py "${dst_dir}"/ 41 | cp "${src_dir}"/*.sh "${dst_dir}"/ 42 | cp -r "${src_dir}"/utils "${dst_dir}"/utils 43 | cp -r "${src_dir}"/models "${dst_dir}"/models 44 | cp -r "${src_dir}"/scripts "${dst_dir}"/scripts 45 | 46 | if [[ ! -d "${dst_dir}/ImageNet" ]]; then 47 | ln -s "${PADDLE_EDL_IMAGENET_PATH}" "${dst_dir}/ImageNet" # name the link explicitly so it always matches the guard above, whatever the dataset directory is called 48 | fi 49 | 50 | if [[ ! 
-d "${dst_dir}/fleet_checkpoints" ]]; then 51 | ln -s "${PADDLE_EDL_FLEET_CHECKPOINT_PATH}" "${dst_dir}/fleet_checkpoints" 52 | fi 53 | -------------------------------------------------------------------------------- /example/demo/collective/start_job_client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | unset http_proxy https_proxy 19 | 20 | # running under edl 21 | export PADDLE_RUNING_ENV=PADDLE_EDL 22 | export PADDLE_JOBSERVER="http://127.0.0.1:8180" 23 | if [[ "${PADDLE_TRAINERS}x" != x ]]; then 24 | pod_arr=(${PADDLE_TRAINERS//,/ }) 25 | export PADDLE_JOBSERVER="http://${pod_arr[0]}:8180" 26 | fi 27 | export PADDLE_JOB_ID="test_job_id_1234" 28 | export PADDLE_POD_ID="not set" 29 | 30 | BASEDIR=$(dirname $(readlink -f $0)) 31 | echo $BASEDIR 32 | 33 | nohup python -u paddle_edl.demo.collective.job_client_demo \ 34 | --log_level 20 \ 35 | --package_sh ./resnet50/package.sh \ 36 | --pod_path ./resnet50_pod \ 37 | ./train_pretrain.sh > job_client.log 2>&1 & 38 | -------------------------------------------------------------------------------- /example/demo/collective/start_job_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | node_ips="127.0.0.1" 18 | if [[ "${PADDLE_TRAINERS}x" != "x" ]]; then 19 | node_ips=${PADDLE_TRAINERS} 20 | fi 21 | echo "node_ips:${node_ips}" 22 | 23 | BASEDIR=$(dirname $(readlink -f $0)) 24 | echo "${BASEDIR}" 25 | 26 | nohup python -u paddle_edl.demo.collective.job_server_demo \ 27 | --node_ips ${node_ips} \ 28 | --pod_num_of_node 8 \ 29 | --time_interval_to_change 900 \ 30 | --gpu_num_of_node 8 > job_server.log 2>&1 & 31 | -------------------------------------------------------------------------------- /example/distill/k8s/balance.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: balance 5 | labels: 6 | edl-distill-demo-app: balance 7 | spec: 8 | replicas: 1 9 | template: 10 | metadata: 11 | name: balance 12 | labels: 13 | app: balance 14 | spec: 15 | hostNetwork: true 16 | containers: 17 | - name: balance 18 | image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7 19 | imagePullPolicy: Always 20 | workingDir: / 21 | command: ['/bin/bash', '-c'] 22 | args: ['sleep 3000'] 23 | resources: 24 | requests: 25 | memory: 20Gi 26 | cpu: 2 27 | limits: 28 | memory: 20Gi 29 | cpu: 2 30 | restartPolicy: Never 31 | -------------------------------------------------------------------------------- /example/distill/k8s/edl_k8s: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | # Label selectors matching the edl-distill-demo-app labels used by the sibling k8s manifests. 5 | balance_label="edl-distill-demo-app=balance" 6 | etcd_label="edl-distill-demo-app=etcd" 7 | # Waits until one etcd pod is running, then exports its IP as etcd_ip. 8 | start_balance() { 9 | stdbuf -oL python /root/k8s_tools.py wait_pods_running ${etcd_label} 1 10 | export etcd_ip=$(python /root/k8s_tools.py fetch_ips ${etcd_label}) 11 | } 12 | 13 | usage() { 14 | echo "usage: paddle_k8s []:" 15 | echo " start_balance Start a balance" 16 | echo " start_student Start a student" 17 | echo " start_teacher Start a teacher" 18 | } 19 | # NOTE(review): start_student and start_teacher are dispatched below but are not defined in this script - confirm they are provided elsewhere. 20 | case "$1" in 21 | start_balance) 22 | start_balance 23 | ;; 24 | start_student) 25 | start_student 26 | ;; 27 | start_teacher) 28 | start_teacher 29 | ;; 30 | --help) 31 | usage 32 | ;; 33 | *) 34 | usage 35 | ;; 36 | esac 37 | -------------------------------------------------------------------------------- /example/distill/k8s/etcd.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: etcd 5 | labels: 6 | edl-distill-demo-app: etcd 7 | spec: 8 | replicas: 1 9 | template: 10 | metadata: 11 | name: etcd 12 | labels: 13 | app: etcd 14 | spec: 15 | hostNetwork: true 16 | containers: 17 | - name: etcd 18 | image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7 19 | imagePullPolicy: Always 20 | workingDir: / 21 | command: ['/bin/bash', '-c'] 22 | args: ['etcd'] 23 | ports: 24 | - containerPort: 2379 25 | name: serving 26 | resources: 27 | requests: 28 | memory: 20Gi 29 | cpu: 2 30 | limits: 31 | memory: 20Gi 32 | cpu: 2 33 | restartPolicy: Never 34 | -------------------------------------------------------------------------------- /example/distill/k8s/student.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: student 5 | labels: 6 | edl-distill-demo-app: student 7 | 
spec: 8 | replicas: 1 9 | template: 10 | metadata: 11 | name: student 12 | labels: 13 | app: student 14 | spec: 15 | hostNetwork: true 16 | containers: 17 | - name: student 18 | image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7 19 | imagePullPolicy: Always 20 | workingDir: / 21 | command: ['/bin/bash', '-c'] 22 | args: ['sleep 3000'] 23 | resources: 24 | requests: 25 | memory: 20Gi 26 | cpu: 2 27 | limits: 28 | memory: 20Gi 29 | cpu: 2 30 | restartPolicy: Never 31 | -------------------------------------------------------------------------------- /example/distill/k8s/teacher.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: teacher 5 | labels: 6 | edl-distill-demo-app: teacher 7 | spec: 8 | replicas: 1 9 | template: 10 | metadata: 11 | name: teacher 12 | labels: 13 | app: teacher 14 | spec: 15 | hostNetwork: true 16 | containers: 17 | - name: teacher 18 | image: hub.baidubce.com/paddle-edl/paddle_edl:latest-cuda10.0-cudnn7 19 | imagePullPolicy: Always 20 | workingDir: / 21 | command: ['/bin/bash', '-c'] 22 | args: ['sleep 3000'] 23 | ports: 24 | - containerPort: 7001 25 | name: serving 26 | resources: 27 | requests: 28 | memory: 20Gi 29 | cpu: 2 30 | limits: 31 | memory: 20Gi 32 | cpu: 2 33 | restartPolicy: Never 34 | -------------------------------------------------------------------------------- /example/distill/mnist_distill/image/infer_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/distill/mnist_distill/image/infer_3.png -------------------------------------------------------------------------------- /example/distill/mnist_distill/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -eu 18 | 19 | if [ ! -f mnist_cnn_model.tar.gz ]; then 20 | wget --no-check-certificate https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz 21 | fi 22 | tar -zxf mnist_cnn_model.tar.gz 23 | 24 | # at gpu 0, start paddle serving server on port 9292 25 | port=9292 26 | nohup python -m paddle_serving_server_gpu.serve \ 27 | --model mnist_cnn_model \ 28 | --thread 4 \ 29 | --port ${port} \ 30 | --mem_optim True \ 31 | --gpu_ids 0 & 32 | serving_pid=$! 33 | 34 | # start distill train 35 | export CUDA_VISIBLE_DEVICES=0 36 | python train_with_fleet.py \ 37 | --use_distill_service True \ 38 | --distill_teachers 127.0.0.1:${port} 39 | 40 | # kill serving server 41 | pstree -p ${serving_pid} | awk -F"[()]" '{print $2}'| xargs kill -9 42 | -------------------------------------------------------------------------------- /example/distill/nlp/README.md: -------------------------------------------------------------------------------- 1 | # ERNIE distillation 2 | We show how to distill knowledge from ERNIE to a mini model: BOW and other models on Chinese sentiment task. 
3 | 4 | ## Quick start 5 | ### Download dataset 6 | ``` 7 | wget https://paddle-edl.bj.bcebos.com/distillation/chnsenticorp/data.tgz 8 | tar -xzvf data.tgz 9 | ``` 10 | 11 | ### Get the teacher model 12 | ``` 13 | nohup python -u ./fine_tune.py > finetune.log 2>&1 & 14 | ``` 15 | 16 | When the job completes, the two directories needed for distillation, `ernie_senti_server` and `ernie_senti_client`, will have been generated. 17 | 18 | ### Or download the teacher model directly 19 | Alternatively, you can download the teacher model directly instead of generating it yourself. 20 | 21 | ``` 22 | wget https://paddle-edl.bj.bcebos.com/distillation/chnsenticorp/ernie_senti.tgz 23 | tar -xzvf ernie_senti.tgz 24 | ``` 25 | 26 | ### Start a local teacher 27 | ``` 28 | nohup python -m paddle_serving_server_gpu.serve \ 29 | --model ./ernie_senti_server/ \ 30 | --port 19290 \ 31 | --thread 8 \ 32 | --mem_optim \ 33 | --gpu_ids 0 > teacher.log 2>&1 & 34 | ``` 35 | 36 | ### Start a student 37 | The student is currently a BOW model. CNN, LSTM, and tiny ERNIE students will be added later. 38 | 39 | ``` 40 | python -u distill.py --fixed_teacher 127.0.0.1:19290 41 | ``` 42 | 43 | ### Result 44 | | model | dev dataset(acc) | test dataset(acc) | 45 | | :----: | :-----: | :----: | 46 | | BOW | 0.901 | 0.908 | 47 | | BOW + distillation | 0.905 | 0.915 | 48 | -------------------------------------------------------------------------------- /example/distill/nlp/test_distill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | export LD_LIBRARY_PATH=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/ 19 | export CUDA_VISIBLE_DEVICES=7 20 | 21 | fixed_teacher="127.0.0.1:19290,127.0.0.1:19291,127.0.0.1:19292,127.0.0.1:19293,127.0.0.1:19294,127.0.0.1:19295,127.0.0.1:19296,127.0.0.1:19297" 22 | 23 | for w in {1..10} 24 | do 25 | for T in {1..20} 26 | do 27 | wf=$( (echo scale=1 ; echo $w / 10 ) | bc ) 28 | Tf=$( (echo scale=1 ; echo $T ) | bc ) 29 | python3.6 -u distill.py \ 30 | --fixed_teacher $fixed_teacher \ 31 | --opt=AdamW \ 32 | --s_weight "$wf" \ 33 | --train_range 10 \ 34 | --LR 1e-4 \ 35 | --kl 0 \ 36 | --T "$Tf" \ 37 | --epoch_num 20 > log/"d_w${wf}_T${Tf}".log 2>&1 38 | done 39 | done 40 | 41 | exit 0 42 | 43 | nohup python3.6 -u distill.py \ 44 | --fixed_teacher $fixed_teacher \ 45 | --s_weight 0.05 \ 46 | --epoch_num 20 > d_2.log 2>&1 & 47 | 48 | nohup python3.6 -u distill.py \ 49 | --fixed_teacher $fixed_teacher \ 50 | --opt=Adam \ 51 | --LR=5e-5 \ 52 | --s_weight 0.05 \ 53 | --epoch_num 20 > d_3.log 2>&1 & 54 | -------------------------------------------------------------------------------- /example/distill/nlp/test_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | export LD_LIBRARY_PATH=/root/go/soft/env/cuda-9.0/lib64:/root/go/soft/cuda10-cudnn7.6.5.32/lib64:$LD_LIBRARY_PATH:/usr/lib64/:/usr/local/lib/ 18 | export CUDA_VISIBLE_DEVICES=7 19 | nohup python3.6 -u train.py > train_with_test.log 2>&1 & 20 | -------------------------------------------------------------------------------- /example/distill/qps_tools/distill_reader_qps.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import numpy as np 16 | import datetime 17 | import time 18 | 19 | from paddle_edl.distill.distill_reader import DistillReader 20 | from parse_config import get_ins_predicts 21 | 22 | # Returns a generator function that yields sample_num tuples of random arrays, one array per (shape, dtype) pair. 23 | def sample_reader(shapes, dtypes, sample_num=1 << 12): 24 | def __reader_impl__(): 25 | for _ in range(sample_num): 26 | sample = tuple() 27 | for shape, dtype in zip(shapes, dtypes): 28 | sample += (np.random.random(shape).astype(dtype), ) 29 | yield sample 30 | 31 | return __reader_impl__ 32 | 33 | # Iterates over reader() and prints the measured throughput (steps per second) after every 1000 steps. 34 | def qps(reader): 35 | pre_t = time.time() 36 | for step, _ in enumerate(reader()): 37 | if (step + 1) % 1000 == 0: 38 | now = datetime.datetime.now() 39 | t = time.time() 40 | print('{}, step={}, qps={} step/s'.format(now, step + 1, 1000.0 / ( 41 | t - pre_t))) 42 | pre_t = t 43 | 44 | # Reads feed/fetch metadata via get_ins_predicts(), wraps a random sample generator in a DistillReader and measures its throughput. 45 | def main(args): 46 | ins, ins_shape, ins_dtype, predicts = get_ins_predicts() 47 | print('{}, {}, {}, {}'.format(ins, ins_shape, ins_dtype, predicts)) 48 | 49 | reader = sample_reader(ins_shape, ins_dtype, 1 << 12) 50 | 51 | dr = DistillReader(ins=ins, predicts=predicts) 52 | dr.set_teacher_batch_size(args.teacher_bs) 53 | #dr.set_fixed_teacher(['10.255.100.13:9494']) 54 | distill_reader = dr.set_sample_generator(reader) 55 | 56 | qps(distill_reader) 57 | 58 | 59 | if __name__ == '__main__': 60 | import argparse 61 | parser = argparse.ArgumentParser(description='qps test') 62 | parser.add_argument( 63 | '--teacher_bs', 64 | type=int, 65 | default=1, 66 | help='teacher batch_size [default: %(default)s]') 67 | args = parser.parse_args() 68 | main(args) 69 | -------------------------------------------------------------------------------- /example/distill/qps_tools/parse_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | from paddle_serving_client import Client as ServingClient 17 | 18 | 19 | def get_ins_predicts(conf_file=None): 20 | """Load a serving client config and return (feed names, feed shapes, feed dtypes, fetch names). May be deprecated in the future.""" 21 | client_types = ['int64', 'float32'] # dtype names, indexed below by the integer codes stored in client.feed_types_ 22 | # Config file resolution order: an existing explicit conf_file, else ./serving_conf/serving_client_conf.prototxt, else $PADDLE_DISTILL_CONF_FILE. 23 | if conf_file is not None and os.path.isfile(conf_file): 24 | conf_file = conf_file 25 | elif os.path.isfile('./serving_conf/serving_client_conf.prototxt'): 26 | conf_file = './serving_conf/serving_client_conf.prototxt' 27 | else: 28 | conf_file = os.getenv('PADDLE_DISTILL_CONF_FILE') 29 | assert conf_file is not None 30 | assert os.path.isfile(conf_file) 31 | 32 | client = ServingClient() 33 | client.load_client_config(conf_file) 34 | # Collect the shape (as a tuple) and dtype name of every feed variable declared by the config. 35 | feeds = client.get_feed_names() 36 | feeds_shapes = [] 37 | feeds_dtype = [] 38 | for feed_name in feeds: 39 | shape = client.feed_shapes_[feed_name] 40 | feeds_shapes.append(tuple(shape)) 41 | feeds_dtype.append(client_types[client.feed_types_[feed_name]]) 42 | 43 | predicts = client.get_fetch_names() 44 | return feeds, feeds_shapes, feeds_dtype, predicts 45 | -------------------------------------------------------------------------------- /example/distill/qps_tools/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # local test qps 18 | export PADDLE_DISTILL_BALANCE_SERVER='10.255.100.13:9379' 19 | export PADDLE_DISTILL_SERVICE_NAME=MnistDistill 20 | export PADDLE_DISTILL_MAX_TEACHER=1 21 | export PADDLE_DISTILL_CONF_FILE="$PWD/../reader_demo/serving_conf/serving_client_conf.prototxt" 22 | 23 | batch_size=(1 2 4 8 16 24 32) 24 | for x in ${batch_size[@]}; do 25 | echo "-------- batch_size=$x ---------" 26 | python distill_reader_qps.py --teacher_bs $x 27 | echo 28 | done 29 | -------------------------------------------------------------------------------- /example/distill/reader_demo/run_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -eu 18 | 19 | if [ ! 
-f mnist_cnn_model.tar.gz ]; then 20 | wget --no-check-certificate https://paddle-edl.bj.bcebos.com/distill_teacher_model/mnist_cnn_model.tar.gz 21 | fi 22 | tar -zxf mnist_cnn_model.tar.gz 23 | 24 | # at gpu 0, start paddle serving server on port 9292 25 | port=9292 26 | nohup python -m paddle_serving_server_gpu.serve \ 27 | --model mnist_cnn_model \ 28 | --thread 4 \ 29 | --port ${port} \ 30 | --mem_optim True \ 31 | --gpu_ids 0 & 32 | serving_pid=$! 33 | 34 | python distill_reader_demo.py --distill_teachers 127.0.0.1:${port} 35 | 36 | # kill serving server 37 | pstree -p ${serving_pid} | awk -F"[()]" '{print $2}'| xargs kill -9 38 | -------------------------------------------------------------------------------- /example/distill/resnet/README.md: -------------------------------------------------------------------------------- 1 | # ResNeXt101_32x16d_wsl distill ResNet50_vd 2 | 3 | ## Local test 4 | ### start local teacher 5 | start ResNeXt101_32x16d_wsl teacher on gpu 1 6 | ``` bash 7 | bash ./scripts/start_local_teacher.sh 8 | ``` 9 | ### train student with local teacher 10 | At another terminal, train resnet50_vd student on gpu 0. 11 | ``` bash 12 | bash ./scripts/train_student.sh 13 | ``` 14 | -------------------------------------------------------------------------------- /example/distill/resnet/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .vgg import VGG11, VGG13, VGG16, VGG19 16 | from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 17 | from .resnet_vd import ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd, ResNet200_vd 18 | -------------------------------------------------------------------------------- /example/distill/resnet/scripts/start_local_teacher.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -eu 18 | 19 | if [ ! -f ResNeXt101_32x16d_wsl_model.tar.gz ]; then 20 | wget --no-check-certificate https://paddle-edl.bj.bcebos.com/distill_teacher_model/ResNeXt101_32x16d_wsl_model.tar.gz 21 | fi 22 | tar -zxf ResNeXt101_32x16d_wsl_model.tar.gz 23 | 24 | port=9898 25 | python -m paddle_serving_server_gpu.serve \ 26 | --model ResNeXt101_32x16d_wsl_model \ 27 | --thread 4 \ 28 | --port ${port} \ 29 | --mem_optim True \ 30 | --gpu_ids 1 31 | -------------------------------------------------------------------------------- /example/distill/resnet/scripts/train_student.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Unset proxy 18 | unset https_proxy http_proxy 19 | 20 | export GLOG_v=1 21 | export GLOG_logtostderr=1 22 | export FLAGS_eager_delete_tensor_gb=0 23 | export NCCL_DEBUG=INFO 24 | 25 | python -m paddle.distributed.launch --selected_gpus 0 \ 26 | ./train_with_fleet.py \ 27 | --model=ResNet50_vd \ 28 | --data_dir=./ImageNet \ 29 | --lr_strategy=cosine_warmup_decay \ 30 | --use_distill_service=True \ 31 | --distill_teachers=127.0.0.1:9898 32 | -------------------------------------------------------------------------------- /example/distill/resnet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /example/fit_a_line/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM paddlepaddle/paddlecloud-job:0.11.0 2 | RUN mkdir -p /data/recordio/imikolov && \ 3 | python -c "import paddle; import paddle.v2.dataset as dataset; word_dict = dataset.imikolov.build_dict(); \ 4 | dataset.imikolov.train(word_dict, 5); dataset.imikolov.test(word_dict, 5); \ 5 | dataset.common.convert('/data/recordio/imikolov/', dataset.imikolov.train(word_dict, 5), 5000, 'imikolov-train')" 6 | 7 | RUN mkdir -p /workspace 8 | ADD train_ft.py /workspace 9 | -------------------------------------------------------------------------------- /example/fit_a_line/collector.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/collector.pyc -------------------------------------------------------------------------------- /example/fit_a_line/del_jobs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | function delete_job() { 18 | jobname=$1 19 | if [[ "$jobname" == "" ]]; then 20 | echo "Usage: sh edl_jobs.sh [all|]" 21 | exit 0 22 | fi 23 | kubectl delete trainingjob $jobname 24 | kubectl delete job $jobname-trainer 25 | kubectl delete rs $jobname-master $jobname-pserver 26 | } 27 | 28 | function delete_all() { 29 | jobs=$(kubectl get trainingjob | tail -n +2 | awk '{print $1}') 30 | for job in ${jobs[@]} 31 | do 32 | delete_job $job 33 | done 34 | } 35 | 36 | case "$1" in 37 | all) 38 | delete_all 39 | ;; 40 | *) 41 | delete_job $1 42 | ;; 43 | esac 44 | -------------------------------------------------------------------------------- /example/fit_a_line/examplejob.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: paddlepaddle.org/v1 2 | kind: TrainingJob 3 | metadata: 4 | name: example 5 | spec: 6 | image: "paddlepaddle/edl-example" 7 | port: 7164 8 | ports_num: 1 9 | ports_num_for_sparse: 1 10 | fault_tolerant: true 11 | trainer: 12 | entrypoint: "python /workspace/train_ft.py" 13 | workspace: "/workspace" 14 | passes: 50 15 | min-instance: 2 16 | max-instance: 10 17 | resources: 18 | limits: 19 | #alpha.kubernetes.io/nvidia-gpu: 1 20 | cpu: "200m" 21 | memory: "200Mi" 22 | requests: 23 | cpu: "200m" 24 | memory: "200Mi" 25 | pserver: 26 | min-instance: 2 27 | max-instance: 2 28 | resources: 29 | limits: 30 | cpu: "800m" 31 | memory: "1Gi" 32 | requests: 33 | cpu: "500m" 34 | memory: "600Mi" 35 | master: 36 | resources: 37 | limits: 38 | cpu: "1" 39 | memory: "1Gi" 40 | requests: 41 | cpu: "500m" 42 | memory: "600Mi" 43 | -------------------------------------------------------------------------------- /example/fit_a_line/fluid/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import cPickle 17 | import paddle 18 | import glob 19 | 20 | 21 | def prepare_dataset(output_path, name_prefix, reader_func, sample_count=128): 22 | if not os.path.exists(output_path): 23 | os.makedirs(output_path) 24 | 25 | suffix = "%s/%s-%%05d.pickle" % (output_path, name_prefix) 26 | lines = [] 27 | indx_f = 0 28 | for i, d in enumerate(reader_func()): 29 | lines.append(d) 30 | if i >= sample_count and i % sample_count == 0: 31 | with open(suffix % indx_f, "w") as f: 32 | cPickle.dump(lines, f) 33 | lines = [] 34 | indx_f += 1 35 | if lines: 36 | with open(suffix % indx_f, "w") as f: 37 | cPickle.dump(lines, f) 38 | 39 | 40 | def cluster_reader(files_path, trainers, trainer_id): 41 | def reader(): 42 | flist = glob.glob(files_path) 43 | flist.sort() 44 | my_file_list = [] 45 | for idx, fn in enumerate(flist): 46 | if idx % trainers == trainer_id: 47 | print("append file for current trainer: %s" % fn) 48 | my_file_list.append(fn) 49 | 50 | for fn in my_file_list: 51 | print("processing file: ", fn) 52 | with open(fn, "r") as f: 53 | lines = cPickle.load(f) 54 | for line in lines: 55 | yield line 56 | 57 | return reader 58 | -------------------------------------------------------------------------------- /example/fit_a_line/fluid/image/infer_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/fluid/image/infer_3.png -------------------------------------------------------------------------------- /example/fit_a_line/fluid/image/ranges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/example/fit_a_line/fluid/image/ranges.png -------------------------------------------------------------------------------- /example/fit_a_line/nginx.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: nginx-deployment 5 | spec: 6 | selector: 7 | matchLabels: 8 | app: nginx 9 | replicas: 5 # tells deployment to run 5 pods matching the template 10 | template: 11 | metadata: 12 | labels: 13 | app: nginx 14 | spec: 15 | containers: 16 | - name: nginx 17 | image: nginx:1.7.9 18 | ports: 19 | - containerPort: 80 20 | resources: 21 | limits: 22 | cpu: "600m" 23 | memory: "200Mi" 24 | requests: 25 | cpu: "400m" 26 | memory: "100Mi" 27 | -------------------------------------------------------------------------------- /k8s/edl_controller.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: training-job-controller 5 | namespace: paddlecloud 6 | spec: 7 | replicas: 1 8 | template: 9 | metadata: 10 | labels: 11 | name: training-job-controller 12 | spec: 13 | containers: 14 | - name: training-job-controller 15 | image: yancey1989/edl-controller 16 | env: 17 | - name: https_proxy 18 | value: "" 19 | - name: http_proxy 20 | value: "" 21 | command: ["/usr/local/bin/edl", "-logtostderr", "-log_level", "debug", "-max_load_desired", "0.9"] 22 | 
-------------------------------------------------------------------------------- /k8s/rbac_admin.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1alpha1 3 | metadata: 4 | name: cluster-admin--default-system:default 5 | roleRef: 6 | apiGroup: rbac.authorization.k8s.io 7 | kind: ClusterRole 8 | name: cluster-admin 9 | subjects: 10 | - kind: ServiceAccount 11 | name: default 12 | namespace: default 13 | --- 14 | kind: ClusterRoleBinding 15 | apiVersion: rbac.authorization.k8s.io/v1alpha1 16 | metadata: 17 | name: cluster-admin--paddlecloud-system:default 18 | roleRef: 19 | apiGroup: rbac.authorization.k8s.io 20 | kind: ClusterRole 21 | name: cluster-admin 22 | subjects: 23 | - kind: ServiceAccount 24 | name: default 25 | namespace: paddlecloud 26 | -------------------------------------------------------------------------------- /k8s/thirdpartyresource.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: ThirdPartyResource 3 | metadata: 4 | name: training-job.paddlepaddle.org 5 | description: "PaddlePaddle TrainingJob operator" 6 | versions: 7 | - name: v1 8 | -------------------------------------------------------------------------------- /logo/edl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/logo/edl.png -------------------------------------------------------------------------------- /logo/paddle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elasticdeeplearning/edl/f065ec02bb27a67c80466103e298bd6f37494048/logo/paddle.png -------------------------------------------------------------------------------- /python/CMakeLists.txt: 
# Files whose modification re-triggers the wheel build below.
# NOTE(review): these patterns are resolved relative to this directory, while
# the package itself is copied from ./edl -- confirm they match the intended files.
file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh discovery/*.py distill/*.py distill/redis/*.py setup.py)
# Generate setup.py in the build tree from the setup.py.in template.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
# Copy the edl package into ${EDL_BINARY_DIR}/python/ and build a universal
# wheel with python3.6.
# NOTE(review): neither command visibly creates ${EDL_BINARY_DIR}/.timestamp,
# so this rule may re-run on every build -- confirm whether that is intended.
add_custom_command(
  OUTPUT ${EDL_BINARY_DIR}/.timestamp
  COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
  COMMAND python3.6 ./setup.py bdist_wheel --universal
  DEPENDS ${EDL_FILES})
# Build the wheel as part of the default (ALL) target.
add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
add_subdirectory(edl/tests/unittests)
class FileSplitter(object):
    """
    Interface that users inherit from to describe how one data file is split
    into records; it lets the framework know which data file is being
    processed.
    TxtFileSplitter is an example implementation.
    """

    def __init__(self, data_file):
        self._data_file = data_file

    def __iter__(self):
        """
        Subclasses yield (idx, record data) pairs.
        """
        raise NotImplementedError()


class TxtFileSplitter(FileSplitter):
    """
    Splits a text file into one record per non-blank line.
    """

    def __init__(self, data_file):
        super(TxtFileSplitter, self).__init__(data_file)

    def __iter__(self):
        """
        Yield (idx, line) for every line that is non-empty after stripping
        surrounding whitespace; idx counts the yielded records from 1.
        """
        with open(self._data_file, "r") as src:
            stripped = (raw.strip() for raw in src)
            # Blank lines are dropped before numbering, so they consume no idx.
            for record_no, text in enumerate((s for s in stripped if s), 1):
                yield record_no, text
def main():
    """
    Entry point of the collective launcher.

    Parses the command line, builds the job environment, connects to the
    job's etcd store and exits with status 0 if the stored job status is
    already SUCCEED; otherwise creates the local pod and hands over to the
    Launcher.
    """
    # 10 is the numeric value of logging.DEBUG -- presumably used as the
    # logging level; confirm in log_utils.
    log_utils.get_logger(log_level=10)
    parsed_args = args_utils.parse_args()

    # job environment.
    job_env = edl_env.JobEnv(args_utils.convert_args_to_dict(parsed_args))
    logger.info("get job env:{}".format(str(job_env)))

    # get global etcd and lock
    store = etcd_db.get_global_etcd(job_env.etcd_endpoints, job_env.job_id)

    # A job that already succeeded must not be started again.
    if edl_status.load_job_status_from_etcd(store, timeout=30) == edl_status.Status.SUCCEED:
        logger.info("job:{} has completed! Need't try!".format(job_env.job_id))
        sys.exit(0)

    # local pod, and the pod's id doesn't change.
    local_pod = edl_pod.Pod()
    local_pod.from_env(job_env)

    job_launcher = edl_launcher.Launcher(
        job_env=job_env, pod=local_pod, etcd=store, args=parsed_args
    )
    job_launcher.init()
    job_launcher.launch()
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from edl.utils.json_serializable import SerializableBase 16 | 17 | __all__ = ["SerializableBase"] 18 | -------------------------------------------------------------------------------- /python/edl/discovery/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /python/edl/discovery/server_alive.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def is_server_alive(server):
    """ Probe whether a TCP server accepts connections.
    server: "ip:port" string
    return alive, client_addr
    client_addr is the local address of the probing socket, or None when
    the connection could not be established.
    """
    host, port_text = server.split(":")
    local_addr = None
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.settimeout(1.5)
        probe.connect((host, int(port_text)))
        local_addr = probe.getsockname()
        probe.shutdown(socket.SHUT_RDWR)
    except socket.error:
        # Any socket failure (refused, timed out, ...) means "not alive".
        return False, local_addr
    finally:
        probe.close()
    return True, local_addr
14 | -------------------------------------------------------------------------------- /python/edl/distill/redis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /python/edl/distill/redis/redis_store.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class RedisStore(object):
    """Service registry kept in Redis.

    Every server of a service is stored as a hash under the key
    ``/service/<service_name>/nodes/<server>`` with the fields ``server``
    and ``info``. Each key is given a TTL, so it has to be refreshed
    periodically to stay registered.
    """

    def __init__(self, ip="127.0.0.1", port=6379, passwd=None):
        """Create the Redis client for ``ip:port`` (optionally with a password)."""
        self._ip = ip
        self._port = port
        self._passwd = passwd
        self._redis = redis.Redis(
            host=ip, port=port, password=passwd, decode_responses=True
        )
        print("connected to redis ip:{} port:{}".format(ip, port))

    def get_service(self, service_name):
        """Return the stored hash of every registered server of ``service_name``."""
        return [
            self._redis.hgetall(key)
            for key in self._redis.scan_iter(
                "/service/{}/nodes/*".format(service_name)
            )
        ]

    def remove_service(self, service_name):
        """Delete every key stored below ``/service/<service_name>/``."""
        for key in self._redis.scan_iter("/service/{}/*".format(service_name)):
            self._redis.delete(key)

    def set_server(self, service_name, server, info, ttl=6):
        """Register (or overwrite) ``server`` with ``info``; the key expires after ``ttl``."""
        server_info = {"server": server, "info": info}
        key = "/service/{}/nodes/{}".format(service_name, server)
        self._redis.hmset(key, server_info)
        self._redis.expire(key, ttl)

    def remove_server(self, service_name, server):
        """Delete the registration of ``server``."""
        self._redis.delete("/service/{}/nodes/{}".format(service_name, server))

    def refresh(self, service_name, server, info=None, ttl=6):
        """Extend the registration of ``server``.

        With ``info`` given, the entry is rewritten through ``set_server``
        and True is returned. Without ``info``, only the expiry of an
        existing entry is renewed.

        Returns:
            True if the entry was written or renewed, False if Redis
            reports a negative TTL for the key (per the Redis TTL command
            this means the key is missing or has no expiry).
        """
        if info is not None:
            # Bug fix: the bound method already receives ``self``; passing it
            # again raised TypeError on every refresh that carried ``info``.
            self.set_server(service_name, server, info, ttl)
            return True
        key = "/service/{}/nodes/{}".format(service_name, server)
        remaining = self._redis.ttl(key)
        if remaining < 0:
            return False
        self._redis.expire(key, ttl)
        return True

    def get_client(self, client):
        # Todo
        pass

    def set_client(self, client, service_name):
        # Todo
        pass
class _NopTimeLine(object):
    """Timeline stand-in whose methods do nothing."""

    def record(self, name):
        return None

    def reset(self):
        return None


class _RealTimeLine(object):
    """Writes the time elapsed between consecutive marks to stderr."""

    def __init__(self):
        self.pid = os.getpid()
        self.time = time.time()

    def record(self, name):
        """Report the milliseconds since the previous mark as op `name`, then restart the clock."""
        now = time.time()
        elapsed_ms = (now - self.time) * 1000
        sys.stderr.write(
            "pid={} op={} time={}ms\n".format(self.pid, name, elapsed_ms)
        )
        self.time = now

    def reset(self):
        """Restart the clock without reporting anything."""
        self.time = time.time()


# Profiling is enabled when DISTILL_READER_PROFILE is set to a non-zero integer.
_is_profile = int(os.environ.get("DISTILL_READER_PROFILE", 0))
if _is_profile:
    _TimeLine = _RealTimeLine
else:
    _TimeLine = _NopTimeLine
def download_hdfs_file(model_name, dst_path):
    """
    Download the serving config of a teacher model from HDFS.

    model_name: teacher model name
    dst_path: dst directory name

    The HDFS location and credentials are read from the environment
    variables PADDLE_DISTILL_HDFS_NAME, PADDLE_DISTILL_HDFS_UGI and
    PADDLE_DISTILL_HDFS_PATH; each one must be set and non-empty.
    """
    name = os.getenv("PADDLE_DISTILL_HDFS_NAME")
    ugi = os.getenv("PADDLE_DISTILL_HDFS_UGI")
    root = os.getenv("PADDLE_DISTILL_HDFS_PATH")
    assert name, "hdfs_name must be set"
    assert ugi, "hdfs_ugi must be set"
    assert root, "hdfs_path must be set"

    client = BDFS(name, ugi)

    # Only <hdfs_path>/<model_name>/serving_server_conf.prototxt is fetched.
    remote_conf = "/".join((root, model_name, "serving_server_conf.prototxt"))
    client.download(remote_conf, dst_path)
def launch():
    """
    Run the elastic training loop of this node until it finishes.

    Each round waits for the nodes to be ready, runs this node through
    LauncherInterface and then watches the outcome:
    COMPLETED / EXIT leave the loop, HOLD (and any unlisted status) starts
    another round, ERROR exits with status 3 and RESTART exits with
    ELASTIC_EXIT_CODE. After the loop the process exits with
    128 + elastic.sigint when that value is positive, otherwise with 0.
    """
    # user interface for launching the pserver.
    # launch_ps()
    # return

    manager = ElasticManager()

    # Route termination signals to the manager.
    for signum in (signal.SIGTERM, signal.SIGABRT, signal.SIGINT):
        signal.signal(signum, manager.signal_handler)

    while True:
        # wait for all nodes ready to run
        manager.wait()

        # run self with specified launcher
        manager.run(LauncherInterface)

        # keep watching the health status of self and being notified for other's failure
        status = manager.watch()
        if status == ElasticStatus.COMPLETED:
            break
        elif status == ElasticStatus.HOLD:
            continue
        elif status == ElasticStatus.EXIT:
            break
        elif status == ElasticStatus.ERROR:
            sys.exit(3)
        elif status == ElasticStatus.RESTART:
            sys.exit(ELASTIC_EXIT_CODE)

    exit_code = 128 + int(manager.sigint) if int(manager.sigint) > 0 else 0
    sys.exit(exit_code)
14 | 15 | syntax = "proto3"; 16 | 17 | option go_package = ".;masterpb"; 18 | package common; 19 | 20 | message Status { 21 | string type = 1; 22 | string detail = 2; 23 | } 24 | 25 | message EmptyRet { Status status = 1; } 26 | -------------------------------------------------------------------------------- /python/edl/protos/data_server.proto: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
syntax = "proto3";

import "common.proto";

option go_package = ".;masterpb";
package data_server;

// Request messages without any fields.
message ShutDownRequest {}
message EmptyRequest {}

// One data file: its index and its path.
message FileListElement {
  int64 idx = 1;
  string path = 2;
}

// Asks for the file list of reader `reader_name` on behalf of pod `pod_id`.
message FileListRequest {
  string pod_id = 1;
  string reader_name = 2;
  // used for verify
  repeated FileListElement file_list = 3;
}

// Status plus the returned file list.
message FileListResponse {
  common.Status status = 1;
  repeated FileListElement file_list = 2;
}

// One record: its number and its fields as raw bytes.
message Record {
  int64 record_no = 1;
  repeated bytes field_data = 2;
}

// A batch of records identified by `batch_data_id`.
message BatchData {
  string batch_data_id = 1;
  repeated Record records = 2;
}

// Describes a set of batches by id together with the producer pod, the
// consumer pod and a data server endpoint.
// NOTE(review): presumably the endpoint is where the batches can be fetched
// from -- confirm against the data server implementation.
message BatchDataMeta {
  string reader_name = 1;
  string producer_pod_id = 2;
  string consumer_pod_id = 3;
  string data_server_endpoint = 4;

  repeated string batch_data_ids = 5;
}

// Reports the batch ids held by pod `pod_id` at `data_server_endpoint`.
message ReportBatchDataMetaRequest {
  string reader_name = 1;
  string pod_id = 2;
  string data_server_endpoint = 3;

  repeated string batch_data_ids = 4;
}

// Asks for batch metadata for pod `pod_id` of reader `reader_name`.
message GetBatchDataMetaRequest {
  string reader_name = 1;
  string pod_id = 2;
}

// Sent by pod `pod_id` for reader `reader_name` when it reaches the end of its data.
message ReachDataEndRequest {
  string reader_name = 1;
  string pod_id = 2;
}

// Status plus the returned batch metadata.
message BatchDataMetaResponse {
  common.Status status = 1;
  repeated BatchDataMeta data = 2;
}

// Status plus the returned batches.
message BatchDataResponse {
  common.Status status = 1;
  repeated BatchData data = 2;
}

service DataServer {
  // only leader can do this
  rpc ReportBatchDataMeta(ReportBatchDataMetaRequest)
      returns (common.EmptyRet) {}
  rpc ReachDataEnd(ReachDataEndRequest) returns (common.EmptyRet) {}
  rpc GetBatchDataMeta(GetBatchDataMetaRequest)
      returns (BatchDataMetaResponse) {}

  // all data servers can do this
  rpc GetFileList(FileListRequest) returns (FileListResponse) {}
  rpc GetBatchData(BatchDataMeta) returns (BatchDataResponse) {}
}
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

import "common.proto";

option go_package = ".;masterpb";
package pod_server;

// Identifies the calling pod and the job it belongs to.
message BarrierRequest {
  string job_id = 1;
  string pod_id = 2;
}

// Status plus the cluster description encoded as a JSON string.
message BarrierResponse {
  common.Status status = 1;
  string cluster_json = 2;
}

// NOTE(review): `num` is presumably the number of pods to remove -- confirm.
message ScaleInRequest { int32 num = 1; }

message ScaleOutRequest {}

service PodServer {
  // Barrier call made by a pod; the response carries the cluster as JSON.
  rpc Barrier(BarrierRequest) returns (BarrierResponse) {}

  // Cluster controller -> master
  rpc ScaleOut(ScaleOutRequest) returns (common.Status) {}
  rpc ScaleIn(ScaleInRequest) returns (common.Status) {}
}
"""Runs protoc with the gRPC plugin to generate messages and gRPC stubs.

Every .proto file is compiled even if an earlier one fails; the script then
exits with a non-zero status if any protoc invocation reported an error, so
that callers such as generate.sh (which runs with `set -e`) stop instead of
post-processing missing or stale files.
"""

from grpc_tools import protoc
import pkg_resources
import sys

print("run code gen python verion:", sys.version_info)

# Directory passed as an extra include path for distill_discovery.proto.
proto_include = pkg_resources.resource_filename("grpc_tools", "_proto")

# (proto file, extra include flags). Each entry is equivalent to:
# python -m grpc_tools.protoc -I. [extra flags] --python_out=. --grpc_python_out=. <proto file>
_PROTO_JOBS = (
    ("common.proto", ()),
    ("pod_server.proto", ()),
    ("data_server.proto", ()),
    ("distill_discovery.proto", ("-I{}".format(proto_include),)),
)

_failed = []
for _proto_file, _extra_includes in _PROTO_JOBS:
    _args = (
        ("", "-I.")
        + tuple(_extra_includes)
        + ("--python_out=.", "--grpc_python_out=.", _proto_file)
    )
    # protoc.main returns protoc's exit status; it used to be discarded.
    _exit_code = protoc.main(_args)
    if _exit_code:
        _failed.append((_proto_file, _exit_code))

if _failed:
    for _proto_file, _exit_code in _failed:
        sys.stderr.write(
            "protoc failed for {} (exit code {})\n".format(_proto_file, _exit_code)
        )
    sys.exit(1)
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /python/edl/tests/unittests/data_server/a.txt: -------------------------------------------------------------------------------- 1 | a0 2 | a1 3 | a2 4 | a3 5 | -------------------------------------------------------------------------------- /python/edl/tests/unittests/data_server/b.txt: -------------------------------------------------------------------------------- 1 | b0 2 | b1 3 | b2 4 | -------------------------------------------------------------------------------- /python/edl/tests/unittests/del_from_etcd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
"""Remove every key that belongs to the current job from etcd.

Required environment:
    PADDLE_JOB_ID          used as the etcd root for the job
    PADDLE_ETCD_ENDPOINTS  etcd endpoint(s) to connect to
"""

import os
from edl.utils import constants
from edl.discovery import etcd_client

# Fallback endpoint, used only when PADDLE_ETCD_ENDPOINTS is set but empty.
g_etcd_endpoints = "127.0.0.1:2379"

job_id = os.environ["PADDLE_JOB_ID"]
etcd_endpoints = os.environ["PADDLE_ETCD_ENDPOINTS"]

# The endpoints variable was previously read and then ignored in favour of the
# hard-coded default, so the cleanup could hit a different etcd than the job.
# NOTE(review): assumes multiple endpoints are comma-separated -- confirm
# against how the launcher parses PADDLE_ETCD_ENDPOINTS.
endpoint_list = [ep.strip() for ep in etcd_endpoints.split(",") if ep.strip()]

etcd = etcd_client.EtcdClient(endpoint_list or [g_etcd_endpoints], root=job_id)
etcd.init()
constants.clean_etcd(etcd)
import edl.distill.distill_reader as distill_reader
import numpy as np
import unittest


class TestDistillReader(unittest.TestCase):
    """Drives a DistillReader over a synthetic sample generator for 300 epochs."""

    def test_distill_reader(self):
        # temp local test
        # NOTE(review): presumably this flag makes the distill worker skip the
        # real teacher prediction -- confirm in distill_worker.
        distill_reader.distill_worker._NOP_PREDICT_TEST = True

        # test mnist distill reader
        def _reader():
            # One float32 image of shape (1, 28, 28) and one int64 label,
            # shared by every generated sample.
            img = np.array(
                [(i + 1) / 28.0 for i in range(28)] * 28, dtype=np.float32
            ).reshape((1, 28, 28))
            label = np.array([100], dtype=np.int64)
            # 24 sample lists of length 8, then one shorter list of length 2.
            for i in range(24):
                yield 8 * [(img, label)]
            yield 2 * [(img, label)]

        dr = distill_reader.DistillReader(ins=["img", "label"], predicts=["prediction"])
        dr.set_teacher_batch_size(4)
        dr.set_fixed_teacher(["127.0.0.1:9292", "127.0.0.1:9293"])
        # dr.set_dynamic_teacher(['127.0.0.1:7001'], 'DistillReaderTest', 3)

        train_reader = dr.set_sample_list_generator(_reader)
        dr.print_config()

        for epoch in range(300):
            for step, batch in enumerate(train_reader()):
                # Print the configuration once more after the first batch arrives.
                if epoch == 0 and step == 0:
                    dr.print_config()
            # NOTE(review): indentation was lost in the dump; this block is taken
            # to run once per epoch on the last batch -- confirm against the repo.
            if epoch % 10 == 0:
                print(
                    "^^^^^^^^^^^^^ epoch={} predict[0][0]={}^^^^^^^^^^^^^^".format(
                        epoch, batch[-1][-1][0]
                    )
                )


if __name__ == "__main__":
    unittest.main()
"""Tiny demo program: prints the pod id followed by a fixed marker."""
import os

# An unset PADDLE_POD_ID yields an empty prefix, so only the marker is printed.
pod_id = os.environ.get("PADDLE_POD_ID", "")
marker_line = pod_id + "__edl_demo__"
print(marker_line)
# Runs a single Python unit test (${TEST_TARGET_NAME}.py) under a hard timeout
# and dumps the matching log files when the test fails.
#
# Required environment:
#   TEST_TARGET_NAME   test module name without the ".py" suffix
#   TEST_TIMEOUT       overall time budget in seconds
# Optional environment:
#   PYTHON_EXECUTABLE  interpreter to run the test with (defaults to "python")
#
# Exit status: 0 when the test passes, 1 otherwise.

name=${TEST_TARGET_NAME}
TEST_TIMEOUT=${TEST_TIMEOUT}

# Name the missing variable literally; expanding it would print an empty string.
if [[ -z "${name}" ]]; then
    echo "can't find TEST_TARGET_NAME, please set TEST_TARGET_NAME first"
    exit 1
fi

if [[ -z "${TEST_TIMEOUT}" ]]; then
    echo "can't find TEST_TIMEOUT, please set TEST_TIMEOUT first"
    exit 1
fi

python_bin=${PYTHON_EXECUTABLE:-python}

# rm flag file
rm -f "${name}"_*.log

# start the unit test
# Stop 10 seconds before the outer budget so the logs below can still be printed.
run_time=$(( TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"

timeout -s SIGKILL "${run_time}" "${python_bin}" -u "${name}.py" > "${name}_run.log" 2>&1
exit_code=$?

if [[ ${exit_code} -eq 0 ]]; then
    echo "${name} succeeded"
    exit 0
fi

echo "${name} failed with ${exit_code}"

echo "${name} log"
for log in ./"${name}"_*.log
do
    printf "\ncat %s\n" "${log}"
    cat -n "${log}"
done

exit 1
import copy
import edl.utils.constants as constants
import edl.utils.log_utils as log_utils
import os
import unittest
from edl.discovery.etcd_client import EtcdClient
from edl.utils import env as edl_env

g_etcd_endpoints = "127.0.0.1:2379"


class EtcdTestBase(unittest.TestCase):
    """Shared fixture for tests that need an etcd client and a job environment.

    Subclasses override ``setUp`` and call ``super().setUp(job_id)``.  The
    fixture exposes ``self._etcd`` (client rooted at ``job_id``) and
    ``self._job_env`` (built from the environment variables set here), and
    restores ``os.environ`` afterwards.
    """

    def setUp(self, job_id):
        """Connect to etcd, install the test environment and clean the job keys.

        Args:
            job_id: used both as the etcd root and as ``PADDLE_JOB_ID``.
        """
        log_utils.get_logger(log_level=10)
        self._etcd = EtcdClient([g_etcd_endpoints], root=job_id)
        self._etcd.init()

        # Snapshot taken before any mutation so it can be restored verbatim.
        self._old_environ = dict(os.environ)
        proc_env = {
            "PADDLE_TRAINER_ID": "0",
            "PADDLE_RUNNING_PLATFORM": "PADDLE_CLOUD",
            "PADDLE_JOB_ID": job_id,
            "PADDLE_EDL_HDFS_HOME": "/usr/local/hadoop-2.7.7",
            "PADDLE_EDL_HDFS_NAME": "",
            "PADDLE_EDL_HDFS_UGI": "",
            "PADDLE_EDL_HDFS_PATH": "test_register_path",
            "PADDLE_EDL_ONLY_FOR_CE_TEST": "1",
            "PADDLE_EDL_FS_CACHE": ".test_register_cache",
            "PADDLE_EDL_SAVE_CHECKPOINT_INTER": "0",
            "PADDLE_EDL_NODES_RANGE": "1:4",
            "PADDLE_EDL_NPROC_PERNODE": "1",
            "PADDLE_ETCD_ENDPOINTS": "127.0.0.1:2379",
            "PADDLE_EDLNODES_RANAGE": "2:2",
            "CUDA_VISIBLE_DEVICES": "0",
            "PADDLE_TRAINER_PORTS": "6670",
        }
        try:
            os.environ.pop("https_proxy", None)
            os.environ.pop("http_proxy", None)
            os.environ.update(proc_env)

            self._job_env = edl_env.JobEnv(None)
            constants.clean_etcd(self._etcd)
        except Exception:
            # unittest does not call tearDown when setUp raises, so undo the
            # environment changes here to avoid leaking them into other tests.
            self._restore_environ()
            raise

    def _restore_environ(self):
        """Put ``os.environ`` back to the snapshot taken in ``setUp``."""
        os.environ.clear()
        os.environ.update(self._old_environ)

    def tearDown(self):
        """Restore the environment and remove the job's keys from etcd."""
        self._restore_environ()
        constants.clean_etcd(self._etcd)
"""Demo trainer that exits with the code given in PADDLE_DEMO_EXIT_CODE."""

import os
import sys

raw_exit_code = os.getenv("PADDLE_DEMO_EXIT_CODE")
if raw_exit_code is None:
    # Passing a string to sys.exit prints it to stderr and exits with status 1,
    # which replaces the opaque TypeError that int(None) used to raise.
    sys.exit("PADDLE_DEMO_EXIT_CODE is not set")

exit_code = int(raw_exit_code)
# Formatted as one string so Python 2 (no print_function import) does not
# print a tuple; the Python 3 output is unchanged.
print("exit code: {}".format(exit_code))
sys.exit(exit_code)
import os
# NOTE(review): every other test in this package imports from `edl`, not
# `paddle_edl` -- confirm this module path still exists.
import paddle_edl.utils.master_pb2 as master_pb2
import unittest
from edl.utils.master_client import Client
from edl.utils.utils import get_file_list, get_logger

os.environ["https_proxy"] = ""
os.environ["http_proxy"] = ""


class TestMasterClient(unittest.TestCase):
    """Exercises dataset registration against a master at 127.0.0.1:8080."""

    def setUp(self):
        self._client = Client("127.0.0.1:8080")

    def test_add_dataset(self):
        """The first add_dataset succeeds; repeating it reports a duplicate."""
        dataset = master_pb2.DataSet()
        dataset.name = "train"
        for t in get_file_list("./test_file_list.txt"):
            dataset.file_list.append(t[0])

        # unittest assertions are used instead of bare `assert` so the checks
        # still run under `python -O` and report the offending value.
        res = self._client.add_dataset(dataset)
        self.assertTrue(res is None or res.type == "", "must not any error")

        res = self._client.add_dataset(dataset)
        # Guard against None so a missing error fails as an assertion, not as
        # an AttributeError on `res.type`.
        self.assertIsNotNone(res, "must error")
        self.assertEqual(res.type, "DuplicateInitDataSet", "must error")


if __name__ == "__main__":
    logger = get_logger(10)
    unittest.main()
import unittest
from edl.tests.unittests import etcd_test_base
from edl.utils import cluster as edl_cluster


class TestCluster(etcd_test_base.EtcdTestBase):
    """JSON round-trip check for ``Cluster``."""

    def setUp(self):
        # "test_cluster" is handed to the base fixture as the job id.
        super(TestCluster, self).setUp("test_cluster")

    def test_cluster_basic(self):
        """A cluster rebuilt from another cluster's JSON compares equal to it."""
        source = edl_cluster.Cluster()
        restored = edl_cluster.Cluster()

        serialized = source.to_json()
        restored.from_json(serialized)

        self.assertEqual(source, restored)


if __name__ == "__main__":
    unittest.main()
import copy
import six
import unittest
from edl.discovery.consistent_hash import ConsistentHash


class TestConsistentHash(unittest.TestCase):
    """Checks how ConsistentHash spreads keys as nodes are removed and added."""

    def test_consistent_hash(self):
        nodes = ["127.0.0.1:1234", "127.0.0.1:2345", "127.0.0.1:3456"]
        sample_count = 10000
        node_to_count = {key: 0 for key in nodes}
        sample_to_node = dict()

        cs_hash = ConsistentHash(nodes)

        def hash_test(ip):
            """Map ``sample_count`` keys derived from ``ip`` and update the tallies."""
            for i in range(sample_count):
                key = "{}:{}".format(ip, i)
                node = cs_hash.get_node(key)
                # A key seen before first gives back its count on the old node.
                if key in sample_to_node:
                    node_to_count[sample_to_node[key]] -= 1
                sample_to_node[key] = node
                node_to_count[node] += 1

            for node, count in six.iteritems(node_to_count):
                print("node={}, count={}".format(node, count))

        hash_test("1.1.1.1")
        old_node_to_count = copy.deepcopy(node_to_count)
        for count in node_to_count.values():
            # test Balance
            self.assertGreater(count, 3000)

        # remove node
        print("\nremove node={}".format(nodes[1]))

        cs_hash.remove_node(nodes[1])
        hash_test("1.1.1.1")
        # test Monotonicity, remove
        self.assertEqual(0, node_to_count[nodes[1]])

        # recover node
        print("\nrecover node={}".format(nodes[1]))
        cs_hash.add_new_node(nodes[1])
        hash_test("1.1.1.1")
        # test Monotonicity, recover
        self.assertEqual(node_to_count, old_node_to_count)

        # add new node
        new_node = "8.8.8.8:8888"
        print("\nadd new node={}".format(new_node))
        nodes.append(new_node)
        node_to_count[new_node] = 0
        cs_hash.add_new_node(new_node)

        hash_test("8.8.8.8")
        # test Balance, Monotonicity
        self.assertLess(node_to_count[new_node], 3000)


if __name__ == "__main__":
    unittest.main()
import unittest
from edl.collective.data_reader import DistributedDataReader
from edl.collective.dataset import TxtFileSplitter


class TestDataReader(unittest.TestCase):
    """Reads the same file list through two DistributedDataReader instances."""

    def setUp(self):
        self._file_list = ["./data_server/a.txt", "./data_server/b.txt"]
        # file index -> list of (path, (rec_no, splitted_fields))
        self._data = {}
        for idx, p in enumerate(self._file_list):
            s = TxtFileSplitter(p)
            for r in s:
                d = ((p), (r[0], r[1:]))
                self._data.setdefault(idx, []).append(d)

    def _new_reader(self):
        """Build a reader over the fixture files with a batch size of 1."""
        return DistributedDataReader(
            file_list=self._file_list,
            file_splitter_cls=TxtFileSplitter,
            splitted_data_field=["line"],
            batch_size=1,
        )

    def _consume(self, reader):
        """Iterate ``reader``, check every batch's metadata, return the batch count."""
        size = 0
        for meta, batch in reader:
            # These checks used assertTrue(a, b), which treats ``b`` as the
            # failure message and therefore never compared the two values.
            self.assertEqual(meta._size, 1)
            for k, v in meta._batch:
                # presumably k._idx / k._path identify the source file --
                # verify against the batch metadata type.
                c = self._data[k._idx]
                self.assertEqual(c[0][0], k._path)
            # NOTE(review): indentation was lost in the dump; counting once per
            # batch is assumed -- confirm against the repo.
            size += 1
        return size

    def test_data_reader(self):
        """Both readers must yield the same number of batches."""
        # Both readers are created before either is consumed, as before.
        reader1 = self._new_reader()
        reader2 = self._new_reader()

        size1 = self._consume(reader1)
        size2 = self._consume(reader2)

        self.assertEqual(size1, size2)


if __name__ == "__main__":
    unittest.main()
# Starts etcd, the discovery register and the discovery server, then runs
# distill_reader_test.py against them.  The script exits with the test's own
# status and always kills the helper processes on the way out.

unset https_proxy http_proxy

# Skip under Python 3.  The interpreter is asked directly instead of comparing
# the "python --version" banner as a string.
if python -c 'import sys; sys.exit(0 if sys.version_info[0] >= 3 else 1)'; then
    echo "fix me under Python 3"
    exit 0
fi

cleanup() {
    # Unquoted on purpose: pids that were never assigned expand to nothing.
    # shellcheck disable=SC2086
    kill -9 ${discovery_pid} ${register_pid} ${etcd_pid} 2>/dev/null
}
# Runs on every exit path, including interrupts, so no helper is left behind.
trap cleanup EXIT

nohup etcd > test_distill_reader_etcd.log 2>&1 &
etcd_pid=$!

# wait etcd start
sleep 10

nohup python -m edl.discovery.register --service_name DistillReaderTest --server 127.0.0.1:2379 > run_discovery_register.log 2>&1 &
register_pid=$!

nohup python -m edl.distill.discovery_server > run_discovery_server.log 2>&1 &
discovery_pid=$!

# wait discovery start
sleep 5

export PADDLE_DISTILL_BALANCE_TYPE=etcd

export PADDLE_DISTILL_BALANCE_SERVER=127.0.0.1:7001
export PADDLE_DISTILL_SERVICE_NAME=DistillReaderTest
export PADDLE_DISTILL_MAX_TEACHER=4
python distill_reader_test.py
exit_code=$?

# Previously the script returned the status of the final kill, so a failing
# test was reported as a success.
exit "${exit_code}"
21 | 22 | unset https_proxy http_proxy 23 | python -u ./etcd_client_test.py 24 | 25 | set +e 26 | kill -9 $etcd_pid 27 | echo $etcd_pid 28 | -------------------------------------------------------------------------------- /python/edl/tests/unittests/test_file_list.txt: -------------------------------------------------------------------------------- 1 | data_server/a.txt 2 | data_server/b.txt 3 | -------------------------------------------------------------------------------- /python/edl/tests/unittests/test_launch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import print_function 16 | 17 | from edl.tests.unittests import etcd_test_base 18 | from edl.utils import status as edl_status 19 | from edl.utils.log_utils import logger 20 | from edl.utils import launcher as edl_launcher 21 | 22 | 23 | class TestLauncher(etcd_test_base.EtcdTestBase): 24 | def setUp(self): 25 | super(TestLauncher, self).setUp("test_launcher") 26 | 27 | def test_normal_exit(self): 28 | launcher = edl_launcher(self._job_env, self._pod, self._etcd, None) 29 | launcher.init() 30 | launcher.launch() 31 | 32 | last_status = edl_status.load_job_status_from_etcd(self._etcd) 33 | if last_status == edl_status.Status.SUCCEED: 34 | logger.info( 35 | "job:{} has completed! 
# Launches two edl.collective.launch instances of launch_demo.py against a
# local etcd and succeeds only when both of them exit with status 0.
#
# Required environment:
#   TEST_TARGET_NAME   prefix for the log files written by this script
#   TEST_TIMEOUT       overall time budget in seconds

set -e
unset https_proxy http_proxy

name=${TEST_TARGET_NAME}
TEST_TIMEOUT=${TEST_TIMEOUT}

# Validate before starting etcd so a bad configuration leaves nothing running.
if [[ -z "${TEST_TIMEOUT}" ]]; then
    echo "can't find TEST_TIMEOUT, please set TEST_TIMEOUT first"
    exit 1
fi

# rm flag file
rm -f "${name}"_*.log

nohup etcd > "${name}_etcd.log" 2>&1 &
etcd_pid=$!

cleanup() {
    kill -9 "${etcd_pid}" 2>/dev/null || true
}
# etcd used to be killed only on the failure path; the trap also covers the
# success path and any abort triggered by `set -e`.
trap cleanup EXIT

echo "etcd_pid:${etcd_pid} ${name}_etcd.log"

# start the unit test
run_time=$(( TEST_TIMEOUT - 10 ))
echo "run_time: ${run_time}"

export PADDLE_JOB_ID="test_success_job"
export PADDLE_ETCD_ENDPOINTS="127.0.0.1:2379"
export PADDLE_EDLNODES_RANAGE="2:2"
export PADDLE_EDL_ONLY_FOR_CE_TEST="1"
export PADDLE_EDL_HDFS_PATH="./success_job"
export PADDLE_EDL_HDFS_HOME="./hadoop"

#clean keys
python del_from_etcd.py

# all success----
export CUDA_VISIBLE_DEVICES=0
export PADDLE_DEMO_EXIT_CODE=0
timeout -s SIGKILL "${run_time}" python -m edl.collective.launch --log_dir 00 launch_demo.py > "${name}_run_00.log" 2>&1 &
pid_00=$!

export CUDA_VISIBLE_DEVICES=1
export PADDLE_DEMO_EXIT_CODE=0
timeout -s SIGKILL "${run_time}" python -m edl.collective.launch --log_dir 01 launch_demo.py > "${name}_run_01.log" 2>&1 &
pid_01=$!

job_flag=True
for pid in $pid_00 $pid_01; do
    echo "wait ${pid}"
    if ! wait "${pid}" ; then
        job_flag=False
    fi
done
#----

# Read the status key only after both launchers have finished; it used to be
# read right after they were started.  It is informational, so a failing
# etcdctl must not abort the script under `set -e`.
key="/${PADDLE_JOB_ID}/job_flag/nodes/job_status"
value="$(etcdctl get "${key}" 2>&1 || true)"
echo "job complete flag:${value}"

if [[ $job_flag == "True" ]]; then
    exit 0
fi

echo "cat ${name}_run_00.log"
cat "${name}_run_00.log"

echo "cat ${name}_run_01.log"
cat "${name}_run_01.log"

set +e
echo "${etcd_pid}"
exit 1
import time
import unittest
from edl.tests.unittests import etcd_test_base
from edl.utils import constants
from edl.utils import leader_pod
from edl.utils import pod as edl_pod
from edl.utils import resource_pods
from edl.utils import cluster_generator


class TestLeaderPod(etcd_test_base.EtcdTestBase):
    """Checks that leadership moves to the second pod once the first gives it up."""

    def setUp(self):
        super(TestLeaderPod, self).setUp("test_leader_pod")

    def _add_pod(self):
        """Create a pod from the job env and start its registers.

        Returns:
            A ``(pod, leader_register, resource_register)`` tuple.
        """
        pod = edl_pod.Pod()
        pod.from_env(self._job_env)
        resource_register = resource_pods.Register(
            self._job_env,
            pod_id=pod.pod_id,
            pod_json=pod.to_json(),
            ttl=constants.ETCD_TTL,
        )
        generator = cluster_generator.Generator(self._job_env, pod.pod_id)
        leader_register = leader_pod.Register(
            self._job_env, pod.pod_id, cluster_generator=generator
        )

        return (pod, leader_register, resource_register)

    def test_seize_leader(self):
        # Registers that are still running.  They are stopped in ``finally`` so
        # a failed assertion no longer leaves them registered.
        leader_registers = []
        resource_registers = []
        try:
            pod0, leader_register0, resource_register0 = self._add_pod()
            leader_registers.append(leader_register0)
            resource_registers.append(resource_register0)

            time.sleep(constants.ETCD_TTL)

            pod1, leader_register1, resource_register1 = self._add_pod()
            leader_registers.append(leader_register1)
            resource_registers.append(resource_register1)

            leader_id = leader_pod.get_pod_leader_id(self._etcd, timeout=15)
            self.assertEqual(pod0.pod_id, leader_id)

            # Give up leadership on pod0; drop it from the pending list so it
            # is not stopped a second time below.
            leader_register0.stop()
            leader_registers.remove(leader_register0)
            time.sleep(constants.ETCD_TTL)

            leader_id = leader_pod.get_pod_leader_id(self._etcd, timeout=15)
            self.assertEqual(pod1.pod_id, leader_id)
        finally:
            # Same order as before: remaining leader registers, then resources.
            for register in leader_registers + resource_registers:
                register.stop()


if __name__ == "__main__":
    unittest.main()
# Starts a redis server, the redis server register and the balance server,
# then runs distill_reader_test.py against them.  The script exits with the
# test's own status and always kills the helper processes on the way out.

unset https_proxy http_proxy

cleanup() {
    # Unquoted on purpose: pids that were never assigned expand to nothing.
    # shellcheck disable=SC2086
    kill -9 ${discovery_pid} ${register_pid} ${redis_pid} 2>/dev/null
}
# Runs on every exit path, including interrupts, so no helper is left behind.
trap cleanup EXIT

nohup redis-server --port 3456 2>&1 &
redis_pid=$!

# wait redis start
sleep 10

nohup python -m edl.distill.redis.server_register \
    --db_endpoints 127.0.0.1:3456 \
    --service_name DistillReaderTest \
    --server 127.0.0.1:3456 > test_redist_distill_reader.1.log 2>&1 &
register_pid=$!

nohup python -m edl.distill.redis.balance_server --db_endpoints 127.0.0.1:3456 > test_redist_distill_reader.2.log 2>&1 &
discovery_pid=$!
# wait balance start
sleep 10

export PADDLE_DISTILL_BALANCE_SERVER=127.0.0.1:7001
export PADDLE_DISTILL_SERVICE_NAME=DistillReaderTest
export PADDLE_DISTILL_MAX_TEACHER=4
python distill_reader_test.py
exit_code=$?

# Previously the script returned the status of the final kill, so a failing
# test was reported as a success.
exit "${exit_code}"
import six
import time
import unittest
from edl.tests.unittests import etcd_test_base
from edl.utils import constants
from edl.utils import pod as edl_pod
from edl.utils import resource_pods


class TestRegister(etcd_test_base.EtcdTestBase):
    """Checks that resource pod registrations appear in etcd and expire after stop."""

    def setUp(self):
        super(TestRegister, self).setUp("test_register")

    def test_register_resource_pod(self):
        pod0 = edl_pod.Pod()
        pod0._id = "0"

        pod1 = edl_pod.Pod()
        pod1._id = "1"

        ttl = constants.ETCD_TTL

        # Only registers that were actually created are stopped in ``finally``.
        # The old code named both unconditionally, so a failure while creating
        # one raised NameError there and hid the real error.
        registers = []
        try:
            registers.append(
                resource_pods.Register(
                    self._job_env, pod_id="0", pod_json=pod0.to_json(), ttl=ttl
                )
            )
            registers.append(
                resource_pods.Register(
                    self._job_env, pod_id="1", pod_json=pod1.to_json(), ttl=ttl
                )
            )

            # check if the ttl is valid
            # Sleeping past the ttl means the pods are only still present if
            # the registers kept them alive.
            time.sleep(ttl + 2)

            pods = resource_pods.load_from_etcd(self._etcd, timeout=15)
            self.assertEqual(len(pods), 2)
            for pod_id, pod in six.iteritems(pods):
                if pod_id == "0":
                    self.assertEqual(pod, pod0)
                elif pod_id == "1":
                    self.assertEqual(pod, pod1)
                else:
                    self.fail("not supported pod_id:{}".format(pod_id))
        finally:
            for register in registers:
                register.stop()

        # After the registers stop, the entries should disappear once the ttl passes.
        time.sleep(ttl + 1)
        pods_dict = resource_pods.load_from_etcd(self._etcd, timeout=15)
        self.assertEqual(len(pods_dict), 0)


if __name__ == "__main__":
    unittest.main()
class TestDataReader(unittest.TestCase):
    """Drives a DistributedDataReader over two text files for several epochs."""

    def setUp(self):
        self._file_list = ["./data_server/a.txt", "./data_server/b.txt"]
        # file index -> list of ((path), (rec_no, splitted_fields)) records
        self._data = {}
        for idx, p in enumerate(self._file_list):
            s = TxtFileSplitter(p)
            for r in s:
                # The membership test must look at the dict being filled.
                # It used to test an unrelated, never-populated FileMeta
                # object, so the per-file list was not created reliably.
                if idx not in self._data:
                    self._data[idx] = []
                record = ((p), (r[0], r[1:]))
                self._data[idx].append(record)  # [(path),(rec_no, splitted_fields)]...

    def _train(self, state):
        """Iterate the reader from ``state.epoch`` up to epoch 5, notifying
        EDL after every batch and every epoch."""
        print("learning_rate:", learning_rate)
        reader = DistributedDataReader(
            file_list=self._file_list,
            file_splitter_cls=TxtFileSplitter,
            splitted_data_field=["line"],
            batch_size=1,
            trainer_rank=0,
        )

        for epoch in range(state.epoch, 5):
            for meta, batch in reader:
                edl.notify_end_one_batch(meta, state)
            edl.notify_end_one_epoch(state)

    def test_data_reader(self):
        fleet.init()
        state = edl.PaddleState(
            exe, start_program, main_program, optimizer=None, batch=0, epoch=0
        )
        state.register_adjust_function([adjust])
        self._train(state)
class Client(object):
    """Minimal holder of an insecure gRPC channel to a single endpoint.

    Args:
        endpoint: target passed verbatim to ``grpc.insecure_channel``.
    """

    def __init__(self, endpoint):
        self._endpoint = endpoint
        # Declared up front so shutdown() is well defined before connect().
        self._channel = None

    def connect(self):
        """Open an insecure channel to the endpoint and keep it on the instance."""
        self._channel = grpc.insecure_channel(self._endpoint)

    def shutdown(self):
        """Close the current channel (if any) and forget it.

        Previously the reference was only dropped, leaving the underlying
        connection open until garbage collection.
        """
        channel = getattr(self, "_channel", None)
        self._channel = None
        if channel is None:
            return

        # Old grpcio releases expose no close(); fall back to dropping the
        # reference in that case.
        close = getattr(channel, "close", None)
        if close is not None:
            close()
# Names of the etcd service tables used by EDL.
ETCD_POD_RESOURCE = "resource"
ETCD_POD_RANK = "rank"
ETCD_POD_STATUS = "pod_status"
ETCD_JOB_STATUS = "job_status"
ETCD_TRAIN_STATUS = "train_status"
ETCD_CLUSTER = "cluster"
ETCD_READER = "reader"
ETCD_STATE = "state"
ETCD_POD_LEADER = "0"

ETCD_CONN_TIMEOUT = 6
ETCD_TTL = 15
ETCD_OPERATION_TIMEOUT = 60


def clean_etcd(etcd):
    """Call ``etcd.remove_service`` once for each table name above, in
    declaration order."""
    tables = (
        ETCD_POD_RESOURCE,
        ETCD_POD_RANK,
        ETCD_POD_STATUS,
        ETCD_JOB_STATUS,
        ETCD_TRAIN_STATUS,
        ETCD_CLUSTER,
        ETCD_READER,
        ETCD_STATE,
        ETCD_POD_LEADER,
    )
    for table in tables:
        etcd.remove_service(table)
def _resolve_timeout(f, args, kwargs):
    """Return the ``timeout`` value a call ``f(*args, **kwargs)`` will use.

    Looks at the explicit keyword first, then at a positional argument, then
    at the default declared in ``f``'s signature.

    Raises:
        KeyError: when no timeout can be determined (same exception the
            decorator raised before for a missing keyword).
    """
    if "timeout" in kwargs:
        return kwargs["timeout"]

    code = getattr(f, "__code__", None)
    if code is not None:
        names = code.co_varnames[: code.co_argcount]
        if "timeout" in names:
            pos = names.index("timeout")
            if pos < len(args):
                return args[pos]

            # Defaults are aligned with the tail of the positional names.
            defaults = getattr(f, "__defaults__", None) or ()
            first_default = len(names) - len(defaults)
            if pos >= first_default:
                return defaults[pos - first_default]

    kwdefaults = getattr(f, "__kwdefaults__", None) or {}
    if "timeout" in kwdefaults:
        return kwdefaults["timeout"]

    raise KeyError("timeout")


def handle_errors_until_timeout(f):
    """Decorator: retry ``f`` every 3 seconds while it raises ``EdlException``.

    Retrying stops once ``timeout`` seconds have elapsed since the first
    attempt; the last exception is then re-raised. ``EdlDataEndError`` is never
    retried. The wrapped function must have a ``timeout`` parameter; its value
    may be passed by keyword, positionally, or left to the declared default.
    """

    def handler(*args, **kwargs):
        begin = time.time()
        timeout = _resolve_timeout(f, args, kwargs)
        while True:
            try:
                return f(*args, **kwargs)
            except exceptions.EdlDataEndError:
                # Not a transient failure. Re-raise the original instance so
                # its message and traceback are preserved.
                raise
            except exceptions.EdlException:
                if time.time() - begin >= timeout:
                    logger.warning("{} execute timeout:{}".format(f.__name__, timeout))
                    raise

                time.sleep(3)

    return functools.wraps(f)(handler)
g_etcd = None


def get_global_etcd(etcd_endpoints=None, job_id=None):
    """Return the process-wide EtcdClient, creating it on first use.

    The first call must supply both ``etcd_endpoints`` and ``job_id``; later
    calls return the cached client and ignore their arguments.
    """
    global g_etcd
    if g_etcd is not None:
        return g_etcd

    assert etcd_endpoints is not None and job_id is not None
    # Publish the client before init(), exactly as callers have always seen it.
    g_etcd = EtcdClient(endpoints=etcd_endpoints, root=job_id, timeout=6)
    g_etcd.init()
    return g_etcd
def get_train_status_table_key(self, server_name):
    """Full etcd path of ``server_name`` inside the train-status table."""
    table = constants.ETCD_TRAIN_STATUS
    return self._etcd.get_full_path(table, server_name)


def get_cluster_table_key(self):
    """Full etcd path of the cluster entry (table and key share one name)."""
    name = constants.ETCD_CLUSTER
    return self._etcd.get_full_path(name, name)


def get_rank_table_key(self):
    """Full etcd path of the leader entry inside the pod-rank table."""
    table = constants.ETCD_POD_RANK
    leader = constants.ETCD_POD_LEADER
    return self._etcd.get_full_path(table, leader)
def deserialize(pb_status):
    """Rebuild the EDL exception described by ``pb_status`` and raise it.

    Args:
        pb_status: object with ``type`` (name of an exception class defined in
            this module) and ``detail`` (its message).

    Raises:
        EdlException subclass: the exception named by ``pb_status.type``.
        Exception: when ``pb_status.type`` does not name an ``EdlException``
            subclass of this module, or the exception cannot be constructed.
    """
    thismodule = sys.modules[__name__]
    try:
        exc_type = getattr(thismodule, pb_status.type)
        # The type name is data, not code: only real EDL exception classes may
        # be instantiated. Without this check any module attribute (a helper
        # function, an imported module) would be called and then raised.
        if not (isinstance(exc_type, type) and issubclass(exc_type, EdlException)):
            raise TypeError(
                "{} is not an EdlException subclass".format(pb_status.type)
            )
        error = exc_type(pb_status.detail)
    except Exception as e:
        raise Exception(
            "type:{} detail:{} meets error:{}".format(
                pb_status.type, pb_status.detail, str(e)
            )
        )
    raise error
def read_txt_lines(file_list):
    """
    Read the text file at path ``file_list`` and return its non-blank lines,
    stripped of surrounding whitespace, each paired with a 0-based counter
    that only advances on non-blank lines:

    return [(stripped_line, line_no)...]
    """
    with open(file_list, "r") as f:
        kept = [text for text in (raw.strip() for raw in f) if len(text) > 0]

    return [(text, number) for number, text in enumerate(kept)]
class SerializableBase(object):
    """Interface for objects that can round-trip through a JSON string."""

    def to_json(self):
        raise NotImplementedError

    def from_json(self, json_str=None):
        # ``json_str`` mirrors the signature every concrete subclass uses.
        raise NotImplementedError


class Serializable(SerializableBase):
    """Serializes the instance ``__dict__`` to JSON and restores it in place."""

    def to_json(self, filter_names=None):
        """Return the instance attributes as a JSON object string.

        Args:
            filter_names: optional collection of attribute names to leave out.

        Attribute values that are themselves ``SerializableBase`` are stored
        as the string returned by their own ``to_json()``.
        """
        skipped = () if filter_names is None else filter_names
        d = {}
        for k, v in self.__dict__.items():
            if k in skipped:
                continue

            d[k] = v.to_json() if isinstance(v, SerializableBase) else v

        return json.dumps(d)

    def from_json(self, json_str):
        """Overwrite existing attributes with the values found in ``json_str``.

        Keys in the JSON that are not already attributes are ignored, and
        attributes missing from the JSON keep their current value.

        Returns:
            self, so calls can be chained.
        """
        d = json.loads(json_str)
        # Iterate over a snapshot of the names; values are reassigned below.
        for k in list(self.__dict__):
            if k in d:
                self.__dict__[k] = d[k]

        return self

    def __eq__(self, other):
        # Objects without an instance dict (None, str, int, ...) never match;
        # this used to raise AttributeError for anything but None.
        other_state = getattr(other, "__dict__", None)
        if other_state is None:
            return False

        return self.__dict__ == other_state

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return self.to_json()
def get_logger(log_level, name="root"):
    """Return the logger called ``name`` set to ``log_level`` with a stream handler.

    The handler is attached only once per logger. Calling this function again
    for the same name updates the level but no longer stacks another handler,
    which previously duplicated every log line.

    Args:
        log_level: level passed to ``Logger.setLevel``.
        name: logger name passed to ``logging.getLogger``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(log_level)

    already_attached = any(
        getattr(handler, "_edl_stream_handler", False) for handler in logger.handlers
    )
    if not already_attached:
        log_handler = logging.StreamHandler()
        log_format = logging.Formatter(
            "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
        )
        log_handler.setFormatter(log_format)
        # Marker used above to recognise the handler installed by this function.
        log_handler._edl_stream_handler = True
        logger.addHandler(log_handler)

    return logger
def find_free_ports(num):
    """Return a set of ``num`` distinct TCP ports that were free when probed.

    All probe sockets are kept open until every port has been collected, so
    the kernel cannot hand out the same port twice. The previous version
    closed each socket before probing again and gave up after a fixed number
    of attempts, which made any request for more than about 100 ports fail.

    Args:
        num: number of ports wanted.

    Returns:
        A set of ``num`` port numbers, or None when ``num`` <= 0.

    Raises:
        socket.error: if a socket cannot be created or bound.
    """
    if num <= 0:
        return None

    held = []
    try:
        for _ in range(num):
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            held.append(s)
            # Port 0 asks the OS for any currently unused port.
            s.bind(("", 0))

        return set(s.getsockname()[1] for s in held)
    finally:
        for s in held:
            s.close()
def record_to_string(rec):
    """One-line summary of a record: its number and how many fields it has."""
    return "record_no:{} fields_len:{}".format(rec.record_no, len(rec.field_data))


def batch_data_response_to_string(res):
    """Summarise every batch in ``res.data`` as JSON, joined with ';'.

    Each batch becomes a JSON object with its ``batch_data_id``, the number of
    records, and a comma-separated summary of those records.
    """
    r = []
    for data in res.data:
        s = {}
        s["batch_data_id"] = data.batch_data_id
        s["records_num"] = len(data.records)
        s["records"] = ",".join(record_to_string(rec) for rec in data.records)
        r.append(json.dumps(s))

    # Was ``";".jion(r)``, which raised AttributeError on every call.
    return ";".join(r)


def batch_data_meta_response_to_string(res):
    """Comma-join ``str()`` of every element of ``res.data``."""
    return ",".join(str(data) for data in res.data)
class Client(client_base.Client):
    """gRPC client for the pod server, built on the generic channel holder."""

    def __init__(self, endpoint):
        super(Client, self).__init__(endpoint)

    def connect(self):
        """Open the channel, create the PodServer stub and return both."""
        super(Client, self).connect()
        stub = pod_server_pb2_grpc.PodServerStub(self._channel)
        self._stub = stub
        return self._channel, self._stub

    def shutdown(self):
        super(Client, self).shutdown()
        self._stub = None

    def barrier(self, job_id, pod_id, timeout=15):
        """
        Poll the pod server's Barrier call once per second until it reports
        success, then return the Cluster parsed from the response. Once
        ``timeout`` seconds have elapsed, the error status of the latest
        response is handed to ``exceptions.deserialize``.
        """
        request = pod_server_pb2.BarrierRequest()
        request.job_id = job_id
        request.pod_id = pod_id

        _, stub = self.connect()
        started = time.time()
        cluster = edl_cluster.Cluster()
        while True:
            response = stub.Barrier(request)
            # An empty status type means the barrier succeeded.
            if response.status.type == "":
                cluster.from_json(response.cluster_json)
                logger.debug("pod client get cluster:{}".format(cluster))
                logger.info("barrier ok!")
                return cluster

            elapsed = time.time() - started
            if elapsed > timeout:
                message = "job_id:{} pod_id:{} barrier time out".format(job_id, pod_id)
                logger.info(message)
                exceptions.deserialize(response.status)
            time.sleep(1)
class ProcessWrapper(object):
    """Base class that runs ``_worker_func`` in a separate process.

    Subclasses implement ``_worker_func``. ``self._stop`` is an event set by
    ``stop()``; presumably the worker loop is expected to watch it, which
    should be confirmed against the subclasses.
    """

    def __init__(self):
        self._stop = multiprocessing.Event()
        self._lock = threading.Lock()
        self._worker = multiprocessing.Process(target=self._worker_func)

    def _worker_func(self):
        raise NotImplementedError

    def start(self):
        self._worker.start()

    def stop(self):
        """Signal the worker to stop, wait for it, and drop the reference."""
        self._stop.set()
        with self._lock:
            if self._worker:
                # A process that was never started has no pid and cannot be
                # joined; skip the join instead of tripping its assertion.
                if getattr(self._worker, "pid", None) is not None:
                    self._worker.join()
                self._worker = None

        logger.info("{} exit".format(self.__class__.__name__))

    def is_stopped(self):
        with self._lock:
            return self._worker is None

    def __enter__(self):
        # Does not start the worker; callers still call start() explicitly.
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        # The context-manager protocol passes three arguments; the defaults
        # keep a direct zero-argument call working as before.
        self.stop()
class Register(register.Register):
    """Registers a pod's JSON description under the pod-resource table."""

    def __init__(self, job_env, pod_id, pod_json, ttl=constants.ETCD_TTL):
        # Kept only so stop() can say which pod was unregistered.
        self._pod_id = pod_id

        service = constants.ETCD_POD_RESOURCE
        server = "{}".format(pod_id)
        value = pod_json

        super(Register, self).__init__(
            etcd_endpoints=job_env.etcd_endpoints,
            job_id=job_env.job_id,
            service=service,
            server=server,
            info=value,
            ttl=ttl,
        )

    def stop(self):
        super(Register, self).stop()
        # The placeholder used to be logged literally because the message was
        # never formatted.
        logger.info("pod:{} resource_register stopped".format(self._pod_id))


@error_utils.handle_errors_until_timeout
def load_from_etcd(etcd, timeout=15):
    """Load every registered pod resource from etcd.

    Returns:
        dict mapping pod id to the ``pod.Pod`` rebuilt from its stored JSON.
    """
    servers = etcd.get_service(constants.ETCD_POD_RESOURCE)

    pods = {}
    for s in servers:
        p = pod.Pod()
        p.from_json(string_utils.bytes_to_string(s.info))
        pods[p.get_id()] = p

    return pods


@error_utils.handle_errors_until_timeout
def wait_resource(etcd, pod_id, timeout=15):
    """Succeed once no pod other than ``pod_id`` is registered.

    Returns:
        True when the resource table is empty or holds only ``pod_id``.

    Raises:
        exceptions.EdlWaitFollowersReleaseError: while other pods are still
            registered. The decorator retries on this error until ``timeout``
            and then lets it propagate.
    """
    pods = load_from_etcd(etcd, timeout=timeout)
    if len(pods) == 0:
        return True

    if len(pods) == 1 and pod_id in pods:
        return True

    # list() keeps the message readable on Python 3, where keys() is a view.
    raise exceptions.EdlWaitFollowersReleaseError(
        "can't wait all resource exit:{}".format(list(pods.keys()))
    )
def dataset_to_string(o):
    """
    Render a FileMeta-like object: its index and path, followed by one
    "(record_no:N)" entry for every record number in each inclusive
    [begin, end] range of ``o.records``.
    """
    pieces = [
        "idx_in_list:{}, file_path:{}".format(o.idx_in_list, o.file_path),
        " record:[",
    ]
    for span in o.records:
        pieces.extend(
            "(record_no:{})".format(number) for number in range(span.begin, span.end + 1)
        )
    pieces.append("]")

    return "".join(pieces)


def data_request_to_string(o):
    """
    Render a DataMeta-like object: its index and path, followed by one
    " chunk:[...]" section per element of ``o.chunks``.
    """
    head = "idx_in_list:{} file_path:{}".format(o.idx_in_list, o.file_path)
    sections = [" chunk:[" + chunk_to_string(chunk) + "]" for chunk in o.chunks]

    return head + "".join(sections)


def chunk_to_string(rs):
    """Render a chunk's status and every record number in its inclusive
    [meta.begin, meta.end] range."""
    numbers = range(rs.meta.begin, rs.meta.end + 1)
    body = "".join("(record_no:{}) ".format(number) for number in numbers)

    return "status:{} ".format(rs.status) + body


def bytes_to_string(o, codec="utf-8"):
    """Return None for None, ``o`` unchanged when it is already ``str``, and
    ``o.decode(codec)`` otherwise."""
    if o is None:
        return None

    return o if isinstance(o, str) else o.decode(codec)
class TrainStatus(enum.IntEnum):
    """Training progress of a pod, stored in etcd as its integer value."""

    INITIAL = 0
    RUNNING = 1
    # Was 3, which made NEARTHEEND a mere alias of SUCCEED (enum members that
    # share a value are the same member), so the two could not be told apart.
    NEARTHEEND = 2
    SUCCEED = 3
    FAILED = 4


@error_utils.handle_errors_until_timeout
def save_to_etcd(etcd, pod_id, status, timeout=30):
    """Store ``status`` for ``pod_id`` as ``{"status": <int>}`` in the
    train-status table."""
    service = constants.ETCD_TRAIN_STATUS
    server = pod_id
    info = json.dumps({"status": int(status)})
    etcd.set_server_permanent(service, server, info)


@error_utils.handle_errors_until_timeout
def load_from_etcd(etcd, pod_id, timeout=30):
    """Read back the integer status stored for ``pod_id``.

    Returns:
        The stored ``status`` value, or None when nothing is stored.
    """
    value = etcd.get_value(constants.ETCD_TRAIN_STATUS, pod_id)

    if value is None:
        return None

    # The stored value is text, not a file object: json.loads is required
    # (json.load would fail looking for a .read() method).
    if isinstance(value, bytes):
        value = value.decode("utf-8")
    d = json.loads(value)
    return d["status"]
import uuid
from edl.utils import json_serializable


class Trainer(json_serializable.Serializable):
    """Description of a single trainer: id, ranks, GPUs and endpoint.

    A new instance has every field unset (``None`` / empty list); call
    ``from_pod`` to fill it in.
    """

    def __init__(self):
        self._id = None
        self._rank_in_pod = None
        self._gpus = []
        self._endpoint = None
        self._global_rank = None

    def __str__(self):
        # Reads self._id, the attribute assigned in __init__ and from_pod.
        # The former spelling `self._ids` is never assigned in this class, so
        # str(trainer) raised AttributeError.
        s = "id:{} rank_in_pod:{} gpus:{} endpoint:{} global_rank:{}".format(
            self._id, self._rank_in_pod, self._gpus, self._endpoint, self._global_rank
        )

        return s

    @property
    def global_rank(self):
        """Global rank of this trainer; ``None`` until something assigns it."""
        return self._global_rank

    @property
    def rank_in_pod(self):
        """Rank of this trainer inside its pod."""
        return self._rank_in_pod

    @property
    def gpus(self):
        """GPUs given to this trainer (the object passed to ``from_pod``)."""
        return self._gpus

    @property
    def endpoint(self):
        """Endpoint given to this trainer."""
        return self._endpoint

    def from_pod(self, endpoint, rank_in_pod, gpus):
        """Initialise this trainer from values supplied by its pod.

        Generates a fresh id with ``uuid.uuid1()`` and resets ``global_rank``
        to ``None``.

        Args:
            endpoint: endpoint stored as-is.
            rank_in_pod: rank of the trainer inside the pod.
            gpus: GPUs assigned to the trainer; stored by reference.
        """
        self._id = str(uuid.uuid1())
        self._global_rank = None
        self._rank_in_pod = rank_in_pod
        self._endpoint = endpoint
        self._gpus = gpus
import collections


class UniqueNameGenerator(object):
    """
    Produce names of the form ``<prefix><key>_<n>``.

    ``n`` is counted separately for every key and starts at 0.

    Args:
        prefix(str): Text placed in front of every generated name.
            ``None`` means no prefix.
    """

    def __init__(self, prefix=None):
        # Per-key counters; a key that has not been seen yet starts at 0.
        self.ids = collections.defaultdict(int)
        self.prefix = "" if prefix is None else prefix

    def __call__(self, key):
        """
        Return the next unused name for ``key`` and advance its counter.

        Args:
            key(str): Base of the generated name.

        Returns(str): ``prefix + key + "_" + counter``
        """
        serial = self.ids[key]
        self.ids[key] = serial + 1
        return self.prefix + key + "_" + str(serial)


# Module-wide generator without a prefix, used by generate().
generator = UniqueNameGenerator()


def generate(key):
    """Return the next unique name for ``key`` from the module-wide generator."""
    return generator(key)
# Build the EDL Python package for one Python version and run its tests.
# Usage: build.sh <python-version>      e.g. build.sh 3.7

# Stop the background jobs started by this script (the etcd instance used by
# the tests). Guarded so that an early exit, before anything was started,
# does not run `kill -9` without any PID.
function cleanup(){
    local pids
    pids="$(jobs -p)"
    if [[ -n "${pids}" ]]; then
        # Word splitting is intended here: one PID per word.
        # shellcheck disable=SC2086
        kill -9 ${pids} 2>/dev/null || true
    fi
}
trap cleanup EXIT
unset https_proxy http_proxy

set -e
if [[ $# != 1 ]] ; then
    echo "must set python version" 1>&2
    # A usage error is a failure; exiting 0 made callers and CI see success.
    exit 1
fi

unset GREP_OPTIONS
BASEDIR="$(dirname "$(readlink -f "${0}")")"
cd "${BASEDIR}"/..

# Put a directory in front of PATH in which `python` is a symlink to
# python<version>, so every later `python` call uses the requested version.
function gen_env(){
    py_version=$1
    old_path=$PATH
    # Report a missing interpreter explicitly instead of letting `set -e`
    # terminate the script without any message.
    if ! python_path="$(which python"${py_version}")"; then
        echo "python${py_version} not found in PATH" 1>&2
        exit 1
    fi

    tmp_path=/tmp/edl-build/python${py_version}/bin
    mkdir -p "${tmp_path}"
    rm -f "${tmp_path}/python"

    ln -s "${python_path}" "${tmp_path}/python"
    export PATH="${tmp_path}:${old_path}"
    echo "current path:${PATH}"
}

py_version=$1
gen_env "$py_version"

# check python version
which python
version_str=$(python --version 2>&1)
echo "python version:${version_str}"
if [[ ${version_str} != "Python ${py_version}"* ]]; then
    echo "${version_str} not valid for argument:${py_version}"
    exit 1
fi


pushd python/edl/protos/
bash generate.sh
popd

build_dir=build
rm -rf "${build_dir}"
mkdir -p "${build_dir}/cmd/master/"
# TODO(gongwb): add them on async training
# go
#go build -o build/cmd/master/master cmd/master/master.go

# Background etcd for the tests; it is killed by the EXIT trap above.
nohup etcd > "build_etcd.log" 2>&1 &

#build python
pushd "${build_dir}"
cmake .. -DPY_VERSION="${py_version}"
make clean && make -j
# `-R` requires a regular expression and none was given, so it is dropped:
# all registered tests run, with verbose output.
ctest -V
popd

#test all go test
#go test --cover ./...
79 | -------------------------------------------------------------------------------- /scripts/custom-boilerplate.go.txt: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2016 PaddlePaddle Authors All Rights Reserved. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | -------------------------------------------------------------------------------- /scripts/download_etcd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | set -e 18 | ETCD_VER=v3.4.7 19 | 20 | # choose either URL 21 | DOWNLOAD_URL=https://paddle-edl.bj.bcebos.com/etcd-${ETCD_VER}-linux-amd64.tar.gz 22 | 23 | rm -f /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz 24 | rm -rf /tmp/etcd-download-test && mkdir -p /tmp/etcd-download-test 25 | 26 | wget -q ${DOWNLOAD_URL} -O /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz 27 | tar xzvf /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz -C /tmp/etcd-download-test --strip-components=1 28 | rm -f /tmp/etcd-${ETCD_VER}-linux-amd64.tar.gz 29 | 30 | /tmp/etcd-download-test/etcd --version 31 | /tmp/etcd-download-test/etcdctl version 32 | 33 | mv /tmp/etcd-download-test/etcd /usr/local/bin/ 34 | mv /tmp/etcd-download-test/etcdctl /usr/local/bin/ 35 | 36 | rm -rf /tmp/etcd-download-test 37 | -------------------------------------------------------------------------------- /scripts/run_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | unset GREP_OPTIONS 19 | BASEDIR=$(dirname "$(readlink -f "${0}")") 20 | 21 | 22 | echo "base_dir:${BASEDIR}" 23 | cd "${BASEDIR}" 24 | 25 | # 2.7 is deprecated 26 | # ./build.sh 2.7 27 | 28 | function abort(){ 29 | echo "Your change doesn't follow Edl's code style." 1>&2 30 | echo "Please use pre-commit to check what is wrong." 
1>&2 31 | exit 1 32 | } 33 | 34 | 35 | function check_style() { 36 | trap 'abort' 0 37 | 38 | set +e 39 | upstream_url='https://github.com/elasticdeeplearning/edl' 40 | git remote remove upstream 41 | git remote add upstream $upstream_url 42 | set -e 43 | git fetch upstream develop 44 | 45 | pre-commit install 46 | changed_files="$(git diff --name-only upstream/develop)" 47 | echo "$changed_files" | xargs pre-commit run --files 48 | 49 | trap : 0 50 | } 51 | 52 | pushd "${BASEDIR}/../" 53 | check_style 54 | popd 55 | 56 | 57 | ./build.sh 3.7 58 | --------------------------------------------------------------------------------