├── .dockerignore ├── .github └── workflows │ ├── build.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_cn.md ├── build.sh ├── docker ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.build ├── Dockerfile.nccl └── docker_build.sh ├── documents ├── cn │ ├── benchmark.md │ ├── serving.md │ └── training.md ├── en │ ├── benchmark.md │ ├── pmem.md │ ├── serving.md │ ├── train.md │ └── training.md ├── images │ ├── benchmark-server.png │ ├── benchmark.png │ ├── pmem_vs_dram_oe.png │ ├── serving.drawio.png │ ├── standalone.drawio.png │ └── training.drawio.png └── papers │ └── openembedding_icde2023.pdf ├── examples ├── criteo_deepctr_hook.py ├── criteo_deepctr_network.py ├── criteo_deepctr_network_mirrored.py ├── criteo_deepctr_network_mpi.py ├── criteo_lr_subclass.py ├── criteo_preprocess.py ├── run │ ├── criteo_deepctr_checkpoint.sh │ ├── criteo_deepctr_horovod.sh │ ├── criteo_deepctr_mirrored.sh │ ├── criteo_deepctr_mpi.sh │ ├── criteo_deepctr_restful.sh │ ├── criteo_deepctr_standalone.sh │ └── criteo_preprocess.sh ├── tensorflow_serving_client.py ├── tensorflow_serving_restful.py ├── train100.csv └── wide100.csv ├── laboratory ├── benchmark │ ├── Dockerfile │ ├── analyze.py │ ├── benchmark.Dockerfile │ ├── benchmark.py │ ├── criteo_sample.txt │ ├── deepctr_criteo_model.py │ ├── parse_tensor_board.py │ ├── summary.py │ └── tensornet.Dockerfile ├── inject │ ├── Dockerfile │ ├── inject.sh │ ├── network_model.py │ ├── openembedding_inject_tensorflow.py │ ├── python │ └── sitecustomize.py ├── onnx │ └── criteo_deepctr_torch.py ├── publish-serving.sh └── strangedemo │ ├── Dockerfile.criteo │ ├── Dockerfile.push │ ├── criteo_deepctr │ ├── criteo_deepctr.ipynb │ └── criteo_deepctr.py │ ├── criteo_deepctr_np │ ├── criteo_deepctr.py │ ├── criteo_deepctr_np.ipynb │ └── horovod_criteo_deepctr.py │ ├── criteo_lr │ ├── criteo_lr.ipynb │ └── criteo_lr.py │ ├── criteo_predict.py │ └── hook │ ├── Dockerfile │ ├── install.sh │ ├── mlcompile │ ├── mlrun │ └── openembedding_hook_tensorflow.py ├── openembedding ├── CMakeLists.txt ├── __init__.py ├── client │ ├── Communication.cpp │ ├── Communication.h │ ├── Connection.cpp │ ├── Connection.h │ ├── EmbeddingVariableHandle.cpp │ ├── EmbeddingVariableHandle.h │ ├── EnvConfig.cpp │ ├── EnvConfig.h │ ├── Model.cpp │ ├── Model.h │ ├── ModelController.cpp │ ├── ModelController.h │ ├── ObjectPool.h │ ├── WorkerContext.cpp │ └── WorkerContext.h ├── entry │ ├── c_api.cc │ ├── c_api.h │ ├── c_api_ha_test.cpp │ ├── c_api_test.cpp │ ├── c_api_test.h │ ├── controller.cc │ ├── controller.proto │ ├── masterd.cc │ ├── pmem_c_api_test.cpp │ ├── py_api.cc │ └── server.cc ├── server │ ├── EmbeddingDumpOperator.cpp │ ├── EmbeddingDumpOperator.h │ ├── EmbeddingInitOperator.cpp │ ├── EmbeddingInitOperator.h │ ├── EmbeddingLoadOperator.cpp │ ├── EmbeddingLoadOperator.h │ ├── EmbeddingPullOperator.cpp │ ├── EmbeddingPullOperator.h │ ├── EmbeddingPushOperator.cpp │ ├── EmbeddingPushOperator.h │ ├── EmbeddingRestoreOperator.cpp │ ├── EmbeddingRestoreOperator.h │ ├── EmbeddingShardFile.h │ ├── EmbeddingStorage.h │ ├── EmbeddingStoreOperator.cpp │ ├── EmbeddingStoreOperator.h │ └── RpcView.h ├── tensorflow │ ├── CMakeLists.txt │ ├── Prefetch.h │ ├── ThreadPool.h │ ├── __init__.py │ ├── exb.py │ └── exb_ops.cpp └── variable │ ├── DataType.h │ ├── EmbeddingInitializer.h │ ├── EmbeddingItemPool.h │ ├── EmbeddingOptimizer.h │ ├── EmbeddingOptimizerVariable.h │ ├── EmbeddingTable.h │ ├── EmbeddingVariable.cpp │ ├── 
EmbeddingVariable.h │ ├── Factory.h │ ├── Meta.h │ ├── MpscGradientReducer.h │ ├── PersistManager.h │ ├── PmemEmbeddingItemPool.h │ ├── PmemEmbeddingOptimizerVariable.h │ ├── PmemEmbeddingTable.h │ ├── VariableAsyncTask.h │ └── pmem_embedding_table_test.cpp ├── setup.py └── test ├── benchmark ├── criteo_deepctr.py ├── criteo_deepctr_torch.py ├── criteo_tfrecord.py └── server.py ├── criteo_preprocess.cpp └── optimizer_test.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | tmp 3 | build 4 | tools 5 | pico-ps/build 6 | pico-ps/pico-core/build 7 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | services: 13 | registry: 14 | image: registry:2 15 | ports: 16 | - 5000:5000 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v2 20 | with: 21 | submodules: 'recursive' 22 | token: ${{ secrets.CHECKOUT_TOKEN }} 23 | - name: Set up Docker Buildx 24 | id: buildx 25 | uses: docker/setup-buildx-action@v1 26 | - name: docker build 27 | run: | 28 | docker/docker_build.sh 29 | - name: docker image 30 | run: | 31 | docker/docker_build.sh image 32 | - name: docker test 33 | run: | 34 | docker/docker_build.sh test 35 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.[0-9]+.[0-9]+ 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | with: 15 | submodules: 'recursive' 16 | token: ${{ secrets.CHECKOUT_TOKEN }} 17 | - name: Get branch name 18 | uses: nelonoel/branch-name@v1.0.1 19 | - name: Set up Docker Buildx 20 | id: buildx 21 | uses: docker/setup-buildx-action@v1 22 | - name: Login to DockerHub 23 | uses: docker/login-action@v1 24 | with: 25 | username: ${{ secrets.DOCKERHUB_USERNAME }} 26 | password: ${{ secrets.DOCKERHUB_TOKEN }} 27 | - name: docker build 28 | run: | 29 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh 30 | - name: docker image 31 | run: | 32 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh image 33 | - name: docker test 34 | run: | 35 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh test 36 | - name: docker push 37 | run: | 38 | docker push 4pdosc/openembedding:${BRANCH_NAME:1} 39 | docker tag 4pdosc/openembedding:${BRANCH_NAME:1} 4pdosc/openembedding:latest 40 | docker push 4pdosc/openembedding:latest 41 | - name: pypi upload 42 | env: 43 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 44 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 45 | run: | 46 | pip3 install twine 47 | twine upload output/dist/openembedding-${BRANCH_NAME:1}.tar.gz 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## General 2 | 3 | # Compiled Object files 4 | *.slo 5 | *.lo 6 | *.o 7 | *.cuo 8 | 9 | # Compiled Dynamic libraries 10 | *.so 11 | *.dylib 12 | *.pyd 13 | 14 | # Compiled Static libraries 15 | *.lai 16 | *.la 17 | *.a 18 | 19 | # Compiled protocol buffers 20 | *.pb.h 21 | *.pb.cc 22 | *_pb2.py 23 | *_pb.py 24 | *_pb2.pyi 25 | 
*_pb.pyi
26 | 
27 | # Compiled python
28 | *.pyc
29 | 
30 | # Compiled MATLAB
31 | *.mex*
32 | 
33 | # IPython notebook checkpoints
34 | .ipynb_checkpoints
35 | 
36 | # Editor temporaries
37 | *.swn
38 | *.swo
39 | *.swp
40 | *~
41 | 
42 | # Sublime Text settings
43 | *.sublime-workspace
44 | *.sublime-project
45 | 
46 | # Eclipse Project settings
47 | *.*project
48 | .settings
49 | 
50 | # QtCreator files
51 | *.user
52 | 
53 | # PyCharm files
54 | .idea
55 | 
56 | # Visual Studio Code files
57 | .vscode
58 | 
59 | # OSX dir files
60 | .DS_Store
61 | 
62 | 
63 | 
64 | CMakeCache.txt
65 | CMakeFiles
66 | build.config
67 | build
68 | build_*
69 | build-debug
70 | build-release
71 | core.*
72 | lib
73 | output
74 | tmp
75 | tools
76 | 
77 | *.log
78 | *.tar.gz
79 | 
80 | .build_debug/*
81 | .build_release/*
82 | .setuptools-cmake-build/*
83 | .unittest_tmp/
84 | .ycm_extra_conf.py
85 | 
86 | virtualenv
87 | venv
88 | .envrc
89 | .psenvrc
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "parameter-server"]
2 | 	path = pico-ps
3 | 	url = https://github.com/4paradigm/parameter-server.git
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10)
2 | 
3 | if (NOT OPENEMBEDDING_VERSION)
4 |     set(OPENEMBEDDING_VERSION 0.0.0)
5 | endif()
6 | 
7 | project(openembedding VERSION ${OPENEMBEDDING_VERSION})
8 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${PROJECT_SOURCE_DIR}/pico-ps/cmake ${PROJECT_SOURCE_DIR}/pico-ps/pico-core/cmake)
9 | 
10 | if (NOT PYTHON)
11 |     set(PYTHON "python3")
12 | endif()
13 | 
14 | if(THIRD_PARTY)
15 |     set(CMAKE_PREFIX_PATH "${THIRD_PARTY}")
16 |     message(STATUS "THIRD_PARTY=${THIRD_PARTY}")
17 |     include_directories(SYSTEM ${THIRD_PARTY}/include)
18 |     link_directories(${THIRD_PARTY}/lib ${THIRD_PARTY}/lib64)
19 |     set(OPENSSL_ROOT_DIR ${THIRD_PARTY}/lib64)
20 | endif()
21 | execute_process(COMMAND ${PYTHON} -c "import sysconfig; print(sysconfig.get_paths()['include'], end='')" OUTPUT_VARIABLE PYTHON_INCLUDE)
22 | include_directories(SYSTEM ${PYTHON_INCLUDE})
23 | message(STATUS "PYTHON_INCLUDE=${PYTHON_INCLUDE}")
24 | 
25 | # check gcc version
26 | if(CMAKE_COMPILER_IS_GNUCXX)
27 |     execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
28 |     message(STATUS "gcc ${GCC_VERSION}")
29 |     if(GCC_VERSION VERSION_GREATER 7 OR GCC_VERSION VERSION_EQUAL 7)
30 |         message(STATUS "C++14 activated.")
31 |     else()
32 |         message(FATAL_ERROR "gcc version should be compatible with tensorflow")
33 |     endif()
34 | else()
35 |     message(FATAL_ERROR "only gcc supported")
36 | endif()
37 | 
38 | add_definitions(--std=c++14 -Wall -Wextra -Wno-deprecated-declarations -Werror -frecord-gcc-switches -fPIC)
39 | include_directories(${PROJECT_SOURCE_DIR})
40 | 
41 | option(USE_RDMA "whether build with rdma support" OFF)
42 | if (USE_RDMA)
43 |     add_definitions(-DUSE_RDMA)
44 |     set(RDMA_LIBRARIES rdmacm ibverbs)
45 |     message(STATUS "RDMA enabled")
46 | else()
47 |     message(STATUS "RDMA disabled")
48 |     set(RDMA_LIBRARIES )
49 | endif()
50 | 
51 | option(USE_DCPMM "whether build with dcpmm support" OFF)
52 | if (USE_DCPMM)
53 |     add_definitions(-DUSE_DCPMM)
54 |     find_package(PMEM REQUIRED)
55 |     message(STATUS "DCPMM enabled")
56 | else()
57 |     message(STATUS "DCPMM disabled")
58 | endif()
59 | 
60 | 
61 | if (DEBUG)
62 |     add_definitions(-O0 -g)
63 | else()
64 |     #add_definitions(-O0 -g)
65 |     #add_definitions(-O3 -DNDEBUG)
66 |     add_definitions(-O3 -g -DNDEBUG -DEIGEN_NO_DEBUG) #perf
67 | endif()
68 | 
69 | set(CMAKE_SHARED_LINKER_FLAGS "-pthread -Wl,--whole-archive -lrt -Wl,--no-whole-archive")
70 | set(CMAKE_EXE_LINKER_FLAGS "-pthread -Wl,--whole-archive -lrt -Wl,--no-whole-archive")
71 | 
72 | add_definitions(-DOPENEMBEDDING_VERSION="${PROJECT_VERSION}")
73 | 
74 | find_package(Jemalloc REQUIRED)
75 | find_package(PicoCoreDep REQUIRED)
76 | 
77 | enable_testing()
78 | add_subdirectory(openembedding)
79 | 
80 | file(GLOB_RECURSE WHL_SRC LICENSE README.md setup.py MANIFEST.in openembedding/tensorflow/*.py openembedding/*.py)
81 | set(HYPEREMBEDDING_OUT ${CMAKE_CURRENT_BINARY_DIR}/openembedding-${PROJECT_VERSION}.tar.gz)
82 | add_custom_command(
83 |     OUTPUT ${HYPEREMBEDDING_OUT}
84 |     DEPENDS ${WHL_SRC} cexb_pack
85 |     COMMAND rm -rf pypi
86 |     COMMAND mkdir -p pypi
87 |     COMMAND echo __version__ = \\\'${PROJECT_VERSION}\\\' > pypi/openembedding_setup.py
88 |     COMMAND cd ${PROJECT_SOURCE_DIR} && cp -r LICENSE README.md setup.py MANIFEST.in openembedding ${CMAKE_CURRENT_BINARY_DIR}/pypi
89 |     COMMAND cp openembedding/libcexb_pack.so pypi/openembedding
90 |     COMMAND cd pypi && ${PYTHON} setup.py sdist
91 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
92 | )
93 | add_custom_target(pip_package ALL DEPENDS ${HYPEREMBEDDING_OUT})
94 | 
95 | add_executable(criteo_preprocess test/criteo_preprocess.cpp)
96 | target_link_libraries(criteo_preprocess pico_core ${PicoCoreDep_LIBRARIES} ${Jemalloc_pic_LIBRARIES})
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE openembedding_setup.py
2 | recursive-include openembedding *.h *.cpp *.cc
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:2.5.1-gpu
2 | # remove tensorflow docker logo to avoid confusion
3 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake mpich vim wget curl
4 | RUN HOROVOD_WITHOUT_MPI=1 pip3 install mpi4py horovod
5 | RUN pip3 install pandas scikit-learn deepctr
6 | ADD . /openembedding
7 | RUN pip3 install /openembedding/output/dist/openembedding-*.tar.gz
8 | WORKDIR /openembedding
--------------------------------------------------------------------------------
/docker/Dockerfile.base:
--------------------------------------------------------------------------------
1 | # to use glibc 2.12 and dt7 which have the same system compatibility as tensorflow
2 | FROM tensorflow/tensorflow:2.3.0-custom-op-ubuntu16
3 | 
4 | RUN cd /dt7/usr/bin && ln -s gcc cc && cd /
5 | 
6 | # use glibc 2.12
7 | ADD pico-ps/pico-core/third_party /third_party
8 | 
9 | RUN third_party/prepare.sh build cmake
10 | RUN PATH=/dt7/usr/bin:$PATH prefix=/tools third_party/prepare.sh build \
11 |     gflags glog googletest sparsehash zlib snappy lz4 boost yaml jemalloc prometheus-cpp \
12 |     avro-cpp zookeeper protobuf leveldb openssl brpc && cd ..
--------------------------------------------------------------------------------
/docker/Dockerfile.build:
--------------------------------------------------------------------------------
1 | FROM 4pdosc/openembedding-base:0.1.0
2 | 
3 | # only c api is tested here
4 | ADD . /openembedding
5 | ARG VERSION=0.0.0
6 | RUN pip3 install pybind11
7 | RUN PATH=/dt7/usr/bin:$PATH prefix=/tools /openembedding/pico-ps/pico-core/third_party/prepare.sh build eigen
8 | RUN cd /openembedding && \
9 |     PATH=/dt7/usr/bin:$PATH SKIP_CHECK_WHEEL_SETUP=1 VERSION=${VERSION} THIRD_PARTY=/tools ./build.sh
10 | RUN cd /openembedding/build && make test
--------------------------------------------------------------------------------
/docker/Dockerfile.nccl:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:latest-gpu
2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake \
3 |     openssh-client openmpi-bin libopenmpi-dev vim wget curl \
4 |     build-essential devscripts debhelper fakeroot
5 | RUN NCCL=2.9.9-1 && mkdir nccl && cd nccl && \
6 |     wget https://github.com/NVIDIA/nccl/archive/v${NCCL}.tar.gz && tar -xzf v${NCCL}.tar.gz && \
7 |     cd nccl-${NCCL} && make src.build && make pkg.debian.build && \
8 |     apt-get -y install ./build/pkg/deb/libnccl2_*_amd64.deb ./build/pkg/deb/libnccl-dev_*_amd64.deb
9 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip3 install horovod
10 | RUN pip3 install pandas scikit-learn deepctr
11 | ADD . /openembedding
12 | RUN pip3 install /openembedding/output/dist/openembedding-*.tar.gz
13 | WORKDIR /openembedding
--------------------------------------------------------------------------------
/docker/docker_build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | if [ "${VERSION}" == "" ]; then
4 |     VERSION=0.0.0
5 | fi
6 | 
7 | function build() {
8 |     IMAGE=4pdosc/openembedding:${VERSION}-build
9 |     docker build -t ${IMAGE} -f docker/Dockerfile.build --build-arg VERSION=${VERSION} .
10 |     docker run --name dockerbuild -itd ${IMAGE} /bin/bash
11 |     rm -rf output
12 |     mkdir -p output/dist
13 |     docker cp dockerbuild:/openembedding/build/pypi/dist/openembedding-${VERSION}.tar.gz output/dist
14 |     docker stop dockerbuild
15 |     docker rm dockerbuild
16 |     docker rmi ${IMAGE}
17 | }
18 | 
19 | function image() {
20 |     IMAGE=4pdosc/openembedding:${VERSION}
21 |     docker build -t ${IMAGE} -f docker/Dockerfile .
22 | }
23 | 
24 | function image_test() {
25 |     mkdir -p tmp
26 |     IMAGE=4pdosc/openembedding:${VERSION}
27 |     echo '{' > tmp/daemon.json
28 |     echo '    "storage-driver": "vfs"' >> tmp/daemon.json
29 |     echo '}' >> tmp/daemon.json
30 | 
31 |     echo 'set -e' > tmp/test.sh
32 |     echo 'curl -fsSL https://get.docker.com | sh' >> tmp/test.sh
33 |     echo 'mkdir -p /etc/docker' >> tmp/test.sh
34 |     echo 'cp tmp/daemon.json /etc/docker' >> tmp/test.sh
35 |     echo 'service docker start' >> tmp/test.sh
36 |     echo './build.sh test' >> tmp/test.sh
37 | 
38 |     docker run --privileged --name image_test -v `pwd`/tmp:/openembedding/tmp ${IMAGE} bash tmp/test.sh
39 |     docker rm image_test
40 | }
41 | 
42 | case "$1" in
43 |     build|"")
44 |         build
45 |         ;;
46 |     image)
47 |         image
48 |         ;;
49 |     test)
50 |         image_test
51 |         ;;
52 |     *)
53 |         echo "unknown cmd"
54 |         exit 1
55 |         ;;
56 | esac
--------------------------------------------------------------------------------
/documents/cn/benchmark.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 | 
3 | ## Multi GPUs
4 | 
5 | In a single machine with multiple GPUs, compare the acceleration effects on TensorFlow of Horovod alone and of OpenEmbedding & Horovod.
6 | 
7 | | Option | Setting |
8 | | - | - |
9 | | CPU | 2 * CPU Xeon(R) Gold 5218 CPU @ 2.30GHz |
10 | | GPU | 8 * Tesla T4 |
11 | | Data | Criteo |
12 | | Data Format | TFRecord |
13 | | Model | WDL, DeepFM, XDeepFM |
14 | | Embedding Dimension | 9, 64 |
15 | | Optimizer | Adagrad |
16 | | Batch Size per GPU | 4096 |
17 | 
18 | ![benchmark](../images/benchmark.png)
19 | 
20 | As the number of GPUs increases, it is difficult to get further speedup with Horovod alone; for WDL 64 and DeepFM 64, where the sparse part accounts for a larger proportion, performance even decreases. For XDeepFM 9, since the model is computation-heavy and the sparse part is relatively small, Horovod still scales well, but as the number of GPUs grows the gap with OpenEmbedding & Horovod becomes larger and larger. XDeepFM 64 was not tested here because its computation cost is extremely high and it takes too long.
21 | 
22 | ## Remote Parameter Server
23 | 
24 | > In the previous section, OpenEmbedding & Horovod actually used the Cache Local setting of this section.
25 | 
26 | | Case | Setting |
27 | | - | - |
28 | | Local | Server runs locally |
29 | | Cache Local | Server runs locally; high-frequency Embedding parameters are treated as dense parameters and synchronized by all-reduce |
30 | | Remote 100G | Server runs remotely, connected to the workers through a 100G bit/s network |
31 | | Cache Remote 100G | Server runs remotely, connected to the workers through a 100G bit/s network; otherwise the same as Cache Local |
32 | 
33 | ![avatar](../images/benchmark-server.png)
34 | 
35 | In a 100G network, the communication between servers and workers does not affect performance significantly. In addition, the Cache cases usually get about 10% speedup.
36 | 
37 | ## Big Data
38 | 
39 | OpenEmbedding is able to handle very large-scale data. The sparse features in large-scale data are sometimes difficult to de-duplicate and re-number; in OpenEmbedding they can be hashed into the non-negative integer range of int64, and the servers will store the parameters in a hash table.
40 | 
41 | The performance test results on the 1TB Criteo data set are as follows.
42 | 
43 | | | |
44 | | - | - |
45 | | Model | DeepFM 9 |
46 | | Optimizer | Adagrad |
47 | | Case | Remote |
48 | | Data | Criteo1T |
49 | | Data Format | TSV |
50 | | Instances per Epoch | 3.3 G |
51 | | Training Speed | 692 kips |
52 | | Time per Epoch | 4763 s |
53 | | Checkpoint Time | 869 s |
54 | | Server Peak Memory | 1 * 175 GB |
55 | | Worker Peak Memory | 8 * 1.6 GB |
56 | | Checkpoint Size | 78 GB |
57 | | SavedModel Size | 45 GB |
58 | 
59 | # Run Steps
60 | 
61 | ## Multi GPUs
62 | 
63 | 1. Copy out test/benchmark and example/criteo_preprocess.py.
64 | 2. Download and decompress the Criteo data set to get `train.txt`, about 11 GB.
65 | 3. Preprocess: `python3 criteo_preprocess.py train.txt train.csv`.
66 | 4. Convert to the TFRecord format: `mkdir -p tfrecord && python3 criteo_tfrecord.py train.csv 0 1`. If the conversion is slow, this step can be run in parallel; see `criteo_tfrecord.py` and the sketch below.
67 | 5. Run the Horovod case on 2 GPUs: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord`.
68 | 6. Run the OpenEmbedding & Horovod case on 2 GPUs: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord --server`.
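
Step 4 can be parallelized when the conversion is slow. The sketch below is one way to do it; it assumes (an assumption, not confirmed here) that the two trailing arguments of `criteo_tfrecord.py` are the shard index and the shard count, as the single-shard call `criteo_tfrecord.py train.csv 0 1` suggests. Check `criteo_tfrecord.py` for the actual convention.

```python
# Hypothetical parallel driver for step 4: convert the CSV into TFRecord
# shards with one process per shard. Adjust SHARDS to the number of cores.
import subprocess

SHARDS = 8
procs = [subprocess.Popen(['python3', 'criteo_tfrecord.py', 'train.csv',
                           str(i), str(SHARDS)])
         for i in range(SHARDS)]
for p in procs:
    assert p.wait() == 0  # fail loudly if any shard conversion failed
```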
69 | 
70 | ## Remote Parameter Server
71 | 
72 | The IPs of the two machines are ip1 and ip2 respectively.
73 | 1. Start the server: `python3 server.py ip2:34567`.
74 | 2. Start the workers: `python3 criteo_deepctr.py --data tfrecord --server --cache --master_endpoint ip2:34567 --bind_ip ip1`.
75 | 
76 | ## Big Data
77 | 
78 | 1. Download and decompress the Criteo 1TB data into the criteo1T folder; the file paths should match criteo1T/day_*.
79 | 2. On another machine ip1, run `python3 server.py ip1:34567`.
80 | 3. Run `horovodrun -np 8 python3 criteo_deepctr.py --data criteo1T --server --master_endpoint ip1:34567`.
81 | 
82 | You can use `--checkpoint`, `--save` and other options to specify where to save the model. Note that all paths, including `--data`, should be shared paths; a distributed file system can be mounted to share paths between machines.
--------------------------------------------------------------------------------
/documents/cn/serving.md:
--------------------------------------------------------------------------------
1 | # Serving
2 | 
3 | ## Stand-alone Model
4 | 
5 | ![standalone](../images/standalone.drawio.png)
6 | 
7 | You can save a distributed model as a stand-alone SavedModel with `save_as_original_model`. The SavedModel contains the forward computation graph and all parameters, including the Embedding parameters, and can be loaded directly by TensorFlow Serving. This SavedModel cannot be used for training because it does not store the `Optimizer` states.
8 | 
9 | ## Distributed Model
10 | 
11 | ![serving](../images/serving.drawio.png)
12 | 
13 | A distributed model can only be loaded by a TensorFlow Serving build that includes the OpenEmbedding Operator. The startup process is as follows:
14 | 1. Start the parameter server cluster, including the ZooKeeper Master, the Servers, and the Controller.
15 | 2. Load the EmbeddingModel onto the parameter servers through the Controller.
16 | 3. Start TensorFlow Serving, load the SavedModel, and connect to the ZooKeeper Master of the parameter servers.
17 | 
18 | A UUID is stored in the SavedModel to maintain the correspondence between the SavedModel and the EmbeddingModel. If the corresponding EmbeddingModel is not found on the parameter servers, TensorFlow Serving returns "not found model" without raising other exceptions.
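
As a sanity check for step 3, you can ask TensorFlow Serving whether the SavedModel itself is loaded before sending traffic. The sketch below uses the standard TensorFlow Serving REST model-status endpoint; the host, port, and model name (`criteo`, as in the examples) are illustrative.

```python
# Query the model status endpoint of a running TensorFlow Serving instance.
import json
import urllib.request

with urllib.request.urlopen('http://127.0.0.1:8501/v1/models/criteo') as r:
    print(json.dumps(json.load(r), indent=2))
# A version state of "AVAILABLE" only means the SavedModel is loaded; whether
# the matching EmbeddingModel is present on the parameter servers is checked
# at prediction time, where a mismatch returns "not found model".
```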
--------------------------------------------------------------------------------
/documents/cn/training.md:
--------------------------------------------------------------------------------
1 | # Training
2 | 
3 | ## Data Parallel and Model Parallel
4 | 
5 | ![training](../images/training.drawio.png)
6 | 
7 | The parallel scheme of training is shown in the figure above. The dense part and the high-frequency `Embedding` parameters are replicated on every worker and synchronized by all-reduce, which implements data parallelism. The low-frequency `Embedding` parameters are divided into shards and stored on the servers, which implements model parallelism. Following the synchronous parameter server architecture, workers pull parameters from the servers and push gradients to them; after a server has collected the gradients of all workers for a mini-batch, it updates the parameters with the `Optimizer`.
--------------------------------------------------------------------------------
/documents/en/benchmark.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 | 
3 | ## Multi GPUs
4 | 
5 | Compare the acceleration effects on TensorFlow of Horovod alone and of OpenEmbedding & Horovod, in a single machine with multiple GPUs.
6 | 
7 | | Option | Setting |
8 | | - | - |
9 | | CPU | 2 * CPU Xeon(R) Gold 5218 CPU @ 2.30GHz |
10 | | GPU | 8 * Tesla T4 |
11 | | Data | Criteo |
12 | | Data Format | TFRecord |
13 | | Model | WDL, DeepFM, XDeepFM |
14 | | Embedding Dimension | 9, 64 |
15 | | Optimizer | Adagrad |
16 | | Batch Size per GPU | 4096 |
17 | 
18 | ![benchmark](../images/benchmark.png)
19 | 
20 | With the increase in the number of GPUs, it is difficult to speed up using the all-reduce-based framework Horovod alone; for WDL 64 and DeepFM 64, where the sparse part accounts for a larger proportion, the performance even decreases. For XDeepFM 9, Horovod can still get good acceleration due to the large amount of model computation and the relatively small proportion of the sparse part. However, when the number of GPUs increases, the gap with OpenEmbedding & Horovod becomes larger and larger. Since XDeepFM 64 has a huge amount of computation and takes too long, it was not tested here.
21 | 
22 | ## Remote Parameter Server
23 | 
24 | > In the previous section, OpenEmbedding & Horovod actually used the Cache Local setting of this section.
25 | 
26 | | Case | Setting |
27 | | - | - |
28 | | Local | Local server |
29 | | Cache Local | Local server; high-frequency `Embedding` parameters are updated as dense parameters and synchronized by the all-reduce operator |
30 | | Remote 100G | Remote server, connected to the workers through a 100G bit/s network |
31 | | Cache Remote 100G | Remote server, connected to the workers through a 100G bit/s network; `Embedding` handled the same as Cache Local |
32 | 
33 | ![avatar](../images/benchmark-server.png)
34 | 
35 | As shown in the figure, in a 100G network the communication between server and worker does not affect the performance significantly. In addition, the `Cache` test cases can usually get about 10% speedup.
36 | 
37 | ## Big Data
38 | 
39 | OpenEmbedding has the ability to handle large-scale data. The sparse features in large-scale data are sometimes difficult to de-duplicate and re-number. In OpenEmbedding they can be hashed to the non-negative integer range of int64, and the parameter servers will use a hash table to store the parameters.
40 | 
41 | The performance test results of the 1TB Criteo data set are as follows.
42 | 
43 | | | |
44 | | - | - |
45 | | Model | DeepFM 9 |
46 | | Optimizer | Adagrad |
47 | | Setting | Remote |
48 | | Data | Criteo1T |
49 | | Data Format | TSV |
50 | | Instances per Epoch | 3.3 G |
51 | | Training Speed | 692 kips |
52 | | Time per Epoch | 4763 s |
53 | | Checkpoint Time | 869 s |
54 | | Server Memory | 1 * 175 GB |
55 | | Worker Memory | 8 * 1.6 GB |
56 | | Checkpoint Size | 78 GB |
57 | | SavedModel Size | 45 GB |
58 | 
59 | # Run Steps
60 | 
61 | ## Multi GPUs
62 | 
63 | 1. Copy out test/benchmark and example/criteo_preprocess.py.
64 | 2. Download and decompress the Criteo data to get `train.txt`, about 11 GB.
65 | 3. Preprocess: `python3 criteo_preprocess.py train.txt train.csv`.
66 | 4. Transform the data to the TFRecord format: `mkdir -p tfrecord && python3 criteo_tfrecord.py train.csv 0 1`.
67 | 5. Run the benchmark case for Horovod: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord`.
68 | 6. Run the benchmark case for OpenEmbedding & Horovod: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord --server`.
69 | 
70 | ## Remote Parameter Server
71 | 
72 | For two machines whose IPs are ip1 and ip2 respectively:
73 | 1. Run the server: `python3 server.py ip2:34567`.
74 | 2. Run the workers: `python3 criteo_deepctr.py --data tfrecord --server --cache --master_endpoint ip2:34567 --bind_ip ip1`.
75 | 
76 | ## Big Data
77 | 
78 | 1. Download and decompress the Criteo 1TB data to the `criteo1T` folder; the file paths should match criteo1T/day_*.
79 | 2. Run the server: `python3 server.py ip1:34567`.
80 | 3. Run the workers: `horovodrun -np 8 python3 criteo_deepctr.py --data criteo1T --server --master_endpoint ip1:34567`.
81 | 
82 | You can use `--checkpoint`, `--save` and other parameters to specify the model save path. Note that all paths, including `--data`, should be shared; a distributed file system can be mounted between the machines to share the paths.
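
The hash encoding mentioned in the Big Data section is what examples/criteo_deepctr_hook.py does to its categorical columns; a minimal sketch of the same idea:

```python
# Map raw categorical values into the non-negative int64 range so that the
# server-side hash table can be used instead of a densely re-numbered
# vocabulary (input_dim=-1). Mirrors examples/criteo_deepctr_hook.py.
import pandas

data = pandas.read_csv('train.csv')
for name in data.columns:
    if name[0] == 'C':  # categorical columns C1 .. C26
        # A per-column offset keeps equal raw values in different columns
        # from colliding; the modulus keeps keys inside [0, 2**63).
        data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63)
```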
--------------------------------------------------------------------------------
/documents/en/serving.md:
--------------------------------------------------------------------------------
1 | # Serving
2 | 
3 | ## Stand-alone Model
4 | 
5 | ![standalone](../images/standalone.drawio.png)
6 | 
7 | You can save the distributed model as a stand-alone SavedModel by `save_as_original_model`. The SavedModel contains the forward computation graph and all parameters including `Embedding`, and can be loaded directly by TensorFlow Serving. This SavedModel cannot be used for training because it does not store the `Optimizer` states.
8 | 
9 | ## Distributed Model
10 | 
11 | ![serving](../images/serving.drawio.png)
12 | 
13 | The distributed model needs to be loaded with a TensorFlow Serving build that includes the OpenEmbedding Operator. The startup process is as follows:
14 | 1. Start the parameter server cluster, including the ZooKeeper Master, the Servers, and the Controller.
15 | 2. Load the EmbeddingModel onto the parameter server cluster through the Controller.
16 | 3. Start TensorFlow Serving, load the SavedModel, and connect to the ZooKeeper Master of the parameter servers.
17 | 
18 | A UUID is stored in the SavedModel to maintain the correspondence with the EmbeddingModel. If the corresponding EmbeddingModel is not found on the parameter servers, TensorFlow Serving will return "not found model" without causing other exceptions.
--------------------------------------------------------------------------------
/documents/en/train.md:
--------------------------------------------------------------------------------
1 | # Start
2 | 
3 | ## Parameter Server in Process
4 | ```python
5 | import openembedding.tensorflow as embed
6 | ```
7 | 
8 | ## Remote Parameter Server
9 | 
10 | ### Master
11 | ```python
12 | import time
13 | import openembedding as embed
14 | master = embed.Master()
15 | time.sleep(10) # Wait
16 | ```
17 | 
18 | ### Parameter Server
19 | ```python
20 | import openembedding as embed
21 | embed.flags.master_endpoint = '{ip}:{port}'
22 | _server = embed.Server()
23 | _server.join()
24 | ```
25 | 
26 | ### Worker
27 | ```python
28 | import openembedding.tensorflow as embed
29 | embed.flags.master_endpoint = '{ip}:{port}'
30 | embed.flags.wait_num_servers = num_servers
31 | ```
--------------------------------------------------------------------------------
/documents/en/training.md:
--------------------------------------------------------------------------------
1 | # Training
2 | 
3 | ## Model parallel and data parallel
4 | 
5 | ![training](../images/training.drawio.png)
6 | 
7 | The parallel mode of training is shown in the figure above. The dense part and the high-frequency `Embedding` parameters are stored in mirrored mode on all workers and synchronized by the all-reduce operator, which implements data parallelism. The low-frequency `Embedding` parameters are divided into shards and stored on the servers, which implements model parallelism. Based on the synchronous training mode of the parameter server architecture, workers pull parameters from the servers and push gradients to the servers; each server collects the gradients of all workers in a mini-batch and then updates the parameters with the `Optimizer`.
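
A toy, self-contained illustration of this scheme: the dense parameter is replicated and updated from the combined per-worker gradients (standing in for all-reduce), while sparse embedding rows live in a server-side table that workers pull from and whose gradients the server applies only after collecting the whole mini-batch. Real training goes through `embed.Embedding` and `embed.distributed_optimizer`; every name below is illustrative only.

```python
import numpy as np

DIM, LR = 4, 0.1
table = {}                               # server-side embedding shard
dense_w = np.zeros(DIM)                  # dense parameter, replicated

def pull(keys):
    # workers pull the current embedding rows for their mini-batch
    return {k: table.setdefault(k, np.zeros(DIM)).copy() for k in keys}

batches = [[1, 5], [5, 7]]               # sparse keys seen by worker 0 and 1
sparse_grads, dense_grads = {}, []
for keys in batches:                     # each loop body is one "worker"
    rows = pull(keys)                    # model parallel: pull from server
    for k in rows:                       # np.ones stands in for backprop
        sparse_grads[k] = sparse_grads.get(k, 0) + np.ones(DIM)
    dense_grads.append(np.ones(DIM))

for k, g in sparse_grads.items():        # server updates only after collecting
    table[k] -= LR * g                   # all pushed gradients (plain SGD)
dense_w -= LR * sum(dense_grads)         # data parallel: all-reduce then update
```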
--------------------------------------------------------------------------------
/documents/images/benchmark-server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/benchmark-server.png
--------------------------------------------------------------------------------
/documents/images/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/benchmark.png
--------------------------------------------------------------------------------
/documents/images/pmem_vs_dram_oe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/pmem_vs_dram_oe.png
--------------------------------------------------------------------------------
/documents/images/serving.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/serving.drawio.png
--------------------------------------------------------------------------------
/documents/images/standalone.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/standalone.drawio.png
--------------------------------------------------------------------------------
/documents/images/training.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/training.drawio.png
--------------------------------------------------------------------------------
/documents/papers/openembedding_icde2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/papers/openembedding_icde2023.pdf
--------------------------------------------------------------------------------
/examples/criteo_deepctr_hook.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import deepctr.models
5 | import deepctr.feature_column
6 | import horovod.tensorflow.keras as hvd
7 | import openembedding.tensorflow as embed
8 | print('OpenEmbedding', embed.__version__)
9 | 
10 | 
11 | import argparse
12 | parser = argparse.ArgumentParser()
13 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
14 | parser.add_argument('--data', default=default_data)
15 | parser.add_argument('--optimizer', default='Adam')
16 | parser.add_argument('--model', default='DeepFM')
17 | parser.add_argument('--checkpoint', default='') # include optimizer
18 | parser.add_argument('--load', default='') # include optimizer
19 | parser.add_argument('--save', default='') # not include optimizer
20 | 
21 | parser.add_argument('--batch_size', default=8, type=int)
22 | # Because the example uses a hash table to store the data,
23 | # it does not support exporting to a tensorflow original model.
24 | # parser.add_argument('--export', default='') # not include optimizer
25 | args = parser.parse_args()
26 | if not args.optimizer.endswith(')'):
27 |     args.optimizer += '()' # auto call args.optimizer
28 | 
29 | 
30 | # Hook deepctr.inputs.Embedding.
31 | class HookEmbedding(embed.Embedding):
32 |     def __init__(self, input_dim=-1, output_dim=9,
33 |             embeddings_initializer=None, embeddings_regularizer=None, **kwargs):
34 |         # input_dim = -1 means that the input range is the natural number range of int64 [0, 2**63-1].
35 |         # If input_dim = -1, the server will use a hash table to store the Embedding layer;
36 |         # the server does not support embeddings_regularizer.
37 |         # You can specify the number of global shards by num_shards;
38 |         # num_shards is equal to the number of servers by default.
39 |         super(HookEmbedding, self).__init__(input_dim, output_dim,
40 |             embeddings_initializer=embeddings_initializer,
41 |             activity_regularizer=embeddings_regularizer,
42 |             num_shards=1,
43 |             **kwargs)
44 | import deepctr.inputs
45 | deepctr.inputs.Embedding = HookEmbedding
46 | 
47 | 
48 | # Assign GPU according to rank.
49 | hvd.init()
50 | gpus = tf.config.experimental.list_physical_devices('GPU')
51 | if gpus:
52 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
53 |     tf.config.experimental.set_memory_growth(gpus[hvd.local_rank()], True)
54 | 
55 | 
56 | # Process data.
57 | data = pandas.read_csv(args.data)
58 | n = data.shape[0] // hvd.size() * hvd.size()
59 | data = data.iloc[hvd.rank():n:hvd.size()]
60 | inputs = dict()
61 | feature_columns = list()
62 | for name in data.columns:
63 |     if name[0] == 'C':
64 |         inputs[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) # hash encoding
65 |         feature_columns.append(deepctr.feature_column.SparseFeat(name,
66 |             vocabulary_size=-1, embedding_dim=9, dtype='int64'))
67 |     elif name[0] == 'I':
68 |         inputs[name] = data[name]
69 |         feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32'))
70 | 
71 | 
72 | # Compile distributed model.
73 | optimizer = eval("tf.keras.optimizers." + args.optimizer)
74 | optimizer = embed.distributed_optimizer(optimizer)
75 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum)
76 | model = eval("deepctr.models."
+ args.model)(feature_columns, feature_columns, task='binary') 77 | model = embed.distributed_model(model) 78 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 79 | 80 | 81 | # load --> fit --> save 82 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 83 | hvd.callbacks.MetricAverageCallback() ] 84 | if args.checkpoint and hvd.rank() == 0: 85 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 86 | if args.load: 87 | model.load_weights(args.load) 88 | 89 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2) 90 | 91 | if args.save and hvd.rank() == 0: 92 | model.save(args.save, include_optimizer=False) 93 | -------------------------------------------------------------------------------- /examples/criteo_deepctr_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import deepctr.models 5 | import deepctr.feature_column 6 | import horovod.tensorflow.keras as hvd 7 | import openembedding.tensorflow as embed 8 | print('OpenEmbedding', embed.__version__) 9 | 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 14 | parser.add_argument('--data', default=default_data) 15 | parser.add_argument('--batch_size', default=8, type=int) 16 | parser.add_argument('--optimizer', default='Adam') 17 | parser.add_argument('--model', default='DeepFM') 18 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer 19 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer 20 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer 21 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer 22 | args = parser.parse_args() 23 | if not args.optimizer.endswith(')'): 24 | args.optimizer += '()' # auto call args.optimizer 25 | 26 | 27 | # Assign GPU according to rank. 28 | hvd.init() 29 | gpus = tf.config.experimental.list_physical_devices('GPU') 30 | if gpus: 31 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') 32 | tf.config.experimental.set_memory_growth(gpus[hvd.local_rank()], True) 33 | 34 | 35 | # Process data. 36 | data = pandas.read_csv(args.data) 37 | n = data.shape[0] // hvd.size() * hvd.size() 38 | data = data.iloc[hvd.rank():n:hvd.size()] 39 | inputs = dict() 40 | feature_columns = list() 41 | for name in data.columns: 42 | if name[0] == 'C': 43 | inputs[name] = data[name] % 65536 44 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 45 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 46 | elif name[0] == 'I': 47 | inputs[name] = data[name] 48 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 49 | 50 | 51 | # Compile distributed model. 52 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 53 | optimizer = embed.distributed_optimizer(optimizer) 54 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 55 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary') 56 | model = embed.distributed_model(model) 57 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 58 | 59 | 60 | # load --> fit --> save 61 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 62 | hvd.callbacks.MetricAverageCallback() ] 63 | if args.checkpoint and hvd.rank() == 0: 64 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 65 | if args.load: 66 | model.load_weights(args.load) 67 | 68 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2) 69 | 70 | if args.save and hvd.rank() == 0: 71 | model.save(args.save, include_optimizer=False) 72 | if args.export and hvd.rank() == 0: 73 | model.save_as_original_model(args.export, include_optimizer=False) 74 | -------------------------------------------------------------------------------- /examples/criteo_deepctr_network_mirrored.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import deepctr.models 5 | import deepctr.feature_column 6 | import openembedding.tensorflow as embed 7 | print('OpenEmbedding', embed.__version__) 8 | 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 13 | parser.add_argument('--data', default=default_data) 14 | parser.add_argument('--batch_size', default=8, type=int) 15 | # Currently, MirroredStrategy does not support this. 16 | # parser.add_argument('--prefetch', action='store_true') 17 | parser.add_argument('--optimizer', default='Adam') 18 | parser.add_argument('--model', default='DeepFM') 19 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer 20 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer 21 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer 22 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer 23 | args = parser.parse_args() 24 | if not args.optimizer.endswith(')'): 25 | args.optimizer += '()' # auto call args.optimizer 26 | 27 | # Process data 28 | data = pandas.read_csv(args.data) 29 | data = data.iloc[:data.shape[0] // args.batch_size * args.batch_size] 30 | inputs = dict() 31 | feature_columns = list() 32 | for name in data.columns: 33 | inputs[name] = tf.reshape(data[name], [-1, args.batch_size, 1]) 34 | if name[0] == 'C': 35 | inputs[name] = tf.cast(inputs[name] % 65536, dtype=tf.int64) 36 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 37 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 38 | elif name[0] == 'I': 39 | inputs[name] = tf.cast(inputs[name], dtype=tf.float32) 40 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 41 | train_batch_target = tf.reshape(data['label'], [-1, args.batch_size]) 42 | dataset = tf.data.Dataset.from_tensor_slices((inputs, train_batch_target)) 43 | 44 | 45 | # Compile distributed model 46 | strategy = tf.distribute.MirroredStrategy() 47 | with strategy.scope(): 48 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 49 | optimizer = embed.distributed_optimizer(optimizer) 50 | 51 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary')
52 |     model = embed.distributed_model(model, sparse_as_dense_size=args.batch_size)
53 |     model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
54 | 
55 | 
56 | # load --> fit --> save
57 | callbacks = list()
58 | if args.checkpoint:
59 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
60 | if args.load:
61 |     model.load_weights(args.load)
62 | 
63 | 
64 | # Currently, MirroredStrategy does not support this.
65 | # if args.prefetch:
66 | #     dataset = embed.pulling(dataset, model).prefetch(4)
67 | model.fit(dataset, batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2)
68 | 
69 | if args.save:
70 |     model.save(args.save, include_optimizer=False)
71 | if args.export:
72 |     model.save_as_original_model(args.export, include_optimizer=False)
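
# Note (added comment, not part of the original script): unlike the horovodrun
# examples, tf.distribute.MirroredStrategy runs in a single process and uses
# all GPUs visible to it, so this script is launched directly, e.g.:
#     python3 examples/criteo_deepctr_network_mirrored.py --export tmp/criteo/1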
--------------------------------------------------------------------------------
/examples/criteo_deepctr_network_mpi.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import deepctr.models
5 | import deepctr.feature_column
6 | import openembedding.tensorflow as embed
7 | print('OpenEmbedding', embed.__version__)
8 | 
9 | 
10 | import argparse
11 | parser = argparse.ArgumentParser()
12 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
13 | parser.add_argument('--data', default=default_data)
14 | parser.add_argument('--batch_size', default=8, type=int)
15 | # Currently, MultiWorkerMirroredStrategy does not support this.
16 | # parser.add_argument('--prefetch', action='store_true')
17 | parser.add_argument('--optimizer', default='Adam')
18 | parser.add_argument('--model', default='DeepFM')
19 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer
20 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer
21 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer
22 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer
23 | parser.add_argument('--port', default=50000, type=int)
24 | args = parser.parse_args()
25 | if not args.optimizer.endswith(')'):
26 |     args.optimizer += '()' # auto call args.optimizer
27 | 
28 | 
29 | gpus = tf.config.experimental.list_physical_devices('GPU')
30 | if gpus:
31 |     for gpu in gpus:
32 |         tf.config.experimental.set_memory_growth(gpu, True)
33 | 
34 | 
35 | # Synchronizing distributed configurations using MPI.
36 | import json
37 | import socket
38 | from mpi4py import MPI
39 | comm_rank = MPI.COMM_WORLD.Get_rank()
40 | comm_size = MPI.COMM_WORLD.Get_size()
41 | ip = str(socket.gethostbyname(socket.gethostname()))
42 | ip_port = ip + ':' + str(args.port + comm_rank)
43 | os.environ['TF_CONFIG'] = json.dumps({
44 |     'cluster': { 'worker': MPI.COMM_WORLD.allgather(ip_port) },
45 |     'task': { 'type': 'worker', 'index': comm_rank }
46 | })
47 | strategy = tf.distribute.MultiWorkerMirroredStrategy()
48 | 
49 | 
50 | # Process data.
51 | data = pandas.read_csv(args.data)
52 | data = data.iloc[:data.shape[0] // args.batch_size * args.batch_size]
53 | inputs = dict()
54 | feature_columns = list()
55 | for name in data.columns:
56 |     inputs[name] = tf.reshape(data[name], [-1, args.batch_size, 1])
57 |     if name[0] == 'C':
58 |         inputs[name] = tf.cast(inputs[name] % 65536, dtype=tf.int64)
59 |         feature_columns.append(deepctr.feature_column.SparseFeat(name,
60 |             vocabulary_size=65536, embedding_dim=9, dtype='int64'))
61 |     elif name[0] == 'I':
62 |         inputs[name] = tf.cast(inputs[name], dtype=tf.float32)
63 |         feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32'))
64 | train_batch_target = tf.reshape(data['label'], [-1, args.batch_size])
65 | dataset = tf.data.Dataset.from_tensor_slices((inputs, train_batch_target))
66 | 
67 | 
68 | # Compile distributed model.
69 | with strategy.scope():
70 |     optimizer = eval("tf.keras.optimizers." + args.optimizer)
71 |     optimizer = embed.distributed_optimizer(optimizer)
72 | 
73 |     model = eval("deepctr.models." + args.model)(feature_columns, feature_columns, task='binary')
74 |     model = embed.distributed_model(model, sparse_as_dense_size=args.batch_size)
75 |     model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
76 | 
77 | 
78 | # load --> fit --> save
79 | callbacks = list()
80 | if args.checkpoint:
81 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
82 | if args.load:
83 |     model.load_weights(args.load)
84 | 
85 | 
86 | # Currently, MultiWorkerMirroredStrategy does not support this.
87 | # if args.prefetch:
88 | #     dataset = embed.pulling(dataset, model).prefetch(4)
89 | model.fit(dataset, batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2)
90 | 
91 | if args.save:
92 |     model.save(args.save, include_optimizer=False)
93 | if args.export:
94 |     model.save_as_original_model(args.export, include_optimizer=False)
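
# For reference (added comment, illustrative addresses): with two MPI processes
# on hosts 10.0.0.1 and 10.0.0.2 and the default --port 50000, the TF_CONFIG
# assembled above becomes, on rank 0,
#     {"cluster": {"worker": ["10.0.0.1:50000", "10.0.0.2:50001"]},
#      "task": {"type": "worker", "index": 0}}
# and the same cluster with index 1 on rank 1. Each rank offsets the port by
# its rank so that several workers can share one host.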
--------------------------------------------------------------------------------
/examples/criteo_lr_subclass.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import openembedding.tensorflow as embed
5 | print('OpenEmbedding', embed.__version__)
6 | 
7 | 
8 | class CriteoLR(tf.keras.Model):
9 |     def __init__(self):
10 |         super(CriteoLR, self).__init__()
11 |         # input_dim = -1 means that the input range is the natural number range of int64 [0, 2**63-1].
12 |         # If input_dim = -1, the server will use a hash table to store the Embedding layer.
13 |         self.embeddings = embed.Embedding(input_dim=-1, output_dim=1,
14 |             embeddings_initializer=tf.keras.initializers.Zeros(), num_shards=16)
15 |         self.concatenate = tf.keras.layers.Concatenate()
16 |         self.sigmoid = tf.keras.layers.Dense(1, activation='sigmoid')
17 | 
18 |     def call(self, inputs):
19 |         fields = []
20 |         for name, tensor in inputs.items():
21 |             if name[0] == 'C':
22 |                 fields.append(self.embeddings(tensor))
23 |             else:
24 |                 fields.append(tf.reshape(tensor, [-1, 1, 1]))
25 |         return self.sigmoid(self.concatenate(fields))
26 | 
27 | 
28 | import argparse
29 | parser = argparse.ArgumentParser()
30 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
31 | parser.add_argument('--data', default=default_data)
32 | parser.add_argument('--checkpoint', default='') # include optimizer
33 | parser.add_argument('--load', default='') # include optimizer
34 | parser.add_argument('--save', default='') # not include optimizer
35 | # subclass models do not support exporting to a tensorflow original model
36 | # parser.add_argument('--export', default='') # not include optimizer
37 | args = parser.parse_args()
38 | 
39 | 
40 | # Process data
41 | data = pandas.read_csv(args.data)
42 | inputs = dict()
43 | for name in data.columns:
44 |     if name[0] == 'C':
45 |         inputs[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) # hash encoding
46 |     elif name[0] == 'I':
47 |         inputs[name] = data[name]
48 | 
49 | 
50 | # Compile distributed model
51 | optimizer = tf.keras.optimizers.Adam()
52 | optimizer = embed.distributed_optimizer(optimizer)
53 | model = CriteoLR()
54 | model = embed.distributed_model(model)
55 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
56 | 
57 | 
58 | # load --> fit --> save
59 | callbacks = list()
60 | if args.checkpoint:
61 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
62 | if args.load:
63 |     model.load_weights(args.load)
64 | 
65 | model.fit(inputs, data['label'], batch_size=8, epochs=5, callbacks=callbacks, verbose=2)
66 | if args.save:
67 |     model.save(args.save, include_optimizer=False)
--------------------------------------------------------------------------------
/examples/criteo_preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas
3 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler
4 | 
5 | if len(sys.argv) < 3:
6 |     print("usage: criteo_preprocess.py input_file output_file")
7 |     sys.exit(1)
8 | 
9 | data = pandas.read_csv(sys.argv[1], sep='\t', header=None)
10 | target = ['label']
11 | dense_features = ['I' + str(i) for i in range(1, 14)]
12 | sparse_features = ['C' + str(i) for i in range(1, 27)]
13 | data.columns = target + dense_features + sparse_features
14 | 
15 | data[sparse_features] = data[sparse_features].fillna('-1', )
16 | data[dense_features] = data[dense_features].fillna(0, )
17 | 
18 | for feat in dense_features:
19 |     print(feat, data[feat].min(), data[feat].max())
20 | mms = MinMaxScaler(feature_range=(0, 1))
21 | data[dense_features] = mms.fit_transform(data[dense_features])
22 | for feat in sparse_features:
23 |     lbe = LabelEncoder()
24 |     data[feat] = lbe.fit_transform(data[feat])
25 | 
26 | data.to_csv(sys.argv[2], float_format='%.6f')
--------------------------------------------------------------------------------
/examples/run/criteo_deepctr_checkpoint.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --checkpoint tmp/epoch 4 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --load tmp/epoch4/variables/variables 5 | -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_horovod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_mirrored.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/criteo_deepctr_network_mirrored.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mpirun -np 2 python3 examples/criteo_deepctr_network_mpi.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_restful.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/tensorflow_serving_restful.py 4 | python3 examples/tensorflow_serving_restful.py --rows 1 5 | -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_standalone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/criteo_deepctr_network.py --export tmp/criteo/1 4 | -------------------------------------------------------------------------------- /examples/run/criteo_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | wget https://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz -O tmp/dac_sample.tar.gz 4 | tar -xzf tmp/dac_sample.tar.gz -C tmp 5 | python3 examples/criteo_preprocess.py tmp/dac_sample.txt tmp/dac_sample.csv 6 | python3 examples/criteo_deepctr_hook.py --data tmp/dac_sample.csv --batch_size 256 7 | -------------------------------------------------------------------------------- /examples/tensorflow_serving_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import grpc 4 | import numpy 5 | import pandas 6 | import threading 7 | import tensorflow as tf 8 | 9 | 10 | from tensorflow_serving.apis import predict_pb2 11 | from tensorflow_serving.apis import prediction_service_pb2_grpc 12 | 13 | 14 | class _ResultCounter(object): 15 | def __init__(self, num_tests, concurrency): 16 | self._num_tests = num_tests 17 | self._concurrency = concurrency 18 | self._error = 0 19 | self._done = 0 20 | self._active = 0 21 | self._condition = threading.Condition() 22 | 23 | def inc_error(self): 24 | with self._condition: 25 | self._error += 1 26 | 27 | def inc_done(self): 28 | with self._condition: 29 | self._done += 1 30 | self._condition.notify() 31 | 32 | def dec_active(self): 33 | with self._condition: 34 | self._active -= 1 35 | self._condition.notify() 36 | 37 | def get_error_rate(self): 38 | with self._condition: 39 | while self._done != 
self._num_tests: 40 | self._condition.wait() 41 | return self._error / float(self._num_tests) 42 | 43 | def throttle(self): 44 | with self._condition: 45 | while self._active == self._concurrency: 46 | self._condition.wait() 47 | self._active += 1 48 | 49 | 50 | def _create_rpc_callback(label, result_counter): 51 | def _callback(result_future): 52 | exception = result_future.exception() 53 | if exception: 54 | result_counter.inc_error() 55 | print(exception) 56 | else: 57 | predict = numpy.array(result_future.result().outputs['prediction_layer'].float_val) 58 | print('label = ', label, ', predict = ', int(predict[0] + 0.5), ' ' ,predict[0]) 59 | if label != int(predict[0] + 0.5): 60 | result_counter.inc_error() 61 | result_counter.inc_done() 62 | result_counter.dec_active() 63 | return _callback 64 | 65 | 66 | import argparse 67 | parser = argparse.ArgumentParser() 68 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 69 | parser.add_argument('--data', default=default_data) 70 | parser.add_argument('--hash', action='store_true') 71 | parser.add_argument('--grpc', default='127.0.0.1:8500') 72 | parser.add_argument('--model', default='criteo') 73 | args = parser.parse_args() 74 | 75 | 76 | # process data 77 | data = pandas.read_csv(args.data) 78 | feature_names = list() 79 | for name in data.columns: 80 | if name[0] == 'C': 81 | if args.hash: 82 | data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) 83 | else: 84 | data[name] = data[name] % 65536 85 | feature_names.append(name) 86 | elif name[0] == 'I': 87 | feature_names.append(name) 88 | 89 | 90 | # use TensorFlow Serving 91 | channel = grpc.insecure_channel(args.grpc) 92 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 93 | result_counter = _ResultCounter(data.shape[0], 4) 94 | for i in range(data.shape[0]): 95 | request = predict_pb2.PredictRequest() 96 | request.model_spec.name = args.model 97 | for name in feature_names: 98 | dtype = tf.float32 99 | if name.startswith('C'): 100 | dtype = tf.int64 101 | request.inputs[name].CopyFrom(tf.make_tensor_proto(data[name][i], dtype=dtype, shape=[1, 1])) 102 | result_counter.throttle() 103 | result_future = stub.Predict.future(request, 5.0) # 5 seconds 104 | result_future.add_done_callback( 105 | _create_rpc_callback(data['label'][i], result_counter)) 106 | print('error rate: ', result_counter.get_error_rate()) 107 | -------------------------------------------------------------------------------- /examples/tensorflow_serving_restful.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 7 | parser.add_argument('--data', default=default_data) 8 | parser.add_argument('--rows', type=int, default=None) 9 | parser.add_argument('--hash', action='store_true') 10 | parser.add_argument('--host', default='127.0.0.1:8501') 11 | parser.add_argument('--model', default='criteo') 12 | args = parser.parse_args() 13 | 14 | 15 | # process data 16 | data = pandas.read_csv(args.data, nrows=args.rows) 17 | feature_names = list() 18 | for name in data.columns: 19 | if name[0] == 'C': 20 | if args.hash: 21 | data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) 22 | else: 23 | data[name] = data[name] % 65536 24 | feature_names.append(name) 25 | elif name[0] == 'I': 26 | feature_names.append(name) 27 | 28 | inputs = dict() 29 | for name 
in data.columns: 30 | if name[0] == 'C': 31 | inputs[name] = [[int(value)] for value in data[name]] 32 | elif name[0] == 'I': 33 | inputs[name] = [[float(value)] for value in data[name]] 34 | post = json.dumps({'inputs':inputs}) 35 | command = f"curl -d '{post}' {args.host}/v1/models/{args.model}:predict" 36 | print(command) 37 | result = json.load(os.popen(command)) 38 | print(json.dumps(result)) 39 | 40 | if "outputs" not in result or len(result["outputs"]) != data.shape[0]: 41 | print("get error result!") 42 | exit(1) 43 | -------------------------------------------------------------------------------- /laboratory/benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 2 | python3.7 get-pip.py && \ 3 | rm get-pip.py 4 | 5 | RUN apt-get update && apt-get install -y python3.7-dev 6 | 7 | RUN pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir \ 8 | future \ 9 | grpcio \ 10 | h5py \ 11 | mock \ 12 | numpy \ 13 | requests \ 14 | pandas \ 15 | sklearn \ 16 | deepctr \ 17 | tensorflow==2.2 18 | 19 | RUN apt-get update && apt-get install -y cmake build-essential devscripts debhelper fakeroot 20 | RUN wget https://github.com/NVIDIA/nccl/archive/v2.8.3-1.tar.gz && tar -xzf v2.8.3-1.tar.gz && \ 21 | cd nccl-2.8.3-1 && make -j src.build && make pkg.debian.build 22 | RUN apt-get -y install ./nccl-2.8.3-1/build/pkg/deb/libnccl2_2.8.3-1+cuda10.1_amd64.deb ./nccl-2.8.3-1/build/pkg/deb/libnccl-dev_2.8.3-1+cuda10.1_amd64.deb 23 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir horovod 24 | 25 | WORKDIR /root 26 | RUN apt-get -y install libnuma-dev librdmacm-dev libibverbs-dev 27 | 28 | RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.5.tar.gz && \ 29 | tar -xzf openmpi-4.0.5.tar.gz && cd openmpi-4.0.5 && \ 30 | ./configure --prefix=/usr/local/openmpi CFLAGS="-fPIC" CXXFlAGS="-fPIC" --enable-static && \ 31 | make -j && make install 32 | 33 | RUN apt-get update && apt-get install -y gawk vim libssl-dev tsocks privoxy ssh patchelf 34 | 35 | RUN rm /usr/bin/python && rm /usr/bin/python3 && rm /usr/local/bin/pip && rm /usr/local/bin/pip3 && \ 36 | ln -s /usr/bin/python3.7 /usr/bin/python && \ 37 | ln -s /usr/bin/python3.7 /usr/bin/python3 && \ 38 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip && \ 39 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip3 40 | 41 | RUN pip3.7 install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html 42 | RUN pip3.7 uninstall -y horovod && HOROVOD_GPU_OPERATIONS=NCCL pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir --upgrade horovod 43 | 44 | ENV THRID_PARTY /usr/local 45 | -------------------------------------------------------------------------------- /laboratory/benchmark/analyze.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | data = pandas.read_csv('train_1000w.csv') 3 | 4 | cache = dict() 5 | 6 | feature = list() 7 | batch_size = 4096 8 | all_whole_unique = 0 9 | all_related_unique = 0 10 | for name in data.columns: 11 | if name[0] != 'C': 12 | continue 13 | cache[name] = set() 14 | column = data[name] 15 | whole = 0 16 | whole_unique = 0 17 | related = 0 18 | related_unique = 0 19 | for i in range(1, 100): 20 | prev = set() 21 | for j in range(batch_size): 22 | cache[name].add(column[(i - 1) * batch_size + j]) 23 | 
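# Two bookkeeping sets drive this measurement: cache[name] accumulates every
# id ever seen for the column, while prev holds only the previous batch's ids.
# Each new batch is compared against both to estimate how many embedding
# lookups a cross-batch cache could have served.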
prev.add(column[(i - 1) * batch_size + j]) 24 | rlt = list() 25 | whl = column[i * batch_size: (i + 1) * batch_size] 26 | cache_hit = list() 27 | for key in whl: 28 | if key in prev: 29 | rlt.append(key) 30 | if key in cache[name]: 31 | cache_hit.append(key) 32 | whole += len(whl) 33 | whole_unique += len(set(whl)) 34 | related += len(rlt) 35 | related_unique += len(set(rlt)) 36 | if i == 64: 37 | print(name, data[name].max() + 1, len(set(whl)), len(set(cache_hit))) 38 | feature.append([name, [whole, whole_unique, related, related_unique]]) 39 | all_whole_unique += whole_unique 40 | all_related_unique += related_unique 41 | print(name, whole, related) 42 | print(name, whole_unique, related_unique) 43 | 44 | print() 45 | print(all_whole_unique, all_related_unique) 46 | feature = sorted(feature, key=lambda x: x[1][1]) 47 | for name, values in feature: 48 | print(name, values[1] / values[0], values[2] / values[0], values[3] / values[1]) 49 | -------------------------------------------------------------------------------- /laboratory/benchmark/benchmark.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # paddle 2.1 3 | # git clone paddleRec -b 2.0 4 | -------------------------------------------------------------------------------- /laboratory/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | # cuda = 10.1 2 | # torch = 1.7 3 | # tensorflow = 2.2 4 | 5 | import os 6 | import sys 7 | import time 8 | 9 | def run_remote_server(user, ip, port): 10 | os.system('echo "bash run_server.sh {}:{}\n sleep 1\n exit\n" | ssh {}@{}'.format(ip, port, user, ip)) 11 | 12 | def run(py, data, model, embedding_dim, options, np=1, bind_ip=None, master_endpoint=None): 13 | if data.endswith('csv'): 14 | extend = 'csv' 15 | else: 16 | extend = 'tf' 17 | name = 'result/{}_{}_{}_{}'.format(py, extend, model, embedding_dim) 18 | command = 'horovodrun -np {} python3.7 {}.py'.format(np, py) 19 | command += ' --data {} --model {} --embedding_dim {}'.format(data, model, embedding_dim) 20 | for option in options: 21 | name += '_{}'.format(option) 22 | command += ' --{}'.format(option) 23 | if master_endpoint: 24 | name += '_remote' 25 | command += ' --master_endpoint {}'.format(master_endpoint) 26 | if bind_ip: 27 | command += ' --bind_ip {}'.format(bind_ip) 28 | name += '_' + str(np) 29 | command += ' 1>{}.out 2>{}.err'.format(name, name) 30 | print(command) 31 | os.system(command) 32 | time.sleep(1) 33 | 34 | 35 | if len(sys.argv) > 3: 36 | # remote 37 | user = sys.argv[1] 38 | remote_ip = sys.argv[2] 39 | bind_ip = sys.argv[3] 40 | port = 61000 41 | for model in ['WDL', 'DeepFM']: 42 | for embedding_dim in [9, 64]: 43 | for options in [['server'], ['server', 'cache'], ['server', 'cache', 'prefetch']]: 44 | for np in [1, 2, 4, 8]: 45 | port += 1 46 | time.sleep(60) 47 | run_remote_server(user, remote_ip, port) 48 | time.sleep(60) 49 | run('deepctr_criteo', 'tfrecord', model, embedding_dim, options, np=np, 50 | bind_ip=bind_ip, master_endpoint='{}:{}'.format(remote_ip, port)) 51 | else: 52 | #local 53 | for data in ['tfrecord', 'train.csv']: 54 | for model in ['WDL', 'DeepFM', 'xDeepFM']: 55 | for embedding_dim in [9, 64]: 56 | for options in [[], ['server'], ['server', 'cache'], ['server', 'cache', 'prefetch']]: 57 | for np in [1, 2, 4, 8]: 58 | run('deepctr_criteo', data, model, embedding_dim, options, np=np) 59 | run('deepctr_criteo', data, model, embedding_dim, options + ['cpu'], np=1) 60 | 61 | for model in 
['WDL', 'DeepFM']: 62 | for embedding_dim in [9, 64]: 63 | for np in [1, 2, 4, 8]: 64 | run('deepctr_criteo_torch', data, model, embedding_dim, [], np=np) 65 | 66 | for model in ['WDL', 'DeepFM']: 67 | for embedding_dim in [9, 64]: 68 | run('deepctr_criteo_torch', data, model, embedding_dim, ['cpu'], np=1) 69 | 70 | -------------------------------------------------------------------------------- /laboratory/benchmark/parse_tensor_board.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | profile = json.loads(open(sys.argv[1]).read()) 5 | 6 | timeline = dict() 7 | for event in profile['traceEvents']: 8 | if 'name' in event and 'ts' in event: 9 | p = event['name'].rfind(':') 10 | name = event['name'][p + 1:] 11 | timeline.setdefault(name, list()) 12 | timeline[name].append(event) 13 | 14 | for name, events in timeline.items(): 15 | l = min(event['ts'] for event in events) 16 | r = max(event['ts'] + event['dur'] for event in events) 17 | s = sum(event['dur'] for event in events) 18 | c = len(events) 19 | print(name, int(l / 1000), int(r / 1000), int(s / 1000), c) 20 | 21 | -------------------------------------------------------------------------------- /laboratory/benchmark/summary.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | times = dict() 4 | 5 | for name in os.listdir(sys.argv[1]): 6 | time = 100000000 7 | for line in open(sys.argv[1] + '/' + name): 8 | r = line.find('s - loss') 9 | l = line.find('-') 10 | if l > r: 11 | l = line.find(':') 12 | if l != -1 and r != -1: 13 | time = min(time, int(line[l+1:r])) 14 | sp = len(name) - 6 15 | key, np = name[:sp], name[sp:] 16 | times.setdefault(key, [0, 0, 0, 0]) 17 | if np == '_1.out': 18 | times[key][0] = time 19 | if np == '_2.out': 20 | times[key][1] = time 21 | if np == '_4.out': 22 | times[key][2] = time 23 | if np == '_8.out': 24 | times[key][3] = time 25 | 26 | for key, value in sorted(times.items()): 27 | print(key, *value) -------------------------------------------------------------------------------- /laboratory/benchmark/tensornet.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # tensorflow=2.2 3 | # python3.7 4 | # git clone tensornet -b 1.1 5 | # cp to /usr/local/lib/python3.7 6 | # openmpi4.0 7 | # LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 8 | # tensornet/WORKSPACE 9 | # tensornet/examples 10 | 11 | -------------------------------------------------------------------------------- /laboratory/inject/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.2.0-gpu 2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake 3 | RUN pip install horovod 4 | ADD . 
/openembedding 5 | WORKDIR /openembedding/laboratory/inject 6 | 7 | RUN bash inject.sh 8 | WORKDIR /root 9 | -------------------------------------------------------------------------------- /laboratory/inject/inject.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | site=`python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"` 3 | cat "openembedding_inject_tensorflow.py" > "${site}/openembedding_inject_tensorflow.py" 4 | cat "sitecustomize.py" > "/usr/lib/python3.6/sitecustomize.py" 5 | 6 | python=python3.6 7 | which_python=`which ${python}` 8 | which_pythonm=`which ${python}m` 9 | 10 | cat python > "${which_python}" 11 | echo "${which_pythonm}" '"${args[@]}"' >> "$which_python" 12 | 13 | 14 | pico_compile criteo_deepctr_network.py -o pico_network_model.py 15 | pico_run -np 4 pico_network_model.py 16 | -------------------------------------------------------------------------------- /laboratory/inject/network_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import horovod.tensorflow.keras as hvd 5 | import deepctr.models 6 | import deepctr.feature_column 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 11 | parser.add_argument('--data', default=default_data) # input data file 12 | parser.add_argument('--optimizer', default='Adam') 13 | parser.add_argument('--model', default='DeepFM') 14 | parser.add_argument('--checkpoint', default='', help='path to save checkpoints') # includes the optimizer 15 | parser.add_argument('--load', default='', help='checkpoint path to restore') # includes the optimizer 16 | parser.add_argument('--save', default='', help='path to save the distributed serving model') # does not include the optimizer 17 | args = parser.parse_args() 18 | if not args.optimizer.endswith(')'): 19 | args.optimizer += '()' # auto call args.optimizer 20 | 21 | 22 | # process data 23 | hvd.init() 24 | data = pandas.read_csv(args.data) 25 | n = data.shape[0] // hvd.size() 26 | data = data.iloc[hvd.rank() * n: hvd.rank() * n + n] 27 | inputs = dict() 28 | feature_columns = list() 29 | for name in data.columns: 30 | if name[0] == 'C': 31 | inputs[name] = data[name] % 65536 32 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 33 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 34 | elif name[0] == 'I': 35 | inputs[name] = data[name] 36 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 37 | 38 | 39 | # compile distributed model 40 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 41 | optimizer = hvd.DistributedOptimizer(optimizer) 42 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary') 43 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 44 | 45 | 46 | # load --> fit --> save 47 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 48 | hvd.callbacks.MetricAverageCallback() ] 49 | if args.checkpoint and hvd.rank() == 0: 50 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 51 | if args.load: 52 | model.load_weights(args.load) 53 | 54 | model.fit(inputs, data['label'], batch_size=8, epochs=5, callbacks=callbacks, verbose=1) 55 | 56 | if args.save and hvd.rank() == 0: 57 | model.save(args.save, include_optimizer=False) 58 | -------------------------------------------------------------------------------- /laboratory/inject/openembedding_inject_tensorflow.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | try: 5 | from tensorflow.python import keras 6 | import tensorflow as tf 7 | import openembedding.tensorflow as embed 8 | except ImportError: 9 | pass 10 | else: 11 | class Embedding(embed.Embedding): 12 | def __init__(self, *args, **kwargs): 13 | explicit = kwargs.pop('explicit', False) 14 | super().__init__(*args, explicit=explicit, **kwargs) 15 | 16 | keras.layers.Embedding = Embedding 17 | tf.keras.layers.Embedding = Embedding 18 | keras.models.Model = embed.Model 19 | tf.keras.models.Model = embed.Model 20 | 21 | _NotExplicitClass = dict() 22 | def _NotExplicit(T): 23 | class _Optimizer(T): 24 | def __init__(self, *args, **kwargs): 25 | self.__Class = _NotExplicitClass[T] 26 | explicit = kwargs.pop('explicit', False) 27 | super(self.__Class, self).__init__(*args, explicit=explicit, **kwargs) 28 | 29 | if T not in _NotExplicitClass: 30 | _NotExplicitClass[T] = type(T.__name__, (T,), dict(_Optimizer.__dict__)) 31 | return _NotExplicitClass[T] 32 | 33 | tf.keras.optimizers.Adadelta = _NotExplicit(embed.Adadelta) 34 | tf.keras.optimizers.Adagrad = _NotExplicit(embed.Adagrad) 35 | tf.keras.optimizers.Adam = _NotExplicit(embed.Adam) 36 | tf.keras.optimizers.Adamax = _NotExplicit(embed.Adamax) 37 | tf.keras.optimizers.Ftrl = _NotExplicit(embed.Ftrl) 38 | tf.keras.optimizers.Nadam = _NotExplicit(embed.Nadam) 39 | tf.keras.optimizers.RMSprop = _NotExplicit(embed.RMSprop) 40 | tf.keras.optimizers.SGD = _NotExplicit(embed.SGD) 41 | 42 | -------------------------------------------------------------------------------- /laboratory/inject/python: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | args=() 3 | until [ $# == '0' ]; do 4 | arg="$1" 5 | args[${#args[@]}]="$arg" 6 | shift 7 | if [ "X$file" == "X" ]; then 8 | case "$arg" in 9 | --* ) 10 | if [ "X$arg" == "X--help" ]; then 11 | help="1" 12 | fi 13 | ;; 14 | -* ) 15 | for i in `seq ${#arg}`; do 16 | case "${arg:$i:1}" in 17 | c ) 18 | command="1" 19 | if [ "$i" == "${#arg}" ] && [ $# != '0' ]; then 20 | args[${#args[@]}]="$1" 21 | shift 22 | fi 23 | ;; 24 | m ) 25 | model="1" 26 | if [ "$i" == "${#arg}" ] && [ $# != '0' ]; then 27 | args[${#args[@]}]="$1" 28 | shift 29 | fi 30 | ;; 31 | h ) 32 | help="1" 33 | esac 34 | done 35 | ;; 36 | * ) 37 | if [ "X$model" == "X" ] && [ "X$command" == "X" ] && [ "X$help" == "X" ]; then 38 | file="1" 39 | if grep -q "import tensorflow" "$arg" 2>/dev/null; then 40 | export HYPEREMBEDDING_INJECT_TENSORFLOW="1" 41 | fi 42 | fi 43 | esac 44 | fi 45 | done 46 | -------------------------------------------------------------------------------- /laboratory/inject/sitecustomize.py: -------------------------------------------------------------------------------- 1 | # install the apport exception handler if available 2 | try: 3 | import apport_python_hook 4 | except ImportError: 5 | pass 6 | else: 7 | apport_python_hook.install() 8 | 9 | import os 10 | if os.environ.get('HYPEREMBEDDING_INJECT_TENSORFLOW', None) == '1': 11 | import sys 12 | sys.argv=[""] 13 | import openembedding_inject_tensorflow 14 | -------------------------------------------------------------------------------- /laboratory/onnx/criteo_deepctr_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas 3 | import torch 4 | # import horovod.torch as hvd 5 | import time 6 | import numpy as np 7 | import sklearn 8 | import deepctr_torch as deepctr 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data', required=True) 13 | parser.add_argument('--optimizer', default='Adagrad', choices=['Adagrad']) 14 | parser.add_argument('--model', default="DeepFM", choices=["WDL", 'DeepFM', 'XDeepFM']) 15 | parser.add_argument('--embedding_dim', default=9, type=int) 16 | parser.add_argument('--batch_size', default=4096, type=int) 17 | parser.add_argument('--epochs', default=2, type=int) 18 | parser.add_argument('--onnx', action='store_true') 19 | parser.add_argument('--cpu', action='store_true') 20 | args = parser.parse_args() 21 | # hvd.init() 22 | # if args.cpu: 23 | # device = 'cpu' 24 | # else: 25 | # # torch.cuda.set_device(hvd.local_rank()) 26 | # device = 'cuda:{}'.format(hvd.local_rank()) 27 | 28 | device = 'cuda' 29 | def train_model(model, x, y, batch_size, epochs=1, optimizer=torch.optim.Adagrad): 30 | x = [np.expand_dims(tensor, 1) for tensor in x] 31 | x = torch.from_numpy(np.concatenate(x, axis=-1)) 32 | y = torch.from_numpy(y) 33 | train_tensor_data = torch.utils.data.TensorDataset(x, y) 34 | train_loader = torch.utils.data.DataLoader(dataset=train_tensor_data, batch_size=batch_size) 35 | loss_func = torch.nn.functional.binary_cross_entropy 36 | for epoch in range(epochs): 37 | start_time = time.time() 38 | epoch_loss = 0.0 39 | epoch_auc = 0.0 40 | for x_train, y_train in train_loader: 41 | x_train = x_train.to(device).float() 42 | y_train = y_train.to(device).float() 43 | y_pred = model(x_train).to(device).squeeze() 44 | optimizer.zero_grad() 45 | loss = loss_func(y_pred, y_train.squeeze(), reduction='sum') 46 | epoch_loss += loss.item() 47 | loss.backward() 48 | optimizer.step() 49 | # 
train_result["AUC"].append(sklearn.metrics.roc_auc_score( 50 | # y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64"))) 51 | 52 | epoch_time = int(time.time() - start_time) 53 | print('Epoch {0}/{1}'.format(epoch + 1, epochs)) 54 | eval_str = "{0}s - loss: {1: .4f}".format(epoch_time, epoch_loss) 55 | # eval_str += " - " + name + ": {0: .4f}".format(epoch_logs[name]) 56 | print(eval_str) 57 | 58 | 59 | if __name__ == "__main__": 60 | data = pandas.read_csv(args.data) 61 | num_lines = data.shape[0] 62 | num_local_lines = num_lines // args.batch_size * args.batch_size 63 | local_start = 0 64 | # num_local_lines = int(num_lines / hvd.size()) // args.batch_size * args.batch_size 65 | # local_start = hvd.local_rank() * num_local_lines 66 | local_end = local_start + num_local_lines 67 | print("num_lines:%d, num_local_lines:%d" % (num_lines, num_local_lines)) 68 | print("local_start:%d, local_end:%d" % (local_start, local_end)) 69 | 70 | target = ['label'] 71 | dense_features = ['I' + str(i) for i in range(1, 14)] 72 | sparse_features = ['C' + str(i) for i in range(1, 27)] 73 | print(data.columns) 74 | 75 | feature_columns = [] 76 | for name in sparse_features: 77 | feature_columns.append(deepctr.inputs.SparseFeat(name, data[name].max() + 1, dtype='int64')) 78 | for name in dense_features: 79 | feature_columns.append(deepctr.inputs.DenseFeat(name, 1, dtype='float32')) 80 | train = data.iloc[local_start:local_end] 81 | train_model_input = {name:train[name] for name in sparse_features + dense_features} 82 | 83 | if args.model == 'WDL': 84 | fc_sizes = (512, 256, 128, 32) 85 | elif args.model in {'DeepFM', 'xDeepFM'}: 86 | fc_sizes = (400, 400, 400) 87 | else: 88 | print("unknown model ", args.model) 89 | model = eval("deepctr.models." + args.model)(feature_columns, feature_columns, device=device, 90 | task='binary', dnn_hidden_units=fc_sizes, l2_reg_linear=0, l2_reg_embedding=0) 91 | x = [train_model_input[name] for name in model.feature_index] 92 | if args.onnx: 93 | from onnxruntime.training.ortmodule import ORTModule 94 | model = ORTModule(model) 95 | optimizer=torch.optim.Adagrad(model.parameters()) 96 | train_model(model, x, train[target].values, 97 | batch_size=args.batch_size, epochs=args.epochs, optimizer=optimizer) 98 | -------------------------------------------------------------------------------- /laboratory/publish-serving.sh: -------------------------------------------------------------------------------- 1 | echo $1 2 | target=$1/tensorflow_serving/custom_ops 3 | if [ "X$1" != "X" ]; then 4 | mkdir -p "$target" 5 | mkdir -p "$target/openembedding" 6 | mkdir -p "$target/openembedding/core" 7 | mkdir -p "$target/openembedding/tensorflow" 8 | cp "./build/openembedding/core/libcexb_pack.so" "$target/openembedding/core/libcexb_pack.so" 9 | cp "./openembedding/core/c_api.h" "$target/openembedding/core/c_api.h" 10 | cp "./openembedding/tensorflow/exb_ops.cpp" "$target/openembedding/tensorflow/exb_ops.cpp" 11 | cp "./openembedding/tensorflow/exb_ops.cpp" "$target/openembedding/tensorflow/exb_ops.h" 12 | fi 13 | -------------------------------------------------------------------------------- /laboratory/strangedemo/Dockerfile.criteo: -------------------------------------------------------------------------------- 1 | FROM openembedding-demo:2.2 2 | ADD train.csv /root/train.csv 3 | ADD dac_sample.csv /root/dac_sample.csv 4 | ADD laboratory/strangedemo/criteo_predict.py /root/criteo_predict.py 5 | ADD laboratory/strangedemo/criteo_deepctr /root/criteo_deepctr 6 | ADD 
laboratory/strangedemo/criteo_deepctr_np /root/criteo_deepctr_np 7 | ADD laboratory/strangedemo/criteo_lr /root/criteo_lr 8 | 9 | RUN ln -s /root/train.csv /root/criteo_deepctr/train.csv && \ 10 | ln -s /root/dac_sample.csv /root/criteo_deepctr/dac_sample.csv && \ 11 | ln -s /root/criteo_predict.py /root/criteo_deepctr/criteo_predict.py 12 | 13 | RUN ln -s /root/train.csv /root/criteo_deepctr_np/train.csv && \ 14 | ln -s /root/dac_sample.csv /root/criteo_deepctr_np/dac_sample.csv && \ 15 | ln -s /root/criteo_predict.py /root/criteo_deepctr_np/criteo_predict.py 16 | 17 | RUN ln -s /root/train.csv /root/criteo_lr/train.csv && \ 18 | ln -s /root/dac_sample.csv /root/criteo_lr/dac_sample.csv && \ 19 | ln -s /root/criteo_predict.py /root/criteo_lr/criteo_predict.py 20 | WORKDIR /root 21 | -------------------------------------------------------------------------------- /laboratory/strangedemo/Dockerfile.push: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | RUN apt-get update && apt-get install -y vim apt-transport-https ca-certificates curl gnupg-agent software-properties-common wget 3 | RUN curl -sSL https://get.daocloud.io/docker | sh 4 | RUN wget https://mirror.azure.cn/kubernetes/helm/helm-v2.14.1-linux-amd64.tar.gz && \ 5 | tar -xzf helm-v2.14.1-linux-amd64.tar.gz && cp linux-amd64/helm linux-amd64/tiller /usr/local/bin && \ 6 | rm -rf linux-amd64 helm-v2.14.1-linux-amd64.tar.gz 7 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr/criteo_deepctr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3" 14 | }, 15 | "orig_nbformat": 4, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3.7.3 64-bit" 19 | }, 20 | "interpreter": { 21 | "hash": "51c43b68502c46154a57a0f411be94ca0e84f1091eab4730ae0fb62cf38c2f81" 22 | } 23 | }, 24 | "nbformat": 4, 25 | "nbformat_minor": 2, 26 | "cells": [ 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!python3 criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!mlcompile criteo_deepctr.py -o ml_criteo_deepctr.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!GLOG_minloglevel=1 mlrun -np 4 python3 ml_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!python3 criteo_predict.py --data train.csv --rows 1 --model criteo_model --host {serving address}" 61 | ] 62 | } 63 | ] 64 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr/criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import 
deepctr.models 4 | import deepctr.feature_column 5 | 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--data', required=True) # input data file 9 | parser.add_argument('--learning_rate', type=float, required=True) 10 | parser.add_argument('--batch_size', type=int, required=True) 11 | parser.add_argument('--save', required=True) 12 | args = parser.parse_args() 13 | 14 | data = pandas.read_csv(args.data) 15 | inputs = dict() 16 | feature_columns = list() 17 | for name in data.columns: 18 | if name[0] == 'C': 19 | inputs[name] = data[name] 20 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 21 | vocabulary_size=data[name].max() + 1, embedding_dim=4, dtype='int64')) 22 | elif name[0] == 'I': 23 | inputs[name] = data[name] 24 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 25 | 26 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 27 | model = deepctr.models.xDeepFM(feature_columns, feature_columns, task='binary', 28 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 29 | model.compile(optimizer, 'binary_crossentropy', metrics=['AUC']) 30 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=3, verbose=2) 31 | model.save(args.save, overwrite=True, include_optimizer=False) 32 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import deepctr.models 4 | import deepctr.feature_column 5 | 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--data', required=True) # input data file 9 | parser.add_argument('--learning_rate', type=float, required=True) 10 | parser.add_argument('--batch_size', type=int, required=True) 11 | parser.add_argument('--save', required=True) 12 | args = parser.parse_args() 13 | 14 | data = pandas.read_csv(args.data) 15 | inputs = dict() 16 | feature_columns = list() 17 | for name in data.columns: 18 | if name[0] == 'C': 19 | inputs[name] = data[name] 20 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 21 | vocabulary_size=data[name].max() + 1, embedding_dim=64, dtype='int64')) 22 | elif name[0] == 'I': 23 | inputs[name] = data[name] 24 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 25 | 26 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 27 | model = deepctr.models.DeepFM(feature_columns, feature_columns, task='binary', 28 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 29 | model.compile(optimizer, 'binary_crossentropy', metrics=['AUC']) 30 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=3, verbose=2) 31 | model.save(args.save, overwrite=True, include_optimizer=False) 32 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/criteo_deepctr_np.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": 3 14 | }, 15 | "orig_nbformat": 4 16 | }, 17 | "nbformat": 4, 18 | "nbformat_minor": 2, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 
"execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!horovodrun -np 4 python3 horovod_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "!mlcompile criteo_deepctr.py -o ml_criteo_deepctr.py" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!GLOG_minloglevel=1 mlrun -np 4 python3 ml_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!python3 criteo_predict.py --data train.csv --rows 1 --model criteo_model --host {serving地址}" 54 | ] 55 | } 56 | ] 57 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/horovod_criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import deepctr.models 4 | import deepctr.feature_column 5 | import horovod.tensorflow.keras as hvd 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--data', required=True) # 输入的数据文件 10 | parser.add_argument('--learning_rate', required=True, type=float) 11 | parser.add_argument('--batch_size', required=True, type=int) 12 | parser.add_argument('--save', required=True) 13 | args = parser.parse_args() 14 | 15 | hvd.init() 16 | gpus = tf.config.experimental.list_physical_devices('GPU') 17 | if gpus: 18 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU') 19 | 20 | data = pandas.read_csv(args.data) 21 | inputs = dict() 22 | feature_columns = list() 23 | for name in data.columns: 24 | if name[0] == 'C': 25 | inputs[name] = data[name] 26 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 27 | vocabulary_size=data[name].max() + 1, embedding_dim=64, dtype='int64')) 28 | elif name[0] == 'I': 29 | inputs[name] = data[name] 30 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 31 | 32 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 33 | model = deepctr.models.DeepFM(feature_columns, feature_columns, task='binary', 34 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 35 | 36 | # 使用 horovod 实现数据并行 37 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 38 | n = data.shape[0] // hvd.size() * hvd.size() 39 | for key in inputs.keys(): 40 | inputs[key] = inputs[key][hvd.rank():n:hvd.size()] 41 | labels = data['label'][hvd.rank():n:hvd.size()] 42 | callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0), 43 | hvd.callbacks.MetricAverageCallback()] 44 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC']) 45 | model.fit(inputs, labels, callbacks=callbacks, 46 | batch_size=args.batch_size, epochs=3, verbose=2) 47 | 48 | if hvd.rank() == 0: 49 | model.save(args.save, overwrite=True, include_optimizer=False) 50 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_lr/criteo_lr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | 
"version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3" 14 | }, 15 | "orig_nbformat": 4, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3.7.3 64-bit" 19 | }, 20 | "interpreter": { 21 | "hash": "51c43b68502c46154a57a0f411be94ca0e84f1091eab4730ae0fb62cf38c2f81" 22 | } 23 | }, 24 | "nbformat": 4, 25 | "nbformat_minor": 2, 26 | "cells": [ 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!python3 criteo_lr.py --data dac_sample.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!mlcompile criteo_lr.py -o ml_criteo_lr.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!GLOG_minloglevel=1 python3 ml_criteo_lr.py --data dac_sample.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!python3 criteo_predict.py --data dac_sample.csv --rows 1 --model criteo_model --host {serving地址}" 61 | ] 62 | } 63 | ] 64 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_lr/criteo_lr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | 4 | def CriteoLR(features, input_dim): 5 | embeddings = tf.keras.layers.Embedding(input_dim, 1, 6 | embeddings_initializer=tf.keras.initializers.Zeros()) 7 | fields = list() 8 | for name, tensor in features.items(): 9 | if name[0] == 'C': 10 | fields.append(embeddings(tensor)) 11 | else: 12 | fields.append(tf.reshape(tensor, [-1, 1, 1])) 13 | concat = tf.keras.layers.concatenate(fields) 14 | output = tf.keras.layers.Dense(1, activation='sigmoid')(concat) 15 | return tf.keras.models.Model(inputs=features, outputs=[output]) 16 | 17 | import argparse 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data', required=True) 20 | parser.add_argument('--learning_rate', type=float, required=True) 21 | parser.add_argument('--batch_size', type=int, required=True) 22 | parser.add_argument('--save', required=True) 23 | args = parser.parse_args() 24 | data = pandas.read_csv(args.data) 25 | inputs = dict() 26 | features = dict() 27 | vocabulary_size = 0 28 | for name in data.columns: 29 | if name[0] == 'C': 30 | inputs[name] = data[name] + vocabulary_size 31 | features[name] = tf.keras.Input(shape=[1], name=name, dtype=tf.int64) 32 | vocabulary_size += data[name].max() + 1 33 | elif name[0] == 'I': 34 | inputs[name] = data[name] 35 | features[name] = tf.keras.Input(shape=[1], name=name, dtype=tf.float32) 36 | 37 | # compile distributed model 38 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 39 | model = CriteoLR(features, vocabulary_size) 40 | model.compile(optimizer, 'binary_crossentropy') 41 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, verbose=2) 42 | model.save(args.save, overwrite=True, include_optimizer=False) 43 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_predict.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--data', required=True) 7 | parser.add_argument('--rows', type=int, required=True) 8 | parser.add_argument('--model', required=True) 9 | parser.add_argument('--host', required=True) 10 | args = parser.parse_args() 11 | data = pandas.read_csv(args.data, nrows=args.rows) 12 | 13 | inputs = dict() 14 | for name in data.columns: 15 | if name[0] == 'C': 16 | inputs[name] = [[int(value)] for value in data[name]] 17 | elif name[0] == 'I': 18 | inputs[name] = [[float(value)] for value in data[name]] 19 | post = json.dumps({'inputs':inputs}) 20 | command = f"curl -d '{post}' {args.host}/v1/models/{args.model}:predict" 21 | print(command) 22 | os.system(command) 23 | 24 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.2.0-gpu 2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake 3 | RUN apt-get install -y vim wget 4 | RUN pip3 install horovod pandas scikit-learn deepctr 5 | RUN pip3 install jupyter jupyterlab 6 | 7 | ADD openembedding-0.1.0.tar.gz /openembedding/openembedding-0.1.0.tar.gz 8 | RUN pip3 install /openembedding/openembedding-0.1.0.tar.gz 9 | ADD laboratory/strangedemo/hook /openembedding/hook 10 | WORKDIR /openembedding/hook 11 | RUN bash install.sh 12 | WORKDIR /root 13 | RUN rm -rf /openembedding -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/install.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | site=`python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"` 3 | cp openembedding_hook_tensorflow.py ${site}/ 4 | cp mlcompile /usr/local/bin/ 5 | cp mlrun /usr/local/bin/ 6 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/mlcompile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" != "3" ]; then 3 | echo -e "Usage: mlcompile file.py -o out.py" 4 | exit 1 5 | fi 6 | 7 | echo import openembedding_hook_tensorflow >"$3" 8 | cat "$1" >> "$3" 9 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/mlrun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | horovodrun "$@" 3 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/openembedding_hook_tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python import keras 3 | from tensorflow.keras.models import Model as KerasModel 4 | from tensorflow.keras.layers import Embedding as KerasEmbedding 5 | import horovod.tensorflow.keras as hvd 6 | import openembedding.tensorflow as embed 7 | 8 | 9 | class Embedding(embed.Embedding): 10 | def __init__(self, *args, **kwargs): 11 | explicit = kwargs.pop('explicit', False) 12 | super().__init__(*args, explicit=explicit, **kwargs) 13 | 14 | 15 | class Model(embed.Model): 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | def compile(self, optimizer, *args, 
**kwargs): 20 | kwargs.pop('experimental_run_tf_function', None) 21 | optimizer = embed.distributed_optimizer(optimizer, explicit=False) 22 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 23 | return super().compile(optimizer, *args, experimental_run_tf_function=False, **kwargs) 24 | 25 | def save(self, *args, **kwargs): 26 | if hvd.rank() == 0: 27 | keras.layers.Embedding = KerasEmbedding 28 | tf.keras.layers.Embedding = KerasEmbedding 29 | keras.Model = KerasModel 30 | tf.keras.Model = KerasModel 31 | keras.models.Model = KerasModel 32 | tf.keras.models.Model = KerasModel 33 | super().save_as_original_model(*args, **kwargs) 34 | keras.layers.Embedding = Embedding 35 | tf.keras.layers.Embedding = Embedding 36 | keras.Model = Model 37 | tf.keras.Model = Model 38 | keras.models.Model = Model 39 | tf.keras.models.Model = Model 40 | 41 | 42 | def save_weights(self, *args, **kwargs): 43 | if hvd.rank() == 0: 44 | super().save_weights(*args, **kwargs) 45 | 46 | def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, *args, **kwargs): 47 | if isinstance(x, dict): 48 | x1 = dict() 49 | n = len(y) // hvd.size() * hvd.size() 50 | for key, value in x.items(): 51 | x1[key] = value[hvd.rank():n:hvd.size()] 52 | y1 = y[hvd.rank():n:hvd.size()] 53 | else: 54 | raise ValueError('only support dict input') 55 | if not callbacks: 56 | callbacks = [] 57 | callbacks = callbacks + [ 58 | hvd.callbacks.BroadcastGlobalVariablesCallback(0), 59 | hvd.callbacks.MetricAverageCallback() ] 60 | return super().fit(x1, y1, batch_size, epochs, verbose, callbacks=callbacks, *args, **kwargs) 61 | 62 | 63 | keras.layers.Embedding = Embedding 64 | tf.keras.layers.Embedding = Embedding 65 | keras.Model = Model 66 | tf.keras.Model = Model 67 | keras.models.Model = Model 68 | tf.keras.models.Model = Model 69 | 70 | hvd.init() 71 | gpus = tf.config.experimental.list_physical_devices('GPU') 72 | if gpus: 73 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU') 74 | -------------------------------------------------------------------------------- /openembedding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(client server variable entry) 2 | 3 | file(GLOB exb_src client/*.cpp server/*.cpp variable/EmbeddingVariable.cpp entry/c_api.cc) 4 | add_library(cexb_obj OBJECT ${exb_src}) 5 | 6 | # cexb_static for tensorflow ops 7 | add_library(cexb_static STATIC $<TARGET_OBJECTS:cexb_obj>) 8 | target_link_libraries(cexb_static pico_ps_static pico_core_static 9 | ${PicoCoreDep_STATIC_LIBRARIES} ${Jemalloc_pic_STATIC_LIBRARIES} 10 | ${RDMA_LIBRARIES} ${PMEM_STATIC_LIBRARIES} ${NDCTL_LIBRARIES} dl) 11 | 12 | # cexb without static libraries 13 | # add_library(cexb SHARED $<TARGET_OBJECTS:cexb_obj>) 14 | # target_link_libraries(cexb pico_ps pico_core ${PicoCoreDep_LIBRARIES} ${Jemalloc_pic_LIBRARIES} ${RDMA_LIBRARIES} dl) 15 | 16 | # cexb_pack with static libraries for tensorflow-serving 17 | add_library(cexb_pack SHARED $<TARGET_OBJECTS:cexb_obj>) 18 | target_link_libraries(cexb_pack pico_ps_static pico_core_static 19 | ${PicoCoreDep_STATIC_LIBRARIES} ${Jemalloc_pic_STATIC_LIBRARIES} 20 | ${RDMA_LIBRARIES} ${PMEM_STATIC_LIBRARIES} ${NDCTL_LIBRARIES} dl) 21 | 22 | add_executable(masterd entry/masterd.cc) 23 | target_link_libraries(masterd cexb_static) 24 | 25 | add_executable(server entry/server.cc) 26 | target_link_libraries(server cexb_static) 27 | 28 | find_package(Protobuf REQUIRED) 29 | find_package(OpenSSL REQUIRED) 30 | 
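# The generated protobuf sources for entry/controller.proto land in the build
# tree (see protobuf_generate_cpp below), so the binary dir goes on the include path.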
include_directories(${CMAKE_CURRENT_BINARY_DIR}) 31 | find_lib(BRPC_STATIC_LIBRARIES STATIC LIBS brpc protobuf) 32 | find_lib(BRPC_DYNAMIC_LIBRARIES SHARED LIBS leveldb) 33 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER entry/controller.proto) 34 | add_executable(controller entry/controller.cc ${PROTO_SRC}) 35 | target_compile_options(controller PRIVATE -Wno-unused-parameter) 36 | target_link_libraries(controller cexb_static ${BRPC_STATIC_LIBRARIES} ${BRPC_DYNAMIC_LIBRARIES} ${OPENSSL_CRYPTO_LIBRARY} ${OPENSSL_SSL_LIBRARY}) 37 | 38 | option(SKIP_CHECK_WHEEL_SETUP "try build tensorflow operator" OFF) 39 | if (NOT SKIP_CHECK_WHEEL_SETUP) 40 | # py_api should be compiled during pip install, here is just for simple verification. 41 | execute_process(COMMAND ${PYTHON} -c "import pybind11; print(pybind11.get_include(), end=\"\")" 42 | OUTPUT_VARIABLE PYBIND11_INCLUDE) 43 | add_library(exb SHARED entry/py_api.cc) 44 | target_include_directories(exb PRIVATE ${PYBIND11_INCLUDE}) 45 | target_link_libraries(exb PRIVATE cexb_pack) 46 | 47 | add_subdirectory(tensorflow) 48 | endif() 49 | 50 | # tests 51 | find_package(PicoTestDep) 52 | link_libraries(cexb_static ${PicoTestDep_STATIC_LIBRARIES}) 53 | add_executable(c_api_test entry/c_api_test.cpp) 54 | add_executable(c_api_ha_test entry/c_api_ha_test.cpp) 55 | if (USE_DCPMM) 56 | add_executable(pmem_c_api_test entry/pmem_c_api_test.cpp) 57 | add_executable(pmem_embedding_table_test variable/pmem_embedding_table_test.cpp) 58 | endif() 59 | 60 | include(GoogleTest) 61 | gtest_discover_tests(c_api_test) 62 | # At present, ha_test has a probability of failing, 63 | # because the current ps restore dead node has a small probability of failing. 64 | # This situation is currently considered by unittest to be caused by an abnormal restore crash. 65 | # Actually, you only need to restart again at this time. 66 | # When restarting the PS, the startup failure should be considered. 67 | # gtest_discover_tests(c_api_ha_test) -------------------------------------------------------------------------------- /openembedding/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ctypes 3 | libcexb_pack = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/libcexb_pack.so') 4 | import openembedding.libexb as libexb 5 | 6 | __version__ = libexb.version() 7 | 8 | 9 | 10 | ''' 11 | Master 12 | 13 | config: Configure in yaml format 14 | 15 | master_endpoint: Required when the Server or worker is initialized 16 | '': Start a Master in this process 17 | 18 | '{ip}:{port}': The endpoint of master 19 | 20 | bind_ip: Used by worker, Server and Master 21 | '': automatically bind to the ip address of a network card 22 | 23 | '{ip}': specify the ip address, bind on random port 24 | 25 | '{ip}:{port}': bind on the specified ip and port, only supported by Master 26 | 27 | num_workers: should be consistent in different workers 28 | 29 | wait_num_servers: should be consistent in different workers 30 | -1: start a Server in each worker process. 31 | n: need to wait the number of Servers start. 32 | ''' 33 | class Flags: 34 | def __init__(self, config='', master_endpoint='', bind_ip='', num_workers=1, wait_num_servers=-1): 35 | self.config = config 36 | self.master_endpoint = master_endpoint 37 | self.bind_ip = bind_ip 38 | self.num_workers = num_workers 39 | self.wait_num_servers = wait_num_servers 40 | flags = Flags() 41 | 42 | ''' 43 | Run a master in this process. 
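A minimal single-process sketch (assuming this package is imported as exb;
the variable names are illustrative):

    import openembedding as exb
    exb.flags.bind_ip = ''                       # bind a local network card automatically
    master = exb.Master()                        # starts serving on construction
    exb.flags.master_endpoint = master.endpoint  # '{ip}:{port}' handed to workers
    server = exb.Server()                        # a parameter server registers via the master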
44 | ''' 45 | class Master: 46 | def __init__(self): 47 | self.__master = libexb.Master(flags.bind_ip) 48 | 49 | def __del__(self): 50 | self.__master.finalize() 51 | 52 | @property 53 | def endpoint(self): 54 | ''' 55 | The format is '{ip}:{port}'. 56 | ''' 57 | return self.__master.endpoint 58 | 59 | ''' 60 | Run a parameter server in this process. 61 | ''' 62 | class Server: 63 | def __init__(self): 64 | self.__server = libexb.Server(flags.config, flags.master_endpoint, flags.bind_ip) 65 | 66 | def exit(self): 67 | ''' 68 | Send exit request to this server. 69 | ''' 70 | return self.__server.exit() 71 | 72 | def join(self): 73 | ''' 74 | Waiting for the server to exit. 75 | ''' 76 | return self.__server.join() -------------------------------------------------------------------------------- /openembedding/client/Communication.cpp: -------------------------------------------------------------------------------- 1 | #include "Communication.h" 2 | 3 | namespace paradigm4 { 4 | namespace pico { 5 | namespace embedding { 6 | 7 | 8 | Communication::Communication(): _comm_size(1) {} // for native connection 9 | 10 | Communication::Communication(core::RpcService* rpc, int32_t comm_size, std::string rpc_name) { 11 | _rpc = rpc; 12 | _comm_size = comm_size; 13 | _rpc_name = rpc_name; 14 | _rpc_server = _rpc->create_server(_rpc_name); 15 | _serving_th = std::thread(&Communication::serving, this); 16 | _rpc_client = _rpc->create_client(_rpc_name, comm_size); 17 | core::RpcServiceInfo info; 18 | _rpc_client->get_rpc_service_info(info); 19 | SCHECK(info.servers.size() == static_cast(comm_size)) << "error sync num"; 20 | for (core::ServerInfo server: info.servers) { 21 | SCHECK(server.server_id < comm_size) << "error server id"; 22 | } 23 | _comm_rank = _rpc_server->id(); 24 | SCHECK(_comm_rank < comm_size) << "error comm rank"; 25 | _dealer = [this]() { return _rpc_client->create_dealer(); }; 26 | } 27 | 28 | Communication::~Communication() { 29 | _dealer.clear(); 30 | _rpc_server->terminate(); 31 | _serving_th.join(); 32 | _rpc_client.reset(); 33 | _rpc_server.reset(); 34 | _rpc->deregister_rpc_service(_rpc_name); 35 | } 36 | 37 | 38 | comm_rank_t Communication::barrier(std::string name) { 39 | int32_t num = _comm_size; 40 | if (num == 1) { 41 | return _comm_rank; 42 | } 43 | 44 | core::RpcRequest req; 45 | req.head().sid = std::hash()(name) % _comm_size; 46 | req << BARRIER << name << num << _comm_rank; 47 | 48 | std::shared_ptr dealer = _dealer.acquire(); 49 | core::RpcResponse resp = dealer->sync_rpc_call(std::move(req)); 50 | _dealer.release(std::move(dealer)); 51 | 52 | comm_rank_t selected; 53 | resp >> selected; 54 | return selected; 55 | } 56 | 57 | bool Communication::load_model_sign(const std::string& model_sign) { 58 | core::RpcRequest req; 59 | req.head().sid = 0; 60 | req << LOAD_MODEL_SIGN << model_sign; 61 | 62 | std::shared_ptr dealer = _dealer.acquire(); 63 | core::RpcResponse resp = dealer->sync_rpc_call(std::move(req)); 64 | _dealer.release(std::move(dealer)); 65 | 66 | bool result; 67 | resp >> result; 68 | return result; 69 | } 70 | 71 | void Communication::inner_boardcast(std::string name, core::BinaryArchive& ar, comm_rank_t from) { 72 | int32_t num = _comm_size; 73 | if (num == 1) { 74 | return; 75 | } 76 | 77 | core::RpcRequest req; 78 | req.head().sid = from; 79 | bool is_main = from == _comm_rank; 80 | req << BOARD_CAST << name << num << is_main; 81 | if (is_main) { 82 | req << ar; 83 | } 84 | 85 | std::shared_ptr dealer = _dealer.acquire(); 86 | core::RpcResponse resp = 
dealer->sync_rpc_call(std::move(req)); 87 | _dealer.release(std::move(dealer)); 88 | if (!is_main) { 89 | resp >> ar; 90 | } 91 | } 92 | 93 | void Communication::serving() { 94 | core::RpcRequest req; 95 | std::shared_ptr dealer = _rpc_server->create_dealer(); 96 | while (dealer->recv_request(req)) { 97 | uint32_t req_type; 98 | req >> req_type; 99 | if (req_type == BOARD_CAST) { 100 | std::string name; 101 | req >> name; 102 | uint32_t num; 103 | req >> num; 104 | auto& reqs = _reqs[name]; 105 | reqs.push_back(std::move(req)); 106 | if (reqs.size() >= num) { 107 | SCHECK(reqs.size() == num) << "error barrier node num!"; 108 | core::BinaryArchive ar; 109 | for (core::RpcRequest& req1: reqs) { 110 | bool is_main; 111 | req1 >> is_main; 112 | if (is_main) { 113 | req1 >> ar; 114 | } 115 | } 116 | for (core::RpcRequest& req1: reqs) { 117 | core::RpcResponse resp(req1); 118 | resp << ar; 119 | dealer->send_response(std::move(resp)); 120 | } 121 | _reqs.erase(name); 122 | } 123 | } else if (req_type == LOAD_MODEL_SIGN) { 124 | std::string model_sign; 125 | req >> model_sign; 126 | core::RpcResponse resp(req); 127 | resp << (_model_sign != model_sign); 128 | _model_sign = model_sign; 129 | dealer->send_response(std::move(resp)); 130 | } else if (req_type == BARRIER) { 131 | std::string name; 132 | req >> name; 133 | uint32_t num; 134 | req >> num; 135 | auto& reqs = _barriers[name]; 136 | reqs.push_back(std::move(req)); 137 | if (reqs.size() >= num) { 138 | SCHECK(reqs.size() == num) << "error barrier node num: " << reqs.size() << ' ' << num; 139 | int32_t fast_comm_rank = -1; 140 | for (core::RpcRequest& req1: reqs) { 141 | if (fast_comm_rank == -1) { 142 | req1 >> fast_comm_rank; 143 | } 144 | core::RpcResponse resp(req1); 145 | resp << fast_comm_rank; 146 | dealer->send_response(std::move(resp)); 147 | } 148 | _barriers.erase(name); 149 | } 150 | } 151 | } 152 | } 153 | 154 | 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /openembedding/client/Communication.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_COMMUNICATION_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_COMMUNICATION_H 3 | 4 | #include 5 | #include "ObjectPool.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | // Here comm_rank is in [0, comm_size], and the corresponding rpc global_rank can be any value. 
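// Communication (below) implements three small collectives on top of the pico
// RPC layer. barrier(name) blocks until all comm_size ranks arrive and returns
// the rank that reached the barrier first; boardcast(name, value, from) ships a
// BinaryArchive from rank `from` to every other rank; load_model_sign() returns
// true only for the first rank to register a new model sign, so exactly one
// worker ends up loading each model version. Every rank runs a serving() thread,
// and barrier requests for a given name all rendezvous at rank hash(name) % comm_size.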
12 | class Communication { 13 | enum reqs { 14 | BOARD_CAST = 0, 15 | LOAD_MODEL_SIGN = 1, 16 | BARRIER = 2, 17 | }; 18 | public: 19 | Communication(); 20 | Communication(core::RpcService* rpc, int32_t comm_size, std::string rpc_name = "sync_runner_rpc_api"); 21 | 22 | ~Communication(); 23 | 24 | int32_t comm_rank() { 25 | return _comm_rank; 26 | } 27 | 28 | int32_t comm_size() { 29 | return _comm_size; 30 | } 31 | 32 | comm_rank_t barrier(std::string name); 33 | 34 | template 35 | auto sync_bcast(const std::string& name, Fn fn) { 36 | comm_rank_t from = barrier(name); 37 | decltype(fn()) result; 38 | if (_comm_rank == from) { 39 | result = fn(); 40 | } 41 | boardcast(name, result, from); 42 | return result; 43 | } 44 | 45 | template 46 | void boardcast(std::string name, T& value, comm_rank_t from) { 47 | core::BinaryArchive ar; 48 | ar << value; 49 | inner_boardcast(name, ar, from); 50 | ar >> value; 51 | } 52 | 53 | bool load_model_sign(const std::string& model_sign); 54 | 55 | private: 56 | void serving(); 57 | 58 | void inner_boardcast(std::string name, core::BinaryArchive& ar, comm_rank_t from); 59 | 60 | core::RpcService* _rpc = nullptr; 61 | int32_t _comm_size = 0; 62 | std::string _rpc_name; 63 | 64 | int32_t _comm_rank = 0; 65 | 66 | std::string _model_sign; 67 | std::thread _serving_th; 68 | std::unique_ptr _rpc_server; 69 | std::unique_ptr _rpc_client; 70 | ObjectPool> _dealer; 71 | std::unordered_map> _reqs; 72 | std::unordered_map> _barriers; 73 | }; 74 | 75 | } 76 | } 77 | } 78 | 79 | #endif -------------------------------------------------------------------------------- /openembedding/client/Connection.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_CONNECTION_H 2 | #define PARADIGM4_HYPEREMBEDDING_CONNECTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "Meta.h" 13 | #include "EmbeddingVariableHandle.h" 14 | #include "EnvConfig.h" 15 | 16 | namespace paradigm4 { 17 | namespace pico { 18 | namespace embedding { 19 | 20 | class Connection { 21 | public: 22 | virtual ~Connection() {}; 23 | virtual comm_rank_t global_rank() = 0; 24 | virtual ps::Context* server_context() = 0; 25 | virtual std::vector running_servers() = 0; 26 | virtual ps::Status close_server(int32_t server_id) = 0; 27 | virtual void close_servers() = 0; 28 | virtual ps::Status create_storage(const std::map>& node_shards, int32_t& storage_id) = 0; 29 | virtual ps::Status delete_storage(int32_t storage_id) = 0; 30 | virtual ps::Status set_storage_restore_uri(int32_t storage_id, const core::URIConfig& uri) = 0; 31 | 32 | virtual ps::Status create_storage_handler(int32_t storage_id, std::unique_ptr&) = 0; 33 | virtual uint32_t generate_id(const std::string&) = 0; 34 | virtual ps::Status pull_model_meta(const std::string& model_sign, ModelMeta& model_meta) = 0; 35 | 36 | 37 | virtual const EnvConfig& env_config()const = 0; 38 | 39 | void set_default_hadoop_bin(core::URIConfig& uri); 40 | 41 | protected: 42 | ps::Status create_operator(int32_t storage_id, const std::string& key, 43 | int32_t& handler_id, std::shared_ptr& op); 44 | }; 45 | 46 | 47 | class RpcConnection: public Connection { 48 | public: 49 | RpcConnection(const EnvConfig& env); 50 | 51 | ~RpcConnection() override; 52 | 53 | comm_rank_t global_rank() override { 54 | return _rpc->global_rank(); 55 | } 56 | 57 | std::unique_ptr create_server(); 58 | 59 | std::unique_ptr create_controller(); 60 | 61 | ps::Context* 
server_context() override { 62 | return _client->context().get(); 63 | } 64 | 65 | std::vector running_servers()override; 66 | 67 | ps::Status close_server(int32_t server_id)override; 68 | 69 | void close_servers() override; 70 | 71 | ps::Status create_storage(const std::map>& node_shards, int32_t& storage_id)override; 72 | 73 | ps::Status delete_storage(int32_t storage_id)override; 74 | 75 | ps::Status create_storage_handler(int32_t storage_id, std::unique_ptr& storage)override; 76 | 77 | ps::Status set_storage_restore_uri(int32_t storage_id, const core::URIConfig& uri); 78 | 79 | uint32_t generate_id(const std::string& name); 80 | 81 | ps::Status pull_model_meta(const std::string& model_sign, ModelMeta& model_meta)override; 82 | 83 | ps::Status push_model_meta(const ModelMeta& model_meta); 84 | 85 | ps::Status update_model_meta(const ModelMeta& model_meta); 86 | 87 | ps::Status delete_model_meta(const std::string& model_sign); 88 | 89 | std::vector list_model(); 90 | 91 | bool try_lock_model(const std::string& model_sign); 92 | 93 | void unlock_model(const std::string& model_sign); 94 | 95 | const EnvConfig& env_config()const override { 96 | return _env; 97 | } 98 | 99 | core::RpcService* rpc()const { 100 | return _rpc.get(); 101 | } 102 | 103 | core::MasterClient* master_client()const { 104 | return _master_client.get(); 105 | } 106 | 107 | private: 108 | template 109 | ps::Status create_handler(int32_t storage_id, const std::string& key, std::unique_ptr& handler) { 110 | int32_t handler_id = -1; 111 | std::shared_ptr op; 112 | CHECK_STATUS_RETURN(create_operator(storage_id, key, handler_id, op)); 113 | handler = std::make_unique(storage_id, handler_id, op, _client.get()); 114 | return ps::Status(); 115 | } 116 | 117 | template 118 | void create_handler_pool(int32_t storage_id, const std::string& key, 119 | ObjectPool>& handler_pool) { 120 | handler_pool = [this, storage_id, key]() { 121 | std::unique_ptr handler; 122 | ps::Status status = create_handler(storage_id, key, handler); 123 | if (!status.ok()) { 124 | SLOG(WARNING) << key << " " << status.ToString(); 125 | } 126 | return handler; 127 | }; 128 | } 129 | 130 | std::string _model_path = "_hyper-embedding-model_"; 131 | std::string _model_lock_path = "_hyper-embedding-model-lock_"; 132 | std::unique_ptr _rpc; 133 | std::unique_ptr _master_client; 134 | std::unique_ptr _rpc_client; 135 | std::unique_ptr _client; 136 | EnvConfig _env; 137 | }; 138 | 139 | } 140 | } 141 | } 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /openembedding/client/EmbeddingVariableHandle.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_HANDLE_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_HANDLE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Meta.h" 10 | #include "ObjectPool.h" 11 | 12 | #include "EmbeddingPullOperator.h" 13 | #include "EmbeddingPushOperator.h" 14 | #include "EmbeddingLoadOperator.h" 15 | #include "EmbeddingDumpOperator.h" 16 | #include "EmbeddingStoreOperator.h" 17 | 18 | namespace paradigm4 { 19 | namespace pico { 20 | namespace embedding { 21 | 22 | struct HandlerWaiter { 23 | public: 24 | template 25 | HandlerWaiter(F&& waiter): _waiter(std::forward(waiter)) { 26 | SCHECK(_waiter); 27 | } 28 | 29 | ~HandlerWaiter() { 30 | SCHECK(_wait_called); 31 | } 32 | 33 | HandlerWaiter(const HandlerWaiter&) = delete; 34 | HandlerWaiter& operator=(const 
HandlerWaiter&) = delete; 35 | 36 | HandlerWaiter(HandlerWaiter&& other) { 37 | _wait_called = other._wait_called; 38 | _waiter = std::move(other._waiter); 39 | other._wait_called = true; 40 | } 41 | 42 | ps::Status wait(void* result = nullptr) { 43 | _wait_called = true; 44 | return _waiter(result); 45 | } 46 | 47 | private: 48 | bool _wait_called = false; 49 | std::function<ps::Status(void*)> _waiter; 50 | }; 51 | 52 | // Not a handler itself, just a lightweight handle to a storage handler. 53 | class EmbeddingVariableHandle { 54 | public: 55 | // weights are laid out as n * embedding_dim 56 | const EmbeddingVariableMeta& meta()const { 57 | return _meta; 58 | } 59 | 60 | uint32_t variable_id()const { 61 | return _variable_id; 62 | } 63 | 64 | HandlerWaiter init_config(const core::Configure& config)const; 65 | 66 | // predictor controller 67 | HandlerWaiter clear_weights(); 68 | 69 | // predictor controller 70 | HandlerWaiter pull_weights(const uint64_t* indices, size_t n, int64_t batch_id)const; 71 | 72 | HandlerWaiter push_gradients(const uint64_t* indices, size_t n, const char* gradients)const; 73 | 74 | int _timeout = -1; 75 | bool _read_only = false; 76 | uint32_t _variable_id = 0; 77 | EmbeddingVariableMeta _meta; 78 | 79 | ObjectPool>* _read_only_pull_handler = nullptr; 80 | ObjectPool>* _pull_handler = nullptr; 81 | ObjectPool>* _push_handler = nullptr; 82 | ObjectPool>* _init_handler = nullptr; 83 | 84 | std::atomic<bool>* _should_persist = nullptr; 85 | }; 86 | 87 | class EmbeddingStorageHandler { 88 | public: 89 | EmbeddingStorageHandler() {} 90 | EmbeddingStorageHandler(const EmbeddingStorageHandler&) = delete; 91 | EmbeddingStorageHandler& operator=(const EmbeddingStorageHandler&) = delete; 92 | 93 | EmbeddingStorageHandler(EmbeddingStorageHandler&&) = default; 94 | EmbeddingStorageHandler& operator=(EmbeddingStorageHandler&&) = default; 95 | 96 | EmbeddingVariableHandle variable(uint32_t variable_id, EmbeddingVariableMeta meta); 97 | 98 | HandlerWaiter update_weights(); 99 | 100 | // predictor controller 101 | HandlerWaiter load_storage(const URIConfig& uri, size_t server_concurrency = 4); 102 | 103 | // predictor controller 104 | HandlerWaiter dump_storage(const URIConfig& uri, size_t file_number); 105 | 106 | int _timeout = -1; 107 | ObjectPool> _read_only_pull_handler; 108 | ObjectPool> _pull_handler; 109 | ObjectPool> _push_handler; 110 | ObjectPool> _store_handler; 111 | ObjectPool> _init_handler; 112 | 113 | ObjectPool> _load_handler; 114 | ObjectPool> _dump_handler; 115 | }; 116 | 117 | 118 | } 119 | } 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /openembedding/client/EnvConfig.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EXB_ENV_CONFIG_H 2 | #define PARADIGM4_HYPEREMBEDDING_EXB_ENV_CONFIG_H 3 | 4 | #include 5 | #include 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | using core::ConfigNode; 12 | using core::ConfigUnit; 13 | 14 | #ifdef USE_RDMA 15 | DECLARE_CONFIG(RdmaConfig, ConfigNode) { 16 | PICO_CONFIGURE_DECLARE(std::string, ib_devname); 17 | PICO_CONFIGURE_DECLARE(int, gid_index); 18 | PICO_CONFIGURE_DECLARE(int, ib_port); 19 | PICO_CONFIGURE_DECLARE(int, traffic_class); 20 | PICO_CONFIGURE_DECLARE(int, sl); 21 | PICO_CONFIGURE_DECLARE(int, mtu); 22 | PICO_CONFIGURE_DECLARE(int, pkey_index); 23 | PICO_CONFIGURE_DECLARE(int, min_rnr_timer); 24 | PICO_CONFIGURE_DECLARE(int, retry_cnt); 25 | PICO_CONFIGURE_DECLARE(int, timeout); 26 | }; 27 | #endif 28 | 29 |
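// A hedged sketch of the YAML shape implied by the config nodes declared in this
// file (the key names are the PICO_CONFIGURE_DECLARE fields declared below; the
// values shown are illustrative assumptions, not documented defaults):
//
//   rpc:
//     bind_ip: "0.0.0.0"
//     io_thread_num: 4
//     protocol: "tcp"
//     tcp: { keepalive_time: 60, keepalive_intvl: 10, keepalive_probes: 5, connect_timeout: 10 }
//   master:
//     endpoint: "127.0.0.1:9394"
//     recv_timeout: 30
//   server:
//     server_concurrency: 4
//     message_compress: "snappy"
//     update_early_return: true
//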
DECLARE_CONFIG(TcpConfig, ConfigNode) { 30 | PICO_CONFIGURE_DECLARE(int, keepalive_time); 31 | PICO_CONFIGURE_DECLARE(int, keepalive_intvl); 32 | PICO_CONFIGURE_DECLARE(int, keepalive_probes); 33 | PICO_CONFIGURE_DECLARE(int, connect_timeout); 34 | }; 35 | 36 | DECLARE_CONFIG(RpcConfig, ConfigNode) { 37 | PICO_CONFIGURE_DECLARE(std::string, bind_ip); 38 | PICO_CONFIGURE_DECLARE(size_t, io_thread_num); 39 | PICO_CONFIGURE_DECLARE(std::string, protocol); 40 | #ifdef USE_RDMA 41 | PICO_CONFIGURE_DECLARE(RdmaConfig, rdma); 42 | #endif 43 | PICO_CONFIGURE_DECLARE(TcpConfig, tcp); 44 | }; 45 | 46 | DECLARE_CONFIG(MasterConfig, ConfigNode) { 47 | PICO_CONFIGURE_DECLARE(std::string, endpoint); 48 | PICO_CONFIGURE_DECLARE(std::string, type); 49 | PICO_CONFIGURE_DECLARE(std::string, root_path); 50 | PICO_CONFIGURE_DECLARE(size_t, recv_timeout); 51 | PICO_CONFIGURE_DECLARE(size_t, cache_timeout); 52 | }; 53 | 54 | DECLARE_CONFIG(ServerConfig, ConfigNode) { 55 | PICO_CONFIGURE_DECLARE(std::string, pmem_pool_root_path); 56 | PICO_CONFIGURE_DECLARE(size_t, cache_size); 57 | PICO_CONFIGURE_DECLARE(std::string, message_compress); 58 | PICO_CONFIGURE_DECLARE(size_t, server_dump_files); 59 | PICO_CONFIGURE_DECLARE(int, server_concurrency); 60 | PICO_CONFIGURE_DECLARE(int, recv_timeout); 61 | PICO_CONFIGURE_DECLARE(int, report_interval); 62 | PICO_CONFIGURE_DECLARE(bool, update_early_return); 63 | }; 64 | 65 | class EnvConfig: public ConfigNode { 66 | // client server shared 67 | // default shard_num = server_concurrency * server_num 68 | // PICO_CONFIGURE_DECLARE(size_t, max_request_merge_num); // pull push 69 | public: 70 | PICO_CONFIGURE_DECLARE(RpcConfig, rpc); 71 | PICO_CONFIGURE_DECLARE(MasterConfig, master); 72 | PICO_CONFIGURE_DECLARE(ServerConfig, server); 73 | public: 74 | 75 | void load_yaml(const core::Configure& configure, const std::string& master_endpoint = "", const std::string& rpc_bind_ip = "") { 76 | SCHECK(load_config(configure)); 77 | if (!master_endpoint.empty()) { 78 | master.endpoint = master_endpoint; 79 | } 80 | if (!rpc_bind_ip.empty()) { 81 | rpc.bind_ip = rpc_bind_ip; 82 | } 83 | } 84 | }; 85 | 86 | 87 | } 88 | } 89 | } 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /openembedding/client/Model.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_H 3 | 4 | #include "Meta.h" 5 | #include "Connection.h" 6 | #include "EmbeddingVariableHandle.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class Model { 13 | public: 14 | Model(Connection* connection): _conn(connection) {} 15 | 16 | const ModelMeta& model_meta() { 17 | return _model_meta; 18 | } 19 | 20 | void set_model_status(ps::ModelStatus model_status); 21 | 22 | ps::Status test_status(const ps::Status& status); 23 | 24 | ps::Status update_model_meta(const ModelMeta& model_meta); 25 | 26 | ps::Status add_storage(int32_t storage_id, std::string storage_name); 27 | 28 | ps::Status add_variable(const ModelVariableMeta& variable); 29 | 30 | ps::Status access_storage(int32_t storage_id, EmbeddingStorageHandler*& storage)const; 31 | 32 | ps::Status access_variable(uint32_t variable_id, EmbeddingVariableHandle& handle)const; 33 | 34 | ps::Status dump_model(core::URIConfig uri, std::string model_sign, size_t num_files)const; 35 | 36 | ps::Status load_model(core::URIConfig uri); 37 | 38 | ps::Status 
load_model(); 39 | 40 | ps::Status create_model(core::URIConfig uri); 41 | 42 | ps::Status create_model_storages(int32_t replica_num, int32_t shard_num = -1); 43 | 44 | void delete_model_storages(); 45 | 46 | static ps::Status read_meta_file(const core::URIConfig& uri, ModelOfflineMeta& model_meta); 47 | 48 | private: 49 | Connection* _conn = nullptr; 50 | ModelMeta _model_meta; 51 | // The storage file name is the rank of its storage_id among all storages of this model, in sorted order. 52 | std::unordered_map<int32_t, std::unique_ptr<EmbeddingStorageHandler>> _storages; 53 | }; 54 | 55 | } 56 | } 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /openembedding/client/ModelController.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_CONTROLLER_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_CONTROLLER_H 3 | 4 | #include "Model.h" 5 | 6 | namespace paradigm4 { 7 | namespace pico { 8 | namespace embedding { 9 | 10 | // for predictor 11 | class ModelManager { 12 | public: 13 | ModelManager(Connection* connection): _conn(connection) {} 14 | 15 | /// TODO: Cache pull_model_meta with timeout. 16 | // Predictor pull handlers may have different timeout requirements. 17 | ps::Status find_model_variable(const std::string& model_sign, uint32_t variable_id, 18 | std::shared_ptr<Model>& out, EmbeddingVariableHandle& handle, int timeout = -1); 19 | 20 | private: 21 | core::RWSpinLock _lock; 22 | Connection* _conn = nullptr; 23 | std::unordered_map<std::string, std::shared_ptr<Model>> _models; 24 | }; 25 | 26 | 27 | // for controller, all heavy methods are async 28 | class ModelController { 29 | public: 30 | ModelController(RpcConnection* connection): _conn(connection), 31 | _threads(_conn->env_config().server.server_concurrency) {} 32 | 33 | ps::Status create_model(const core::URIConfig& model_uri, 34 | std::string& model_sign, core::PicoJsonNode& result, int32_t replica_num, int32_t shard_num); 35 | 36 | ps::Status delete_model(const std::string& model_sign); 37 | 38 | ps::Status show_model(const std::string& model_sign, core::PicoJsonNode& result); 39 | 40 | ps::Status show_models(core::PicoJsonNode& result); 41 | 42 | ps::Status show_node(int32_t node_id, core::PicoJsonNode& result); 43 | 44 | ps::Status show_nodes(core::PicoJsonNode& result); 45 | 46 | ps::Status shutdown_node(int32_t node_id); 47 | private: 48 | RpcConnection* _conn = nullptr; 49 | ThreadGroup _threads; 50 | }; 51 | 52 | } 53 | } 54 | } 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /openembedding/client/ObjectPool.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_OBJECT_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_OBJECT_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "EmbeddingVariable.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | 15 | template <class T> 16 | class ObjectPool { 17 | public: 18 | ObjectPool() {} 19 | ObjectPool(ObjectPool&&) = default; 20 | ObjectPool& operator=(ObjectPool&&) = default; 21 | ObjectPool& operator=(std::function<T()> initializer) { 22 | SCHECK(_initializer == nullptr); 23 | _initializer = initializer; 24 | return *this; 25 | } 26 | 27 | T acquire() { 28 | core::lock_guard lk(*_lock); 29 | if (_pool.empty()) { 30 | if (_initializer) { 31 | return _initializer(); 32 | } else { 33 | return nullptr; 34 | } 35 | } else { 36 | T p = std::move(_pool.back()); 37 | _pool.pop_back(); 38 |
return p; 39 | } 40 | } 41 | 42 | void release(T&& p) { 43 | core::lock_guard lk(*_lock); 44 | _pool.push_back(std::move(p)); 45 | } 46 | 47 | void clear() { 48 | core::lock_guard lk(*_lock); 49 | _pool.clear(); 50 | } 51 | 52 | std::unique_ptr _lock = std::make_unique(); 53 | std::function<T()> _initializer; 54 | std::deque<T> _pool; 55 | }; 56 | 57 | 58 | 59 | } 60 | } 61 | } 62 | 63 | 64 | #endif -------------------------------------------------------------------------------- /openembedding/client/WorkerContext.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_WORKER_CONTEXT_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_WORKER_CONTEXT_H 3 | 4 | #include 5 | #include "Connection.h" 6 | #include "Communication.h" 7 | #include "EmbeddingVariableHandle.h" 8 | #include "Model.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | class WorkerContext { 15 | public: 16 | WorkerContext(RpcConnection* connection, 17 | int32_t worker_num, int32_t wait_server_num = -1); 18 | 19 | ~WorkerContext(); 20 | 21 | int32_t create_storage(int32_t shard_num = -1); 22 | 23 | void delete_storage(int32_t storage_id); 24 | 25 | EmbeddingVariableHandle create_variable(int32_t storage_id, const EmbeddingVariableMeta& meta); 26 | 27 | HandlerWaiter update_weights(int32_t storage_id); 28 | 29 | int32_t worker_rank()const { 30 | return _comm->comm_rank(); 31 | } 32 | 33 | int32_t worker_num()const { 34 | return _comm->comm_size(); 35 | } 36 | 37 | Connection* connection()const { 38 | return _conn; 39 | } 40 | 41 | void load_model(const core::URIConfig& uri)const; 42 | 43 | void dump_model(const core::URIConfig& uri, const std::string& model_sign); 44 | 45 | void barrier(const std::string& key) { 46 | _comm->barrier(key); 47 | } 48 | 49 | template <class T> 50 | void boardcast(const std::string& key, T& value) { 51 | _comm->boardcast(key, value, 0); 52 | } 53 | 54 | void report_accumulator(); 55 | 56 | std::atomic<bool> should_persist = {false}; 57 | 58 | private: 59 | core::RWSpinLock _lock; 60 | Connection* _conn; 61 | std::unique_ptr<Communication> _comm; 62 | std::unique_ptr _server; 63 | 64 | std::unique_ptr<Model> _model; 65 | 66 | ServerConfig _server_config; 67 | 68 | bool _reporter = false; 69 | size_t _report_monitor = 0; 70 | }; 71 | 72 | } 73 | } 74 | } 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /openembedding/entry/c_api.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_EXB_CAPI_H 3 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_EXB_CAPI_H 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #include 10 | #include 11 | 12 | struct exb_connection; 13 | 14 | struct exb_master; 15 | struct exb_server; 16 | struct exb_configure; 17 | struct exb_context; 18 | struct exb_storage; 19 | struct exb_variable; 20 | struct exb_optimizer; 21 | struct exb_initializer; 22 | struct exb_pull_waiter; 23 | struct exb_waiter; 24 | struct exb_channel; 25 | struct exb_mutex { 26 | int64_t data[16]; 27 | }; 28 | struct exb_string { 29 | char data[128]; 30 | }; 31 | 32 | struct exb_connection* exb_serving(); 33 | // TCP configuration should be consistent across all connections. 34 | // Opening multiple connections at the same time may cause unknown problems. 35 | // wait_server_num = -1 means each worker process starts its own parameter server.
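//
// Hedged usage sketch of a minimal training flow (every function named below is
// declared in this header; the call order, and placeholder variables such as
// master_endpoint, indices, n, batch_id, weights and gradients, are our
// assumptions, not a documented recipe):
//
//   struct exb_connection* conn = exb_connect("", master_endpoint, "");
//   struct exb_context* ctx = exb_context_initialize(conn, /*worker_num=*/1);
//   struct exb_storage* storage = exb_create_storage(ctx);
//   struct exb_variable* var = exb_create_variable(storage, /*vocabulary_size=*/1000000, /*embedding_dim=*/64);
//   struct exb_pull_waiter* pull = exb_pull_weights(var, indices, n, batch_id);
//   exb_pull_wait(pull, indices, n, weights);
//   /* ... forward/backward pass computes gradients ... */
//   exb_wait(exb_push_gradients(var, indices, n, gradients));
//   exb_wait(exb_update_weights(storage));
//   exb_context_finalize(ctx);
//   exb_disconnect(conn);
//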
36 | struct exb_connection* exb_connect(const char* yaml_config, 37 | const char* master_endpoint, const char* rpc_bind_ip = ""); 38 | 39 | // thread local 40 | const char* exb_last_error(); 41 | 42 | int exb_last_wait_time_ms(); 43 | 44 | int exb_running_server_count(struct exb_connection*); 45 | 46 | void exb_disconnect(struct exb_connection*); 47 | 48 | struct exb_master* exb_master_start(const char* bind_ip = ""); 49 | 50 | void exb_master_endpoint(struct exb_master*, exb_string* value); 51 | 52 | void exb_master_join(struct exb_master*); // destroy 53 | 54 | struct exb_server* exb_server_start(struct exb_connection*); 55 | 56 | void exb_server_exit(struct exb_server*); 57 | 58 | void exb_server_join(struct exb_server*); // destroy 59 | 60 | struct exb_context* exb_context_initialize(struct exb_connection*, 61 | int32_t worker_num, int32_t wait_server_num = -1); 62 | 63 | void exb_context_finalize(struct exb_context*); 64 | 65 | int exb_worker_rank(struct exb_context*); 66 | 67 | struct exb_storage* exb_create_storage(struct exb_context*, int32_t shard_num = -1); 68 | 69 | void exb_delete_storage(struct exb_storage*); 70 | 71 | struct exb_variable* exb_create_variable(struct exb_storage*, 72 | uint64_t vocabulary_size, size_t embedding_dim, const char* dtype = "float32"); 73 | 74 | int32_t exb_storage_id(struct exb_storage*); 75 | 76 | uint32_t exb_variable_id(struct exb_variable*); 77 | 78 | void exb_set_initializer(struct exb_variable*, struct exb_initializer*); 79 | 80 | void exb_set_optimizer(struct exb_variable*, struct exb_optimizer*); 81 | 82 | size_t exb_unique_indices(const uint64_t* indices, size_t n, size_t* unique); 83 | 84 | struct exb_pull_waiter* exb_pull_weights(const struct exb_variable*, 85 | const uint64_t* indices, size_t n, int64_t batch_id); 86 | 87 | struct exb_waiter* exb_push_gradients(struct exb_variable*, 88 | const uint64_t* indices, size_t n, const void* gradients); 89 | 90 | struct exb_waiter* exb_update_weights(struct exb_storage*); 91 | 92 | bool exb_pull_wait(struct exb_pull_waiter*, const uint64_t* indices, size_t n, void* weights); 93 | 94 | bool exb_wait(struct exb_waiter*); 95 | 96 | struct exb_optimizer* exb_create_optimizer(const char* category); 97 | 98 | void exb_set_optimizer_property(struct exb_optimizer*, const char* key, const char* value); 99 | 100 | struct exb_initializer* exb_create_initializer(const char* category); 101 | 102 | void exb_set_initializer_property(struct exb_initializer*, const char* key, const char* value); 103 | 104 | const char* exb_version(); 105 | 106 | void exb_dump_model_include_optimizer(struct exb_context*, const char* path, const char* model_sign); 107 | 108 | void exb_dump_model(struct exb_context*, const char* path, const char* model_sign); 109 | 110 | void exb_load_model(struct exb_context*, const char* path); 111 | 112 | void exb_create_model(struct exb_connection*, const char* path, int32_t replica_num, int32_t shard_num = -1); 113 | 114 | struct exb_variable* exb_get_model_variable(struct exb_connection*, const char* model_sign, int32_t variable_id, int pull_timeout = -1); 115 | 116 | void exb_release_model_variable(struct exb_variable*); 117 | 118 | void exb_barrier(struct exb_context*, const char* name, exb_string* value = NULL); 119 | 120 | void exb_start_monitor(struct exb_context*); 121 | 122 | struct exb_channel* exb_channel_create(); 123 | void exb_channel_delete(struct exb_channel*); 124 | void exb_channel_close(struct exb_channel*); 125 | void exb_channel_write(struct exb_channel*, void*); 126 | 
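// Hedged producer/consumer sketch for the channel API (blocking semantics are
// inferred from its use in openembedding/tensorflow/ThreadPool.h: exb_channel_read
// is assumed to block for the next item and to return false once the channel has
// been closed):
//
//   struct exb_channel* ch = exb_channel_create();
//   exb_channel_write(ch, job);                           /* producer thread */
//   void* p;
//   while (exb_channel_read(ch, &p)) { /* consume p */ }  /* consumer thread */
//   exb_channel_close(ch);                                /* unblocks readers */
//   exb_channel_delete(ch);
//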
bool exb_channel_read(struct exb_channel*, void**); 127 | 128 | void exb_mutex_lock(struct exb_mutex*); 129 | void exb_mutex_unlock(struct exb_mutex*); 130 | void exb_mutex_lock_shared(struct exb_mutex*); 131 | void exb_mutex_unlock_shared(struct exb_mutex*); 132 | void exb_mutex_upgrade(struct exb_mutex*); 133 | void exb_mutex_downgrade(struct exb_mutex*); 134 | 135 | void* exb_malloc(size_t size); 136 | void exb_free(void* p); 137 | 138 | void exb_info(const char* message); 139 | void exb_warning(const char* message); 140 | void exb_fatal(const char* message); 141 | 142 | bool exb_should_persist_model(struct exb_context*); 143 | 144 | void exb_persist_model(struct exb_context*, const char* path, const char* model_sign, size_t persist_pending_window); 145 | void exb_restore_model(struct exb_context*, const char* path); 146 | 147 | #ifdef __cplusplus 148 | } 149 | #endif 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /openembedding/entry/c_api_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "c_api_test.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | TEST(c_api, model_mix) { 12 | c_api_threads(1, 1, 1, 10, true); 13 | c_api_threads(1, 15, 5, 10, true); 14 | c_api_threads(2, 10, 5, 10, true); 15 | c_api_threads(3, 8, 5, 10, true); 16 | c_api_threads(4, 6, 5, 10, true); 17 | 18 | c_api_threads(1, 1, 1, 100, true); 19 | c_api_threads(2, 10, 5, 100, true); 20 | } 21 | 22 | TEST(c_api, model_shard_num) { 23 | c_api_threads(1, 3, 1, 10, true, 1); 24 | c_api_threads(1, 3, 5, 10, true, 3); 25 | c_api_threads(3, 3, 1, 10, true, 7); 26 | c_api_threads(5, 2, 1, 10, true, 111); 27 | c_api_threads(8, 2, 2, 10, true, 256); 28 | } 29 | 30 | TEST(c_api, pull_push) { 31 | for (size_t i = 1; i < 10; ++i) { 32 | c_api_pull_push(i, 100, 128, false); 33 | c_api_pull_push(i, 100000, 1, false); 34 | c_api_pull_push(i, 100000, 8, false); 35 | c_api_pull_push(i, 100000, 1, true); 36 | c_api_pull_push(i, 100000, 16, true); 37 | } 38 | } 39 | 40 | TEST(c_api, one) { 41 | c_api_threads(1, 1, 1, 1000); 42 | c_api_threads(3, 1, 1, 1000); 43 | c_api_threads(5, 1, 1, 1000); 44 | c_api_threads(8, 1, 1, 1000); 45 | } 46 | 47 | TEST(c_api, trd) { 48 | c_api_threads(1, 3, 1, 300); 49 | c_api_threads(2, 3, 1, 300); 50 | c_api_threads(3, 3, 1, 300); 51 | c_api_threads(4, 3, 1, 300); 52 | } 53 | 54 | TEST(c_api, mix) { 55 | for (int node_num = 1; node_num < 9; ++node_num) { 56 | c_api_threads(node_num, 20, 5, 100, false, node_num * node_num); 57 | } 58 | } 59 | 60 | TEST(c_api, rep) { 61 | for (int i = 0; i < 3; ++i) { 62 | c_api_threads(2, 7, 2, 300); 63 | c_api_threads(3, 5, 2, 300); 64 | c_api_threads(4, 3, 3, 300); 65 | } 66 | } 67 | 68 | } 69 | } 70 | } 71 | 72 | 73 | int main(int argc, char* argv[]) { 74 | testing::InitGoogleTest(&argc, argv); 75 | int ret = RUN_ALL_TESTS(); 76 | return ret; 77 | } 78 | -------------------------------------------------------------------------------- /openembedding/entry/controller.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | syntax="proto2"; 19 | package exb; 20 | 21 | option cc_generic_services = true; 22 | 23 | message HttpRequest {}; 24 | message HttpResponse {}; 25 | 26 | service models { 27 | rpc default_method(HttpRequest) returns (HttpResponse); 28 | }; 29 | 30 | service nodes { 31 | rpc default_method(HttpRequest) returns (HttpResponse); 32 | }; 33 | -------------------------------------------------------------------------------- /openembedding/entry/masterd.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | int main(int argc, char* argv[]) { 7 | google::InstallFailureSignalHandler(); 8 | google::InitGoogleLogging(argv[0]); 9 | FLAGS_logtostderr = 1; 10 | google::AllowCommandLineReparsing(); 11 | google::ParseCommandLineFlags(&argc, &argv, false); 12 | 13 | paradigm4::pico::core::LogReporter::set_id("MASTER", 0); 14 | paradigm4::pico::core::Master master(""); 15 | 16 | master.initialize(); 17 | master.finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /openembedding/entry/server.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "Connection.h" 4 | #include "c_api.h" 5 | #include 6 | 7 | DEFINE_bool(enable_metrics, false, "enable/disable metrics"); 8 | DEFINE_string(service_name, "service_name", "service name of this binary"); 9 | DEFINE_string(instance_name, "instance_name", "instance name of this binary"); 10 | DEFINE_string(metrics_ip, "0.0.0.0", "Binding IP of the metrics exposer"); 11 | DEFINE_int32(metrics_port, 8001, "TCP port of the metrics exposer"); 12 | DEFINE_string(metrics_url, "/metrics", "URL of the metrics exposer"); 13 | 14 | DEFINE_bool(restore, true, "is replace one dead node"); // try replace one dead node 15 | 16 | DEFINE_string(config, "", ""); 17 | DEFINE_string(config_file, "", ""); 18 | DEFINE_string(rpc_bind_ip, "", ""); 19 | DEFINE_string(master_endpoint, "", ""); 20 | 21 | 22 | using namespace paradigm4::pico; 23 | using namespace paradigm4::pico::ps; 24 | 25 | int main(int argc, char* argv[]) { 26 | // exb_serving(); // Import registered optimizer. 
27 | google::InstallFailureSignalHandler(); 28 | google::InitGoogleLogging(argv[0]); 29 | FLAGS_logtostderr = 1; 30 | google::AllowCommandLineReparsing(); 31 | google::ParseCommandLineFlags(&argc, &argv, false); 32 | 33 | paradigm4::pico::core::Memory::singleton().initialize(); 34 | 35 | paradigm4::pico::metrics_initialize(FLAGS_metrics_ip, FLAGS_metrics_port, FLAGS_metrics_url, 36 | FLAGS_service_name, FLAGS_instance_name, FLAGS_enable_metrics); 37 | 38 | paradigm4::pico::embedding::EnvConfig env; 39 | paradigm4::pico::core::Configure configure; 40 | if (!FLAGS_config.empty()) { // --config holds inline YAML; otherwise fall back to --config_file 41 | configure.load(FLAGS_config); 42 | } else { 43 | configure.load_file(FLAGS_config_file); 44 | } 45 | env.load_yaml(configure, FLAGS_master_endpoint, FLAGS_rpc_bind_ip); 46 | paradigm4::pico::embedding::RpcConnection conn(env); 47 | paradigm4::pico::core::LogReporter::set_id("SERVER", conn.rpc()->global_rank()); 48 | std::unique_ptr server; 49 | server = conn.create_server(); 50 | server->initialize(); 51 | 52 | if (FLAGS_restore) { 53 | server->restore_storages(false); 54 | } 55 | 56 | server->finalize(); 57 | paradigm4::pico::core::Memory::singleton().finalize(); 58 | paradigm4::pico::metrics_finalize(); 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingDumpOperator.cpp: -------------------------------------------------------------------------------- 1 | #include "EmbeddingDumpOperator.h" 2 | 3 | #include 4 | #include "EmbeddingVariable.h" 5 | #include "EmbeddingShardFile.h" 6 | #include "EmbeddingStorage.h" 7 | #include "Factory.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | void EmbeddingDumpOperator::apply_request(ps::RuntimeInfo& rt, 14 | ps::PSRequest& req, 15 | ps::Storage* storage, 16 | ps::PSResponse& resp_ret) { 17 | ps::DumpArgs dump_args; 18 | req >> dump_args; 19 | int32_t file_id; 20 | req >> file_id; 21 | std::vector<int32_t> shard_ids; 22 | req >> shard_ids; 23 | SCHECK(req.archive().is_exhausted()); 24 | ps::PSResponse resp(req); 25 | //core::FileSystem::mkdir_p(dump_args.uri()); 26 | 27 | core::URIConfig uri(dump_args.uri()); 28 | std::string file = format_string("/model_%d_%d", rt.node_id(), file_id); 29 | FileWriter writer; 30 | if (!writer.open(uri + file)) { 31 | if (uri.storage_type() != core::FileSystemType::HDFS) { 32 | core::FileSystem::mkdir_p(uri); 33 | } 34 | SCHECK(writer.open(uri + file)); 35 | } 36 | bool include_optimizer = true; 37 | uri.config().get_val("include_optimizer", include_optimizer); 38 | 39 | bool persist_model = false; 40 | uri.config().get_val("persist_model", persist_model); 41 | if (persist_model && !include_optimizer) { 42 | SLOG(WARNING) << "persist model is not supported without optimizer."; 43 | include_optimizer = true; 44 | } 45 | size_t persist_pending_window = 2; 46 | uri.config().get_val("persist_pending_window", persist_pending_window); 47 | 48 | auto& st = *(static_cast<EmbeddingStorage*>(storage)); 49 | core::shared_lock_guard l(st); 50 | for (int32_t shard_id: shard_ids) { 51 | SCHECK(rt.local_shards().count(shard_id) != 0) 52 | << "Bad Request: invalid shard_id = " << shard_id; 53 | auto& shard = *(st.get(shard_id)); 54 | // should not lock shared 55 | core::lock_guard sl(shard); 56 | EmbeddingShard& ht = *boost::any_cast<EmbeddingShard>(&shard.data); 57 | for (uint32_t variable_id: ht.variable_ids()) { 58 | EmbeddingVariableBase& variable = ht[variable_id]; 59 | 60 | EmbeddingShardDataMeta shard_meta; 61 | shard_meta.variable_id = variable_id; 62 | 
shard_meta.meta = ht.meta(variable_id); 63 | 64 | core::Configure config; 65 | if (persist_model) { 66 | SCHECK(variable.persist_config(persist_pending_window, config)); 67 | if (!include_optimizer) { 68 | config.node().remove("optimizer"); 69 | } 70 | } else { 71 | variable.dump_config(config); 72 | } 73 | shard_meta.config = config.dump(); 74 | shard_meta.shard_id = shard_id; 75 | shard_meta.shard_num = rt.global_shard_num(); 76 | shard_meta.state_line_size = include_optimizer ? variable.state_line_size() : 0; 77 | shard_meta.num_items = persist_model ? 0 : variable.num_indices(); 78 | writer.write(shard_meta); 79 | 80 | if (shard_meta.num_items) { 81 | int reader_id = variable.create_reader(); 82 | size_t n = 0; 83 | core::vector indices(variable.server_block_num_items()); 84 | while ( (n = variable.read_indices(reader_id, indices.data(), indices.size())) ) { 85 | writer.write(n); 86 | indices.resize(n); 87 | core::vector weights(indices.size() * shard_meta.meta.line_size()); 88 | core::vector states(indices.size() * shard_meta.state_line_size); 89 | variable.get_weights(indices.data(), n, weights.data(), states.data()); 90 | writer.write(indices.data(), indices.size()); 91 | writer.write(weights.data(), weights.size()); 92 | writer.write(states.data(), states.size()); 93 | } 94 | variable.delete_reader(reader_id); 95 | } 96 | } 97 | } 98 | resp << ps::Status(); 99 | resp_ret = std::move(resp); 100 | } 101 | 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingDumpOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_DUMP_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_DUMP_OPERATOR_H 3 | 4 | #include 5 | 6 | namespace paradigm4 { 7 | namespace pico { 8 | namespace embedding { 9 | 10 | class EmbeddingDumpOperator : public ps::ShardStorageDumpOperator { 11 | public: 12 | 13 | EmbeddingDumpOperator(const core::Configure& conf) : ps::ShardStorageDumpOperator(conf) {} 14 | 15 | virtual ~EmbeddingDumpOperator() {} 16 | 17 | EmbeddingDumpOperator(EmbeddingDumpOperator&&) = default; 18 | EmbeddingDumpOperator& operator=(EmbeddingDumpOperator&&) = default; 19 | 20 | void apply_request(ps::RuntimeInfo& rt, 21 | ps::PSRequest& req, 22 | ps::Storage* storage, 23 | ps::PSResponse& resp_ret)override; 24 | 25 | std::unique_ptr init_result_impl() { 26 | return nullptr; 27 | } 28 | 29 | void merge_result_impl(const ps::ForEachResult&, ps::ForEachResult&, 30 | const ps::CarriedItem&)override {} 31 | }; 32 | 33 | } 34 | } 35 | } 36 | 37 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingInitOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_INIT_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_INIT_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "Meta.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class EmbeddingInitItems: public ps::PushItems { 13 | public: 14 | EmbeddingVariableMeta meta; 15 | uint32_t variable_id = -1; 16 | uint64_t n = 0; // indices for push 17 | // vocabulary_size for resize or create 18 | bool clear_weights = false; 19 | const uint64_t* indices = nullptr; // for push 20 | const char* weights = nullptr; 21 | const char* states = nullptr; 22 | uint64_t state_line_size = 0; // != 0 means 
pushing optimizer state 23 | std::string variable_config; // for create 24 | }; 25 | 26 | // for init, load, update context 27 | class EmbeddingInitOperator : public ps::PushOperator { 28 | public: 29 | EmbeddingInitOperator(const Configure& config) : ps::PushOperator(config) { 30 | initialize_compress_info(config, "EmbeddingInitOperator", _compress_info); 31 | } 32 | 33 | ~EmbeddingInitOperator()override {} 34 | 35 | EmbeddingInitOperator(EmbeddingInitOperator&&) = default; 36 | EmbeddingInitOperator& operator=(EmbeddingInitOperator&&) = default; 37 | 38 | void generate_request_data(core::vector>& push_items, 39 | ps::RuntimeInfo& rt, 40 | std::unique_ptr& push_request_data) override; 41 | 42 | void generate_push_request( 43 | std::vector& push_request_data, 44 | ps::RuntimeInfo& rt, 45 | std::vector& reqs) override; 46 | 47 | void generate_store_request(ps::RuntimeInfo& rt, 48 | std::vector& reqs) override; 49 | 50 | void apply_async_push_request(ps::RuntimeInfo& rt, 51 | ps::PSRequest& req, 52 | ps::Storage* storage, 53 | ps::Storage*, 54 | ps::PSResponse& resp) override; 55 | 56 | void apply_sync_push_request(ps::RuntimeInfo&, 57 | ps::PSRequest&, 58 | ps::Storage*, 59 | ps::PSResponse&) override { 60 | return; 61 | } 62 | 63 | void apply_store_request(ps::RuntimeInfo&, 64 | ps::PSRequest&, 65 | ps::Storage*, 66 | ps::Storage*, 67 | ps::Storage*, 68 | std::function) override { 69 | return; 70 | } 71 | 72 | void apply_response(ps::PSResponse& resp) override; 73 | 74 | std::unique_ptr create_delta_storage(ps::RuntimeInfo&) override { 75 | return nullptr; 76 | } 77 | 78 | std::unique_ptr create_incr_storage(ps::RuntimeInfo&) override { 79 | return nullptr; 80 | } 81 | 82 | protected: 83 | ps::CompressInfo _compress_info; 84 | }; 85 | 86 | 87 | } 88 | } 89 | } 90 | 91 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingLoadOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_LOAD_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_LOAD_OPERATOR_H 3 | 4 | #include 5 | #include "EmbeddingInitOperator.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | class EmbeddingLoadOperator: public ps::LoadOperator { 12 | typedef uint64_t key_type; 13 | public: 14 | EmbeddingLoadOperator(const Configure& config): ps::LoadOperator(config), _push_op(config) {} 15 | 16 | virtual ~EmbeddingLoadOperator() {} 17 | 18 | EmbeddingLoadOperator(EmbeddingLoadOperator&&) = default; 19 | EmbeddingLoadOperator& operator=(EmbeddingLoadOperator&&) = default; 20 | 21 | 22 | void apply_load_response(ps::PSResponse& resp) override; 23 | 24 | void restore(const URIConfig&, ps::RuntimeInfo&, ps::Storage*) override; 25 | 26 | void create_stream(const URIConfig& uri, std::shared_ptr& stream) override; 27 | 28 | size_t generate_push_items(std::shared_ptr& stream_in, 29 | core::vector>& push_items) override; 30 | 31 | ps::PushOperator* push_operator() override { 32 | return &_push_op; 33 | } 34 | 35 | 36 | protected: 37 | EmbeddingInitOperator _push_op; 38 | }; 39 | 40 | 41 | } 42 | } 43 | } 44 | 45 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingPullOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_PULL_OPERATOR_H 2 | #define 
PARADIGM4_HYPEREMBEDDING_EMBEDDING_PULL_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "EmbeddingStorage.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | struct EmbeddingPullItems { 14 | uint32_t variable_id = 0; 15 | EmbeddingVariableMeta meta; 16 | 17 | const uint64_t* indices = nullptr; 18 | uint64_t n = 0; 19 | 20 | int64_t batch_id = 0; 21 | 22 | }; 23 | 24 | struct EmbeddingPullResults { 25 | const uint64_t* indices = nullptr; 26 | uint64_t n = 0; 27 | 28 | char* weights = nullptr; 29 | bool should_persist = false; 30 | }; 31 | 32 | struct EmbeddingPullRequestData { 33 | struct ShardData { 34 | size_t cursor = 0; 35 | core::vector num_indices; // prefix count 36 | ps::RpcVector indices; 37 | BinaryArchive weights; 38 | }; 39 | 40 | EmbeddingPullRequestData() {} 41 | 42 | void init(size_t shard_num, size_t block_num); 43 | 44 | size_t waiting_reqs = 0; 45 | core::vector> block_offsets; 46 | core::vector block_items; 47 | std::unordered_map> node_shards; 48 | core::vector shards; 49 | }; 50 | 51 | class EmbeddingPullOperator: public ps::UDFOperator, EmbeddingPullRequestData> { 52 | public: 53 | EmbeddingPullOperator(const Configure& config): 54 | ps::UDFOperator, EmbeddingPullRequestData>(config) { 55 | initialize_compress_info(config, "EmbeddingPullOperator", _compress_info); 56 | _algo = ps::initialize_shard_pick_algo(config); 57 | if (config.has("read_only")) { 58 | _read_only = config["read_only"].as(); 59 | } 60 | } 61 | 62 | ~EmbeddingPullOperator() override {} 63 | 64 | EmbeddingPullOperator(EmbeddingPullOperator&&) = default; 65 | EmbeddingPullOperator& operator=(EmbeddingPullOperator&&) = default; 66 | 67 | bool read_only() override { return _read_only; } 68 | 69 | ps::Status generate_request(core::vector& block_items, 70 | ps::RuntimeInfo& rt, EmbeddingPullRequestData& data, std::vector& reqs)override; 71 | 72 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 73 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 74 | 75 | /// TODO: check context version 76 | void apply_request_pull(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 77 | const ps::TableDescriptor& table, core::Dealer* dealer); 78 | 79 | ps::Status apply_response(ps::PSResponse& resp, EmbeddingPullRequestData& data, void* result) override; 80 | 81 | protected: 82 | bool _read_only = false; 83 | ps::CompressInfo _compress_info; 84 | ps::PickAlgo _algo; 85 | }; 86 | 87 | 88 | } 89 | } 90 | } 91 | 92 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingPushOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_PUSH_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_PUSH_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "EmbeddingStorage.h" 7 | #include "EmbeddingPullOperator.h" 8 | #include "RpcView.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | // key <--> index 15 | // value <--> gradients 16 | class EmbeddingPushItems { 17 | public: 18 | uint32_t variable_id = -1; 19 | EmbeddingVariableMeta meta; 20 | 21 | const uint64_t* indices = nullptr; 22 | uint64_t n = 0; 23 | const char* gradients = nullptr; 24 | }; 25 | 26 | struct EmbeddingPushRequestData { 27 | struct ShardData { 28 | size_t indices_base = 0; 29 | size_t gradients_base = 0; 30 | core::vector num_indices; // prefix count 31 | ps::RpcVector 
indices; 32 | ps::RpcVector gradients; 33 | ps::RpcVector counts; 34 | }; 35 | 36 | EmbeddingPushRequestData(): offsets(-1) {} 37 | 38 | void init(size_t shard_num); 39 | 40 | template 41 | void operator()(TypeCase, EmbeddingPushItems& items); 42 | 43 | EasyHashMap offsets; 44 | core::vector shards; 45 | }; 46 | 47 | 48 | class EmbeddingPushOperator : public ps::UDFOperator, EmbeddingPushRequestData> { 49 | public: 50 | EmbeddingPushOperator(const Configure& config): 51 | ps::UDFOperator, EmbeddingPushRequestData>(config) { 52 | initialize_compress_info(config, "EmbeddingPushOperator", _compress_info); 53 | } 54 | 55 | virtual ~EmbeddingPushOperator() {} 56 | 57 | EmbeddingPushOperator(EmbeddingPushOperator&&) = default; 58 | EmbeddingPushOperator& operator=(EmbeddingPushOperator&&) = default; 59 | 60 | bool read_only() override { return false; } 61 | 62 | ps::Status generate_request(core::vector& block_items, 63 | ps::RuntimeInfo& rt, EmbeddingPushRequestData& data, std::vector& reqs) override; 64 | 65 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 66 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 67 | 68 | 69 | ps::Status apply_response(ps::PSResponse& resp, EmbeddingPushRequestData&, void* result) override; 70 | 71 | protected: 72 | 73 | ps::CompressInfo _compress_info; 74 | }; 75 | 76 | 77 | } 78 | } 79 | } 80 | 81 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingRestoreOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_RESTORE_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_RESTORE_OPERATOR_H 3 | 4 | #include 5 | #include "EmbeddingStorage.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | // need restore initializer for default value 12 | class EmbeddingRestoreOperator: public ps::RestoreOperator { 13 | typedef uint64_t key_type; 14 | public: 15 | EmbeddingRestoreOperator(const core::Configure& config) : ps::RestoreOperator(config) { 16 | initialize_compress_info(config, "EmbeddingRestoreOperator", _compress_info); 17 | } 18 | 19 | ~EmbeddingRestoreOperator() override {} 20 | EmbeddingRestoreOperator(EmbeddingRestoreOperator&&) = default; 21 | EmbeddingRestoreOperator& operator=(EmbeddingRestoreOperator&&) = default; 22 | 23 | void generate_coordinated_restore_request( 24 | ps::CoordinatedRestoreRequestItem* req_item, std::vector& req)override; 25 | 26 | virtual void apply_coordinated_restore_request( 27 | ps::PSRequest& req, ps::Storage* storage, ps::PSResponse& resp)override; 28 | 29 | virtual void apply_coordinated_restore_response(ps::PSResponse& resp, ps::Storage* storage, ps::CoordinatedRestoreResponseItem* resp_item); 30 | 31 | virtual void restore(const core::URIConfig& uri, ps::RuntimeInfo& rt, ps::Storage* storage); 32 | 33 | protected: 34 | ps::CompressInfo _compress_info; 35 | }; 36 | 37 | typedef ps::ShardStorageOperator EmbeddingStorageOperator; 38 | 39 | 40 | } 41 | } 42 | } 43 | 44 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingShardFile.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_SHRAD_FILE_H 3 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_SHRAD_FILE_H 4 | 5 | #include 6 | #include 7 | #include "Meta.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico 
{ 11 | namespace embedding { 12 | 13 | struct EmbeddingShardDataMeta { 14 | uint32_t variable_id = 0; 15 | EmbeddingVariableMeta meta; 16 | std::string config; 17 | int32_t shard_id = 0; 18 | int32_t shard_num = 0; 19 | uint64_t state_line_size = 0; 20 | uint64_t num_items = 0; 21 | PICO_SERIALIZATION(variable_id, meta, config, shard_id, shard_num, state_line_size, num_items); 22 | 23 | uint64_t get_index(uint64_t index)const { 24 | return index * shard_num + shard_id; 25 | } 26 | }; 27 | 28 | class FileReader { 29 | public: 30 | bool open(const core::URIConfig& uri) { 31 | std::string hadoop_bin; 32 | uri.config().get_val(core::URI_HADOOP_BIN, hadoop_bin); 33 | _file = core::ShellUtility::open_read(uri.name(), "", hadoop_bin); 34 | _archive.reset(_file); 35 | return _file; 36 | } 37 | 38 | template 39 | bool read(T& value) { 40 | return core::pico_deserialize(_archive, value); 41 | } 42 | 43 | template 44 | typename std::enable_if::value, bool>::type 45 | read(T* buffer, size_t n) { 46 | return _archive.read_raw_uncheck(buffer, n * sizeof(T)); 47 | } 48 | 49 | private: 50 | core::shared_ptr _file; 51 | core::BinaryFileArchive _archive; 52 | }; 53 | 54 | class FileWriter { 55 | public: 56 | bool open(const core::URIConfig& uri) { 57 | std::string null_uri = "mem://null/"; 58 | if (uri.uri().substr(0, null_uri.size()) == null_uri) { 59 | _null = true; 60 | return true; 61 | } 62 | std::string hadoop_bin; 63 | uri.config().get_val(core::URI_HADOOP_BIN, hadoop_bin); 64 | _file = core::ShellUtility::open_write(uri.name(), "", hadoop_bin); 65 | _archive.reset(_file); 66 | return _file; 67 | } 68 | 69 | template 70 | void write(const T& value) { 71 | if (_null) return; 72 | SCHECK(core::pico_serialize(_archive, value)); 73 | } 74 | 75 | template 76 | typename std::enable_if::value>::type 77 | write(const T* buffer, size_t n) { 78 | if (_null) return; 79 | SCHECK(_archive.write_raw_uncheck(buffer, n * sizeof(T))); 80 | } 81 | 82 | private: 83 | bool _null = false; 84 | core::shared_ptr _file; 85 | core::BinaryFileArchive _archive; 86 | }; 87 | 88 | 89 | } 90 | } 91 | } 92 | 93 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStorage.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_PICO_PS_EMBEDDING_EMBEDDING_STORAGE_H 2 | #define PARADIGM4_PICO_PS_EMBEDDING_EMBEDDING_STORAGE_H 3 | 4 | #include "Meta.h" 5 | #include "EmbeddingVariable.h" 6 | #include 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | /*! 
\brief namespace of parameter server */ 11 | namespace embedding { 12 | 13 | class EmbeddingShard { 14 | public: 15 | bool insert_variable(uint32_t variable_id, 16 | std::unique_ptr variable, 17 | const EmbeddingVariableMeta& meta) { 18 | if (variable_id >= _variables.size()) { 19 | _variables.resize(variable_id + 1); 20 | _metas.resize(variable_id + 1); 21 | } 22 | if (_variables[variable_id]) { 23 | return false; 24 | } 25 | if (variable) { 26 | _metas[variable_id] = meta; 27 | _variables[variable_id] = std::move(variable); 28 | _variable_ids.push_back(variable_id); 29 | return true; 30 | } 31 | return false; 32 | } 33 | 34 | bool contains(uint32_t variable_id)const { 35 | return variable_id < _variables.size() && _variables[variable_id]; 36 | } 37 | 38 | EmbeddingVariableBase& operator[](uint32_t variable_id) { 39 | SCHECK(contains(variable_id)) << variable_id; 40 | return *_variables[variable_id]; 41 | } 42 | 43 | const std::vector& variable_ids()const { 44 | return _variable_ids; 45 | } 46 | 47 | const EmbeddingVariableMeta& meta(uint32_t variable_id) { 48 | SCHECK(contains(variable_id)) << variable_id; 49 | return _metas[variable_id]; 50 | } 51 | 52 | EmbeddingVariableBase& get(uint32_t variable_id, const EmbeddingVariableMeta& meta) { 53 | if (!contains(variable_id)) { 54 | auto pvar = EmbeddingVariableBase::create(meta.datatype, meta.embedding_dim); 55 | SCHECK(insert_variable(variable_id, std::move(pvar), meta)); 56 | } 57 | SCHECK(this->meta(variable_id) == meta) 58 | << this->meta(variable_id).to_json_node().dump() << " " << meta.to_json_node().dump(); 59 | return (*this)[variable_id]; 60 | } 61 | private: 62 | std::vector _variable_ids; 63 | std::vector _metas; 64 | std::vector> _variables; 65 | }; 66 | 67 | struct PendingRequest { 68 | ps::PSMessageMeta psmeta; 69 | ps::PSRequest request; 70 | }; 71 | 72 | class EmbeddingStorage : public ps::ShardStorage { 73 | public: 74 | using ps::ShardStorage::_shards; 75 | typedef uint64_t key_type; 76 | typedef EmbeddingShard shard_type; 77 | EmbeddingStorage(const std::unordered_set& shard_id, const Configure&) { 78 | for (const auto& id : shard_id) { 79 | create_shard(id); 80 | } 81 | } 82 | 83 | void clear() override { 84 | for (auto& shard : _shards) { 85 | shard.second->data = shard_type(); 86 | } 87 | } 88 | 89 | virtual bool create_shard(int32_t shard_id) override { 90 | core::lock_guard lk(this->_mtx); 91 | if (_shards.count(shard_id) != 0) { 92 | return false; 93 | } 94 | _shards.emplace(shard_id, std::make_unique()); 95 | _shards[shard_id]->data = EmbeddingShard(); 96 | _shards_meta.emplace(shard_id, std::make_unique()); 97 | _shards_meta[shard_id]->on_dcpmm = false; 98 | return true; 99 | } 100 | 101 | //no use 102 | virtual size_t shard_size(int32_t) override { 103 | return 0; 104 | } 105 | 106 | //no use 107 | virtual size_t shard_memory_usage(int32_t) override { 108 | return 0; 109 | } 110 | 111 | virtual ps::ShardIterator* get_shard_iterator(int32_t, int32_t) override { 112 | SLOG(FATAL) << "No implementation"; 113 | return nullptr; 114 | } 115 | 116 | core::RWSpinLock& shared_mutex() { 117 | return this->_mtx; 118 | } 119 | 120 | core::RWSpinLock pending_mutex; 121 | int64_t batch_id = 0; 122 | std::atomic async_tasks = {0}; 123 | core::deque> pending; 124 | core::vector holders; 125 | }; 126 | 127 | 128 | } 129 | } 130 | } 131 | 132 | 133 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStoreOperator.cpp: 
-------------------------------------------------------------------------------- 1 | #include "EmbeddingStoreOperator.h" 2 | 3 | #include 4 | #include 5 | #include "EmbeddingStorage.h" 6 | #include "EmbeddingPullOperator.h" 7 | #include "RpcView.h" 8 | #include "PersistManager.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | ps::Status EmbeddingStoreOperator::generate_request(int&, 15 | ps::RuntimeInfo& rt, int&, std::vector<ps::PSRequest>& reqs) { 16 | VTIMER(1, embedding_push, generate_push_request, ms); 17 | for (auto& node: rt.nodes()) { 18 | reqs.emplace_back(node.first); 19 | } 20 | return ps::Status(); 21 | } 22 | 23 | void EmbeddingStoreOperator::apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 24 | const ps::TableDescriptor& table, core::Dealer* dealer) { 25 | VTIMER(1, embedding_update, apply_request, ms); 26 | ps::PSResponse resp(req); 27 | resp << psmeta; 28 | auto& rt = *table.runtime_info; 29 | auto& st = *(static_cast<EmbeddingStorage*>(table.storage.get())); 30 | core::shared_lock_guard l(st); 31 | VariableAsyncTask::wait(st.async_tasks); 32 | 33 | #ifdef USE_DCPMM 34 | VariableAsyncTaskThreadPool::singleton().initialize_batch_task(); 35 | #endif 36 | 37 | for (int32_t shard_id: rt.local_shards()) { 38 | auto& shard = *(st.get(shard_id)); 39 | shard.lock(); // TODO: use guard 40 | } 41 | 42 | if (_early_return) { 43 | dealer->send_response(std::move(resp.rpc_response())); 44 | } 45 | 46 | for (int32_t shard_id: rt.local_shards()) { 47 | auto& shard = *(st.get(shard_id)); 48 | EmbeddingShard& ht = *boost::any_cast<EmbeddingShard>(&shard.data); 49 | for (uint32_t variable_id: ht.variable_ids()) { 50 | ht[variable_id].update_weights(); 51 | } 52 | shard.unlock(); 53 | } 54 | 55 | if (!_early_return) { 56 | dealer->send_response(std::move(resp.rpc_response())); 57 | } 58 | core::vector<PendingRequest> reqs; 59 | { 60 | core::lock_guard pl(st.pending_mutex); 61 | // Store and push must not run at the same time; otherwise holders.clear() would cause errors. 62 | st.holders.clear(); 63 | 64 | if (!st.pending.empty()) { 65 | reqs = std::move(st.pending.front()); 66 | st.pending.pop_front(); 67 | } 68 | st.batch_id += 1; 69 | } 70 | // Start processing the pull requests of batch_id + 1.
71 | for (PendingRequest& pend: reqs) { 72 | ps::Status status; 73 | if (status.ok()) { 74 | _pull.apply_request_pull(pend.psmeta, pend.request, table, dealer); 75 | } else { 76 | ps::PSResponse resp(pend.request); 77 | resp.rpc_response().set_error_code(RpcErrorCodeType::ELOGICERROR); 78 | resp << status << pend.psmeta; 79 | dealer->send_response(std::move(resp.rpc_response())); 80 | } 81 | } 82 | } 83 | 84 | ps::Status EmbeddingStoreOperator::apply_response(ps::PSResponse& resp, int&, void* result) { 85 | SCHECK(result == nullptr) << "return no result!"; 86 | SCHECK(resp.archive().is_exhausted()); 87 | return ps::Status(); 88 | } 89 | 90 | 91 | 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStoreOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_STORE_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_STORE_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "EmbeddingStorage.h" 7 | #include "EmbeddingPullOperator.h" 8 | #include "RpcView.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | 15 | class EmbeddingStoreOperator : public ps::UDFOperator<int, int> { 16 | public: 17 | EmbeddingStoreOperator(const Configure& config): 18 | ps::UDFOperator<int, int>(config), _pull(config) { 19 | if (config.has("update_early_return")) { 20 | _early_return = config["update_early_return"].as<bool>(); 21 | } 22 | } 23 | 24 | virtual ~EmbeddingStoreOperator() {} 25 | 26 | EmbeddingStoreOperator(EmbeddingStoreOperator&&) = default; 27 | EmbeddingStoreOperator& operator=(EmbeddingStoreOperator&&) = default; 28 | 29 | bool read_only() override { return false; } 30 | 31 | ps::Status generate_request(int&, 32 | ps::RuntimeInfo& rt, int&, std::vector<ps::PSRequest>& reqs) override; 33 | 34 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 35 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 36 | 37 | ps::Status apply_response(ps::PSResponse& resp, int&, void* result) override; 38 | 39 | protected: 40 | EmbeddingPullOperator _pull; 41 | bool _early_return = true; 42 | }; 43 | 44 | 45 | } 46 | } 47 | } 48 | 49 | #endif -------------------------------------------------------------------------------- /openembedding/server/RpcView.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_RPC_VIEW_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_RPC_VIEW_H 3 | 4 | #include 5 | #include 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template <class T> 12 | struct RpcView { 13 | static_assert(std::is_trivially_copyable<T>::value, ""); 14 | 15 | RpcView() {} 16 | 17 | // not owner 18 | RpcView(ps::RpcVector<T>& vector) { 19 | data = vector.data(); 20 | size = vector.size(); 21 | } 22 | 23 | RpcView(RpcView&& other) { 24 | *this = std::move(other); 25 | } 26 | 27 | RpcView& operator=(RpcView&& other) { 28 | data = other.data; 29 | size = other.size; 30 | holder = std::move(other.holder); 31 | other.data = nullptr; 32 | other.size = 0; 33 | return *this; 34 | } 35 | 36 | // for src_rank == dest_rank 37 | // be owner after receive() 38 | void receive() { 39 | if (!holder.deleter.owner) { 40 | holder = data_block_t(size * sizeof(T)); 41 | memcpy(holder.data, data, holder.length); 42 | data = reinterpret_cast<T*>(holder.data); 43 | } 44 | } 45 | 46 | void receive(BinaryArchive&& ar) {
SCHECK(ar.length() % sizeof(T) == 0); 48 | holder = data_block_t(ar.length()); 49 | memcpy(holder.data, ar.buffer(), ar.length()); 50 | data = reinterpret_cast<T*>(holder.data); 51 | size = ar.length() / sizeof(T); 52 | ar = BinaryArchive(); 53 | } 54 | 55 | T* data = nullptr; 56 | size_t size = 0; 57 | data_block_t holder; 58 | }; 59 | 60 | 61 | template <class T> 62 | bool pico_serialize(core::ArchiveWriter&, core::SharedArchiveWriter& sar, RpcView<T>& view) { 63 | sar.put_shared_uncheck(view.data, view.size); 64 | return true; 65 | } 66 | 67 | template <class T> 68 | bool pico_deserialize(core::ArchiveReader&, core::SharedArchiveReader& sar, RpcView<T>& view) { 69 | // be owner after receive() 70 | if (sar.is_exhausted()) { 71 | return false; 72 | } 73 | sar.get_shared_uncheck(view.data, view.size, view.holder); 74 | return true; 75 | } 76 | 77 | 78 | template <class T> 79 | void serialize(core::LazyArchive& lazy, ps::CompressInfo& compress_info, RpcView<T>&& view) { 80 | if (compress_info._enabled) { 81 | BinaryArchive msg_ar, compressed_ar(true); 82 | msg_ar.set_read_buffer(reinterpret_cast<char*>(view.data), view.size * sizeof(T)); 83 | compress_info._compresser.raw_compress(msg_ar, compressed_ar); 84 | lazy << std::move(compressed_ar); 85 | } else { 86 | lazy << std::move(view); 87 | } 88 | view = RpcView<T>(); 89 | } 90 | 91 | template <class T> 92 | void deserialize(core::LazyArchive& lazy, ps::CompressInfo& compress_info, RpcView<T>& view) { 93 | if (compress_info._enabled) { 94 | BinaryArchive msg_ar, compressed_ar; 95 | lazy >> compressed_ar; 96 | compress_info._compresser.raw_uncompress(compressed_ar, msg_ar); 97 | view.receive(std::move(msg_ar)); 98 | } else { 99 | lazy >> view; 100 | view.receive(); 101 | } 102 | } 103 | 104 | 105 | 106 | } 107 | 108 | 109 | } 110 | } 111 | 112 | #endif -------------------------------------------------------------------------------- /openembedding/tensorflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # exb_ops should be compiled at "pip install" time; this is only a simple version for the CMake build. 2 | execute_process(COMMAND ${PYTHON} -c "import tensorflow as tf; print(\" \".join(tf.sysconfig.get_compile_flags()), end=\"\")" 3 | OUTPUT_VARIABLE TF_COMPILE_FLAGS) 4 | execute_process(COMMAND ${PYTHON} -c "import tensorflow as tf; print(\" \".join(tf.sysconfig.get_link_flags()), end=\"\")" 5 | OUTPUT_VARIABLE TF_LINK_FLAGS) 6 | 7 | message(TF_COMPILE_FLAGS: ${TF_COMPILE_FLAGS}) 8 | message(TF_LINK_FLAGS: ${TF_LINK_FLAGS}) 9 | 10 | add_library(exb_ops SHARED exb_ops.cpp) 11 | target_link_libraries(exb_ops cexb_pack) 12 | target_compile_options(exb_ops PRIVATE -Wno-unused-parameter -Wno-unused-but-set-parameter -Wno-ignored-qualifiers) 13 | set_target_properties(exb_ops PROPERTIES 14 | COMPILE_FLAGS ${TF_COMPILE_FLAGS} 15 | LINK_FLAGS ${TF_LINK_FLAGS}) 16 | 17 | -------------------------------------------------------------------------------- /openembedding/tensorflow/Prefetch.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_COMMON_PREFETCH_H 2 | #define PARADIGM4_HYPEREMBEDDING_COMMON_PREFETCH_H 3 | 4 | #include 5 | 6 | #include "ThreadPool.h" 7 | 8 | namespace paradigm4 { 9 | namespace exb { 10 | 11 | class BatchIDTable { 12 | public: 13 | int64_t pull_batch_id(int64_t key) { 14 | exb_lock_guard guard(_mutex); 15 | return _table[key]; 16 | } 17 | 18 | void next_work(int64_t key) { 19 | exb_lock_guard guard(_mutex); 20 | ++_table[key]; 21 | } 22 | 23 | private: 24 | exb_mutex _mutex; 25 | std::unordered_map<int64_t, int64_t>
/openembedding/tensorflow/ThreadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_COMMON_THREAD_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_COMMON_THREAD_POOL_H 3 | 4 | #include <atomic> 5 | #include <functional> 6 | #include <thread> 7 | #include <vector> 8 | #include "../entry/c_api.h" 9 | 10 | namespace paradigm4 { 11 | namespace exb { 12 | 13 | class exb_lock_guard { 14 | public: 15 | exb_lock_guard(exb_mutex& mutex): _mutex(&mutex) { 16 | exb_mutex_lock(_mutex); 17 | } 18 | ~exb_lock_guard() { 19 | exb_mutex_unlock(_mutex); 20 | } 21 | exb_lock_guard(exb_lock_guard&&) = default; 22 | exb_lock_guard& operator=(exb_lock_guard&&) = default; 23 | private: 24 | exb_mutex* _mutex; 25 | }; 26 | 27 | class ThreadPool { 28 | public: 29 | static ThreadPool& singleton() { 30 | static ThreadPool pool; 31 | return pool; 32 | } 33 | 34 | template<class F> 35 | void submit(F job) { 36 | std::function<void()>* p = new std::function<void()>(std::move(job)); 37 | exb_channel_write(_channels[_jid.fetch_add(1, std::memory_order_acq_rel) % _channels.size()], p); 38 | } 39 | 40 | private: 41 | ThreadPool(size_t thread_num = std::thread::hardware_concurrency()): _threads(thread_num), _channels(thread_num) { 42 | for (size_t i = 0; i < _threads.size(); ++i) { 43 | _channels[i] = exb_channel_create(); 44 | _threads[i] = std::thread(&ThreadPool::running, this, i); 45 | } 46 | } 47 | 48 | ~ThreadPool() { 49 | for (size_t i = 0; i < _threads.size(); ++i) { 50 | exb_channel_close(_channels[i]); 51 | _threads[i].join(); 52 | exb_channel_delete(_channels[i]); 53 | } 54 | } 55 | 56 | void running(size_t i) { 57 | void* job; 58 | while (exb_channel_read(_channels[i], &job)) { 59 | std::function<void()>* p = static_cast<std::function<void()>*>(job); 60 | (*p)(); 61 | delete p; 62 | } 63 | } 64 | 65 | std::atomic<size_t> _jid = {0}; 66 | std::vector<std::thread> _threads; 67 | std::vector<exb_channel*> _channels; 68 | }; 69 | 70 | 71 | 72 | } 73 | } 74 | 75 | #endif 76 | --------------------------------------------------------------------------------
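ThreadPool.h avoids a single contended job queue: each worker owns one channel, and `submit` picks a channel round-robin with an atomic counter, so producers only contend on a single fetch_add. A self-contained analog built from standard primitives (the toy mutex-and-condvar Queue below stands in for the repo's `exb_channel_*` C API; it illustrates the dispatch scheme, not the real channel implementation):

```cpp
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class MiniPool {
public:
    explicit MiniPool(size_t n): _queues(n), _threads(n) {
        for (size_t i = 0; i < n; ++i) {
            _threads[i] = std::thread([this, i] { run(i); });
        }
    }
    ~MiniPool() {
        for (auto& q : _queues) q.close();
        for (auto& t : _threads) t.join();
    }
    void submit(std::function<void()> job) {
        // Round-robin dispatch, mirroring _jid.fetch_add(...) % _channels.size().
        _queues[_jid.fetch_add(1, std::memory_order_relaxed) % _queues.size()].push(std::move(job));
    }
private:
    struct Queue {
        std::mutex m;
        std::condition_variable cv;
        std::deque<std::function<void()>> q;
        bool closed = false;
        void push(std::function<void()> f) {
            { std::lock_guard<std::mutex> g(m); q.push_back(std::move(f)); }
            cv.notify_one();
        }
        bool pop(std::function<void()>& f) {
            std::unique_lock<std::mutex> lk(m);
            cv.wait(lk, [&] { return closed || !q.empty(); });
            if (q.empty()) return false;   // closed and drained
            f = std::move(q.front());
            q.pop_front();
            return true;
        }
        void close() {
            { std::lock_guard<std::mutex> g(m); closed = true; }
            cv.notify_all();
        }
    };
    void run(size_t i) {
        std::function<void()> job;
        while (_queues[i].pop(job)) job();   // each worker drains only its own queue
    }
    std::atomic<size_t> _jid{0};
    std::vector<Queue> _queues;
    std::vector<std::thread> _threads;
};

int main() {
    MiniPool pool(4);
    for (int i = 0; i < 8; ++i) {
        pool.submit([i] { std::printf("job %d\n", i); });
    }
}
```

Sharding by channel also gives a cheap affinity property: work keyed to the same slot always lands on the same worker thread, which `VariableAsyncTaskThreadPool` later in this section exploits via `thread_id() % _threads.size()`.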
/openembedding/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from openembedding.tensorflow.exb import * 2 | from openembedding import __version__ -------------------------------------------------------------------------------- /openembedding/variable/DataType.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_DATATYPE_H 2 | #define PARADIGM4_HYPEREMBEDDING_DATATYPE_H 3 | 4 | #include <cstddef> 5 | #include <cstdint> 6 | #include <cstring> 7 | #include <string> 8 | #include <type_traits> 9 | #include <utility> 10 | 11 | namespace paradigm4 { 12 | namespace pico { 13 | namespace embedding { 14 | 15 | typedef float float32_t; 16 | typedef double float64_t; 17 | 18 | template<class T> struct TypeCase {}; 19 | 20 | class DataType { 21 | public: 22 | enum DType { 23 | UNKNOWN = 0x0, 24 | INT8 = 0x1, 25 | INT16 = 0x2, 26 | INT32 = 0x4, 27 | INT64 = 0x8, 28 | 29 | FLOAT32 = 0x104, 30 | FLOAT64 = 0x108, 31 | }; 32 | 33 | explicit DataType(int dtype = FLOAT32): dtype(dtype) {} 34 | 35 | DataType(const std::string& str) { 36 | if (str == "int8") { 37 | dtype = INT8; 38 | } else if (str == "int16") { 39 | dtype = INT16; 40 | } else if (str == "int32") { 41 | dtype = INT32; 42 | } else if (str == "int64") { 43 | dtype = INT64; 44 | } else if (str == "float32") { 45 | dtype = FLOAT32; 46 | } else if (str == "float64") { 47 | dtype = FLOAT64; 48 | } else { 49 | dtype = UNKNOWN; 50 | } 51 | } 52 | 53 | class ToString { 54 | public: 55 | void operator()(TypeCase<int8_t>, std::string& str) { str = "int8"; } 56 | void operator()(TypeCase<int16_t>, std::string& str) { str = "int16"; } 57 | void operator()(TypeCase<int32_t>, std::string& str) { str = "int32"; } 58 | void operator()(TypeCase<int64_t>, std::string& str) { str = "int64"; } 59 | void operator()(TypeCase<float32_t>, std::string& str) { str = "float32"; } 60 | void operator()(TypeCase<float64_t>, std::string& str) { str = "float64"; } 61 | }; 62 | 63 | operator std::string()const { 64 | std::string str = "unknown"; 65 | invoke(ToString(), str); 66 | return str; 67 | } 68 | 69 | std::string to_string()const { 70 | return *this; 71 | } 72 | 73 | template<class Function, class... Params> 74 | void invoke(Function&& f, Params&&... params)const { 75 | switch (dtype) { 76 | case INT8: 77 | std::forward<Function>(f)(TypeCase<int8_t>(), 78 | std::forward<Params>(params)...); 79 | break; 80 | case INT16: 81 | std::forward<Function>(f)(TypeCase<int16_t>(), 82 | std::forward<Params>(params)...); 83 | break; 84 | case INT32: 85 | std::forward<Function>(f)(TypeCase<int32_t>(), 86 | std::forward<Params>(params)...); 87 | break; 88 | case INT64: 89 | std::forward<Function>(f)(TypeCase<int64_t>(), 90 | std::forward<Params>(params)...); 91 | break; 92 | case FLOAT32: 93 | std::forward<Function>(f)(TypeCase<float32_t>(), 94 | std::forward<Params>(params)...); 95 | break; 96 | case FLOAT64: 97 | std::forward<Function>(f)(TypeCase<float64_t>(), 98 | std::forward<Params>(params)...); 99 | break; 100 | case UNKNOWN: 101 | break; 102 | default: 103 | SLOG(FATAL) << "unexpected unknown datatype!"; 104 | } 105 | } 106 | 107 | size_t size()const { 108 | return dtype & 0xFF; 109 | } 110 | 111 | template<class T> 112 | static DataType from() { 113 | return DataType(inner_from(TypeCase<T>())); 114 | } 115 | 116 | friend bool operator==(DataType a, DataType b) { 117 | return a.dtype == b.dtype; 118 | } 119 | 120 | friend bool operator!=(DataType a, DataType b) { 121 | return a.dtype != b.dtype; 122 | } 123 | 124 | static DType inner_from(TypeCase<int8_t>) { return INT8; } 125 | static DType inner_from(TypeCase<int16_t>) { return INT16; } 126 | static DType inner_from(TypeCase<int32_t>) { return INT32; } 127 | static DType inner_from(TypeCase<int64_t>) { return INT64; } 128 | static DType inner_from(TypeCase<float32_t>) { return FLOAT32; } 129 | static DType inner_from(TypeCase<float64_t>) { return FLOAT64; } 130 | 131 | int dtype = FLOAT32; 132 | 133 | PICO_SERIALIZATION(dtype); 134 | }; 135 | 136 | 137 | } 138 | } 139 | } 140 | 141 | #endif --------------------------------------------------------------------------------
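DataType.h implements tag dispatch: `invoke` switches on the runtime `dtype` value once and hands the functor a statically typed `TypeCase<T>` tag, so the rest of the code is written as ordinary overloads on `T`. A trimmed, compilable sketch of the pattern (the two-type enum and the `SizeOf` visitor are illustrative, not from the repo):

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>

template<class T> struct TypeCase {};

// A visitor: one overload per supported tag.
struct SizeOf {
    void operator()(TypeCase<int8_t>, size_t& out) { out = sizeof(int8_t); }
    void operator()(TypeCase<float>, size_t& out)  { out = sizeof(float); }
};

enum DType { INT8, FLOAT32 };

// Map the runtime enum to a compile-time tag exactly once.
template<class Function, class... Params>
void invoke(DType dtype, Function&& f, Params&&... params) {
    switch (dtype) {
    case INT8:
        std::forward<Function>(f)(TypeCase<int8_t>(), std::forward<Params>(params)...);
        break;
    case FLOAT32:
        std::forward<Function>(f)(TypeCase<float>(), std::forward<Params>(params)...);
        break;
    }
}

int main() {
    size_t size = 0;
    invoke(FLOAT32, SizeOf(), size);
    std::printf("float32 size: %zu\n", size);   // prints 4
}
```

Note that the repo's enum additionally packs the element width into its low byte (FLOAT64 = 0x108), which is why `size()` can simply return `dtype & 0xFF`.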
/openembedding/variable/EmbeddingInitializer.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_INITIALIZER_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_INITIALIZER_H 3 | 4 | #include "DataType.h" 5 | #include "Factory.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template<class T> 12 | class EmbeddingInitializer: public Configurable { 13 | public: 14 | using weight_type = T; 15 | virtual std::string category() = 0; 16 | virtual void train_init(T* weights, size_t embedding_dim) = 0; 17 | }; 18 | 19 | template<class T> 20 | class EmbeddingConstantInitializer: public EmbeddingInitializer<T> { 21 | public: 22 | std::string category()override { return "constant"; } 23 | 24 | void train_init(T* weights, size_t embedding_dim) override { 25 | for (size_t i = 0; i < embedding_dim; ++i) { 26 | weights[i] = value; 27 | } 28 | } 29 | 30 | private: 31 | CONFIGURE_PROPERTY(T, value, 0.0); 32 | }; 33 | 34 | 35 | template<class T> 36 | class EmbeddingUniformInitializer: public EmbeddingInitializer<T> { 37 | public: 38 | std::string category()override { return "uniform"; } 39 | 40 | void load_config(const core::Configure& config) override { 41 | EmbeddingInitializer<T>::load_config(config); 42 | device = std::make_unique<std::random_device>(); 43 | engine = std::make_unique<std::default_random_engine>((*device)()); 44 | distribution = std::make_unique<std::uniform_real_distribution<T>>(minval, maxval); 45 | } 46 | 47 | void train_init(T* weights, size_t embedding_dim) override { 48 | for (size_t i = 0; i < embedding_dim; ++i) { 49 | weights[i] = (*distribution)(*engine); 50 | } 51 | } 52 | 53 | private: 54 | CONFIGURE_PROPERTY(T, minval, 0.0); 55 | CONFIGURE_PROPERTY(T, maxval, 1.0); 56 | std::unique_ptr<std::random_device> device; 57 | std::unique_ptr<std::default_random_engine> engine; 58 | std::unique_ptr<std::uniform_real_distribution<T>> distribution; 59 | }; 60 | 61 | template<class T> 62 | class EmbeddingNormalInitializer: public EmbeddingInitializer<T> { 63 | public: 64 | std::string category()override { return "normal"; } 65 | 66 | void load_config(const core::Configure& config) override { 67 | EmbeddingInitializer<T>::load_config(config); 68 | device = std::make_unique<std::random_device>(); 69 | engine = std::make_unique<std::default_random_engine>((*device)()); 70 | distribution = std::make_unique<std::normal_distribution<T>>(mean, stddev); 71 | } 72 | 73 | void train_init(T* weights, size_t embedding_dim) override { 74 | for (size_t i = 0; i < embedding_dim; ++i) { 75 | weights[i] = (*distribution)(*engine); 76 | if (truncated > 0.1) { 77 | while ((weights[i] - mean) / stddev > truncated) { 78 | weights[i] = (*distribution)(*engine); 79 | } 80 | } 81 | } 82 | } 83 | 84 | private: 85 | CONFIGURE_PROPERTY(T, mean, 0.0); 86 | CONFIGURE_PROPERTY(T, stddev, 1.0); 87 | CONFIGURE_PROPERTY(T, truncated, 0.0); 88 | std::unique_ptr<std::random_device> device; 89 | std::unique_ptr<std::default_random_engine> engine; 90 | std::unique_ptr<std::normal_distribution<T>> distribution; 91 | }; 92 | 93 | } 94 | } 95 | } 96 | 97 | #endif 98 | --------------------------------------------------------------------------------
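The normal initializer above re-draws a sample while it lands more than `truncated` standard deviations above the mean (only the upper tail is rejected, as written). A standalone sketch of that rejection loop using `<random>` directly, without the `Configure` plumbing (parameter values here are arbitrary):

```cpp
#include <cstdio>
#include <random>

int main() {
    double mean = 0.0, stddev = 1.0, truncated = 2.0;
    std::random_device device;
    std::default_random_engine engine(device());
    std::normal_distribution<double> distribution(mean, stddev);

    double weights[8];
    for (double& w : weights) {
        w = distribution(engine);
        // Reject and re-draw while the sample exceeds the truncation bound.
        while ((w - mean) / stddev > truncated) {
            w = distribution(engine);
        }
        std::printf("%f\n", w);
    }
}
```

A symmetric truncation, like tf.keras's TruncatedNormal, would test `std::fabs((w - mean) / stddev)` instead of the signed ratio.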
/openembedding/variable/EmbeddingVariable.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_H 3 | 4 | #include <memory> 5 | #include "Meta.h" 6 | #include "VariableAsyncTask.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | struct EmbeddingVariableContext { 13 | int variable_id = 0; 14 | }; 15 | 16 | class EmbeddingVariableBase { 17 | using key_type = uint64_t; 18 | public: 19 | static std::unique_ptr<EmbeddingVariableBase> create(DataType datatype, size_t embedding_dim); 20 | virtual ~EmbeddingVariableBase() {} 21 | virtual void set_variable_context(const EmbeddingVariableContext&) = 0; 22 | virtual void load_config(const core::Configure& config) = 0; 23 | virtual void dump_config(core::Configure& config) = 0; 24 | virtual bool persist_config(size_t persist_pending_window, core::Configure& config) = 0; 25 | virtual bool should_persist() = 0; 26 | virtual void clear_weights() = 0; // clear the initializer and weights; the optimizer is unchanged; slots are reset. 27 | virtual size_t server_block_num_items() = 0; 28 | virtual void get_weights(const key_type* indices, size_t n, 29 | char* weights, char* states = nullptr) = 0; // thread safe 30 | virtual void set_weights(const key_type* indices, size_t n, 31 | const char* weights, const char* states = nullptr) = 0; 32 | 33 | virtual void pull_weights(const key_type* indices, size_t n, 34 | char* weights, VariableAsyncTask& async_task) = 0; // thread safe 35 | virtual void push_gradients(const key_type* indices, size_t n, 36 | const char* gradients, const key_type* counts, VariableAsyncTask& async_task) = 0; // thread safe 37 | virtual void update_weights() = 0; 38 | virtual size_t state_line_size() = 0; 39 | 40 | virtual size_t num_indices() = 0; 41 | virtual int create_reader() = 0; // thread safe 42 | virtual size_t read_indices(int reader_id, key_type* indices, size_t n) = 0; // thread safe for unique reader_id 43 | virtual uint64_t get_reader_cursor(int reader_id) = 0; // thread safe for unique reader_id 44 | virtual void delete_reader(int reader_id) = 0; // thread safe 45 | }; 46 | 47 | } 48 | } 49 | } 50 | 51 | #endif 52 | --------------------------------------------------------------------------------
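The reader trio on `EmbeddingVariableBase` (`create_reader` / `read_indices` / `delete_reader`) is a chunked-cursor scan: each reader id owns an independent cursor, which is why `read_indices` is documented as thread safe only per unique reader id. A toy table showing the same pattern in isolation (names and storage here are illustrative; the real variable streams keys out of its shard storage):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

class ToyTable {
public:
    void insert(uint64_t key) { _keys.push_back(key); }

    int create_reader() {
        _cursors[_next_reader_id] = 0;     // each reader starts at the beginning
        return _next_reader_id++;
    }
    // Copy at most n keys starting at this reader's cursor; advance the cursor.
    size_t read_indices(int reader_id, uint64_t* out, size_t n) {
        size_t& cur = _cursors.at(reader_id);
        size_t count = 0;
        while (count < n && cur < _keys.size()) out[count++] = _keys[cur++];
        return count;                      // 0 signals the scan is finished
    }
    void delete_reader(int reader_id) { _cursors.erase(reader_id); }

private:
    std::vector<uint64_t> _keys;
    std::map<int, size_t> _cursors;        // reader_id -> position
    int _next_reader_id = 0;
};

int main() {
    ToyTable table;
    for (uint64_t k = 0; k < 10; ++k) table.insert(k * k);
    int reader = table.create_reader();
    uint64_t chunk[4];
    size_t n;
    while ((n = table.read_indices(reader, chunk, 4)) > 0) {
        for (size_t i = 0; i < n; ++i) std::printf("%llu ", (unsigned long long)chunk[i]);
        std::printf("\n");
    }
    table.delete_reader(reader);
}
```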
/openembedding/variable/Factory.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_FACTORY_H 2 | #define PARADIGM4_HYPEREMBEDDING_FACTORY_H 3 | 4 | #include <functional> 5 | #include <map> 6 | #include <memory> 7 | #include <vector> 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | template<class T> 14 | void LOAD_CONFIG_load_config(const core::Configure& config, const std::string& key, T& value) { 15 | if (config.has(key)) { 16 | value = config.get(key, value); 17 | } 18 | } 19 | 20 | template<class T> 21 | void SAVE_CONFIG_save_config(core::Configure& config, const std::string& key, const T& value) { 22 | config.node()[key] = value; 23 | } 24 | 25 | 26 | #define LOAD_CONFIG(config, x) do { \ 27 | LOAD_CONFIG_load_config((config), #x, (x)); \ 28 | } while(0) 29 | 30 | #define SAVE_CONFIG(config, x) do { \ 31 | SAVE_CONFIG_save_config((config), #x, (x)); \ 32 | } while(0) 33 | 34 | 35 | class Configurable: core::VirtualObject { 36 | public: 37 | 38 | virtual void dump_config(core::Configure& config)const { 39 | for (auto& dumper: _inner_dumpers) { 40 | dumper(config); 41 | } 42 | } 43 | 44 | virtual void load_config(const core::Configure& config) { 45 | for (auto& loader: _inner_loaders) { 46 | loader(config); 47 | } 48 | core::Configure self; 49 | dump_config(self); 50 | 51 | // bool has_default = false; 52 | // core::Configure defaults; 53 | // for (auto pair: self.node()) { 54 | // std::string key = pair.first.as<std::string>(); 55 | // if (!config.has(key)) { 56 | // has_default = true; 57 | // defaults.node()[key] = self.node()[key]; 58 | // } 59 | // } 60 | // if (has_default) { 61 | // SLOG(INFO) << "using default configure: \n" << defaults.dump(); 62 | // } 63 | 64 | bool has_unknown = false; 65 | core::Configure unknowns; 66 | for (auto pair: config.node()) { 67 | std::string key = pair.first.as<std::string>(); 68 | if (!self.has(key)) { 69 | has_unknown = true; 70 | unknowns.node()[key] = config.node()[key]; 71 | } 72 | } 73 | if (has_unknown) { 74 | SLOG(WARNING) << "unknown configure: \n" << unknowns.dump(); 75 | } 76 | } 77 | 78 | protected: 79 | std::vector<std::function<void(core::Configure&)>> _inner_dumpers; 80 | std::vector<std::function<void(const core::Configure&)>> _inner_loaders; 81 | }; 82 | 83 | template<class T> 84 | struct CONFIGURE_PROPERTY_LOADER { 85 | CONFIGURE_PROPERTY_LOADER(const char* key, T* p): key(key), p(p) {}
86 | void operator()(const core::Configure& config) { 87 | LOAD_CONFIG_load_config(config, key, *p); 88 | } 89 | const char* key; 90 | T* p; 91 | }; 92 | 93 | template<class T> 94 | struct CONFIGURE_PROPERTY_DUMPER { 95 | CONFIGURE_PROPERTY_DUMPER(const char* key, const T* p): key(key), p(p) {} 96 | void operator()(core::Configure& config) { 97 | SAVE_CONFIG_save_config(config, key, *p); 98 | } 99 | const char* key; 100 | const T* p; 101 | }; 102 | 103 | 104 | #define CONFIGURE_PROPERTY(type, name, default_value)\ 105 | public:\ 106 | type name = (default_value);\ 107 | private:\ 108 | bool name##_loader_dummy = (this->_inner_loaders.push_back(\ 109 | CONFIGURE_PROPERTY_LOADER<type>(#name, &this->name)), true);\ 110 | bool name##_dumper_dummy = (this->_inner_dumpers.push_back(\ 111 | CONFIGURE_PROPERTY_DUMPER<type>(#name, &this->name)), true);\ 112 | 113 | 114 | template<class T, class... Args> 115 | class Factory: core::VirtualObject { 116 | public: 117 | typedef std::function<std::unique_ptr<T>(Args...)> creator_type; 118 | virtual ~Factory() {} 119 | 120 | template<class DERIVED> 121 | bool register_creator(const std::string& category) { 122 | return _creators.emplace(category, creator<DERIVED>).second; 123 | } 124 | 125 | std::unique_ptr<T> create(const std::string& category, Args... args)const { 126 | if (_creators.count(category)) { 127 | return _creators.at(category)(args...); 128 | } else { 129 | std::string all_registered; 130 | for (auto& pair: _creators) { 131 | all_registered += pair.first + " "; 132 | } 133 | SLOG(WARNING) << "Cannot find \"" << category 134 | << "\" in the factory of " << core::readable_typename<T>() 135 | << ". Registered: " << all_registered; 136 | return nullptr; 137 | } 138 | } 139 | 140 | static Factory& singleton() { 141 | static Factory factory; 142 | return factory; 143 | } 144 | private: 145 | Factory() = default; 146 | template<class DERIVED> 147 | static std::unique_ptr<T> creator(Args... args) { 148 | return std::make_unique<DERIVED>(std::forward<Args>(args)...); 149 | } 150 | std::map<std::string, creator_type> _creators; 151 | }; 152 | 153 | 154 | } 155 | } 156 | } 157 | 158 | #endif --------------------------------------------------------------------------------
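`Factory` stores type-erased creators keyed by a category string, so concrete initializers and optimizers can register themselves without the factory knowing their types; `CONFIGURE_PROPERTY` then wires each field into the load/dump callback lists at construction time. A compilable miniature of the register/create flow (the `Shape`/`Square` types are illustrative, and the SLOG/VirtualObject machinery is stripped out):

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>

struct Shape {
    virtual ~Shape() {}
    virtual double area() const = 0;
};
struct Square: Shape {
    explicit Square(double s): side(s) {}
    double area() const override { return side * side; }
    double side;
};

template<class T, class... Args>
class MiniFactory {
public:
    using creator_type = std::function<std::unique_ptr<T>(Args...)>;

    // Register a derived type under a category string.
    template<class DERIVED>
    bool register_creator(const std::string& category) {
        return _creators.emplace(category, [](Args... args) {
            return std::unique_ptr<T>(new DERIVED(std::forward<Args>(args)...));
        }).second;
    }

    // Look up the creator; nullptr when the category is unknown.
    std::unique_ptr<T> create(const std::string& category, Args... args) const {
        auto it = _creators.find(category);
        return it == _creators.end() ? nullptr : it->second(args...);
    }

private:
    std::map<std::string, creator_type> _creators;
};

int main() {
    MiniFactory<Shape, double> factory;
    factory.register_creator<Square>("square");
    std::unique_ptr<Shape> s = factory.create("square", 3.0);
    std::printf("area: %f\n", s->area());      // 9.0
}
```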
/openembedding/variable/MpscGradientReducer.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_MPSC_GRADIENT_REDUCER_H 2 | #define PARADIGM4_HYPEREMBEDDING_MPSC_GRADIENT_REDUCER_H 3 | 4 | #include <cstdint> 5 | #include "EmbeddingInitializer.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template<class Key, class T> 12 | class MpscGradientReducer { 13 | public: 14 | using key_type = Key; 15 | struct block_type { 16 | const key_type* keys; 17 | size_t n; 18 | const T* gradients; 19 | const uint64_t* counts; 20 | }; 21 | 22 | MpscGradientReducer(size_t embedding_dim, key_type empty_key) 23 | : _embedding_dim(embedding_dim), _offsets(empty_key) {} 24 | 25 | // thread safe 26 | void push_gradients(block_type block) { 27 | _queue.push(std::move(block)); 28 | } 29 | 30 | block_type reduce_gradients() { 31 | block_type block; 32 | while (_queue.pop(block)) { 33 | const T* grad = block.gradients; 34 | for (size_t i = 0; i < block.n; ++i) { 35 | key_type key = block.keys[i]; 36 | if (_offsets.count(key)) { 37 | size_t offset = _offsets.at(key); 38 | T* sum = _gradients.data() + offset * _embedding_dim; 39 | for (size_t j = 0; j < _embedding_dim; ++j) { 40 | sum[j] += grad[j]; 41 | } 42 | _counts[offset] += block.counts[i]; 43 | } else { 44 | _offsets.force_emplace(key, _offsets.size()); 45 | _keys.push_back(key); 46 | _gradients.insert(_gradients.end(), grad, grad + _embedding_dim); 47 | _counts.push_back(block.counts[i]); 48 | } 49 | grad += _embedding_dim; 50 | } 51 | } 52 | return {_keys.data(), _keys.size(), _gradients.data(), _counts.data()}; 53 | } 54 | 55 | void clear() { 56 | _offsets.clear(); 57 | _keys.clear(); 58 | _gradients.clear(); 59 | _counts.clear(); 60 | } 61 | 62 | private: 63 | size_t _embedding_dim = 0; 64 | core::MpscQueue<block_type> _queue; 65 | EasyHashMap<key_type, size_t> _offsets; 66 | core::vector<key_type> _keys; 67 | core::vector<T> _gradients; 68 | core::vector<uint64_t> _counts; 69 | }; 70 | 71 | } 72 | } 73 | } 74 | 75 | #endif --------------------------------------------------------------------------------
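`MpscGradientReducer` lets many trainer threads enqueue raw gradient blocks while a single consumer drains the queue and folds duplicate keys into one dense row each. A single-threaded, standalone sketch of that reduce step, with `std::unordered_map`/`std::vector` standing in for `EasyHashMap`/`core::vector`:

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
    const size_t dim = 2;
    // Incoming rows; key 7 appears twice and should be summed.
    std::vector<uint64_t> keys = {7, 3, 7};
    std::vector<double> grads = {1, 1,  2, 2,  10, 10};   // 3 rows of dim 2

    std::unordered_map<uint64_t, size_t> offsets;  // key -> dense row index
    std::vector<uint64_t> out_keys;
    std::vector<double> out_grads;

    const double* g = grads.data();
    for (size_t i = 0; i < keys.size(); ++i, g += dim) {
        auto it = offsets.find(keys[i]);
        if (it != offsets.end()) {
            double* sum = out_grads.data() + it->second * dim;
            for (size_t j = 0; j < dim; ++j) sum[j] += g[j];   // accumulate duplicate key
        } else {
            offsets.emplace(keys[i], out_keys.size());
            out_keys.push_back(keys[i]);
            out_grads.insert(out_grads.end(), g, g + dim);     // first occurrence: copy the row
        }
    }
    for (size_t r = 0; r < out_keys.size(); ++r) {
        std::printf("key %llu: [%g, %g]\n", (unsigned long long)out_keys[r],
                out_grads[r * dim], out_grads[r * dim + 1]);
    }
    // prints: key 7: [11, 11] and key 3: [2, 2]
}
```

The result is exactly one dense update per key, which is what the server's optimizer step consumes.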
/openembedding/variable/PersistManager.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_PERSIST_MANAGER_H 2 | #define PARADIGM4_HYPEREMBEDDING_PERSIST_MANAGER_H 3 | 4 | #include <atomic> 5 | #include <string> 6 | #include <unistd.h> 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class PersistManager { 13 | PersistManager() = default; 14 | PersistManager(const PersistManager&) = default; 15 | public: 16 | class CacheManager { 17 | public: 18 | void initialize() { 19 | _cache_size.store(0); 20 | _acquired_size.store(0); 21 | } 22 | 23 | void set_cache_size(size_t cache_size) { 24 | _cache_size.store(cache_size); 25 | } 26 | 27 | bool acquire_cache(size_t size) { 28 | if (_acquired_size.fetch_add(size, std::memory_order_relaxed) + size > 29 | _cache_size.load(std::memory_order_relaxed)) { 30 | _acquired_size.fetch_sub(size, std::memory_order_relaxed); 31 | return false; 32 | } 33 | return true; 34 | } 35 | 36 | bool acquire_reserve_cache(size_t size) { 37 | if (3 * _acquired_size.load(std::memory_order_relaxed) < 38 | _cache_size.load(std::memory_order_relaxed)) { 39 | return acquire_cache(size); 40 | } 41 | return false; 42 | } 43 | 44 | void release_cache(size_t size) { 45 | _acquired_size.fetch_sub(size); 46 | } 47 | private: 48 | std::atomic<size_t> _cache_size = {0}; 49 | std::atomic<size_t> _acquired_size = {0}; 50 | }; 51 | 52 | static PersistManager& singleton() { 53 | static PersistManager manager; 54 | return manager; 55 | } 56 | 57 | bool use_pmem() { // server & client 58 | return !_pmem_pool_root_path.empty(); 59 | } 60 | 61 | void initialize(const std::string& path) { 62 | core::FileSystem::mkdir_p(path); 63 | _pmem_pool_root_path = path; 64 | _prefix = std::to_string(time(NULL)) + '-' + std::to_string(::getpid()); 65 | _next_pool_id.store(0); 66 | reserved_cache.initialize(); 67 | dynamic_cache.initialize(); 68 | } 69 | 70 | std::string new_pmem_pool_path() { 71 | SCHECK(use_pmem()); 72 | std::string name = std::to_string(_next_pool_id.fetch_add(1)); 73 | while (name.size() < 6) name = "0" + name; 74 | return _pmem_pool_root_path + "/" + _prefix + "-" + name; 75 | } 76 | 77 | CacheManager reserved_cache; 78 | CacheManager dynamic_cache; 79 | private: 80 | std::string _prefix; 81 | std::string _pmem_pool_root_path; 82 | std::atomic<size_t> _next_pool_id = {0}; 83 | }; 84 | 85 | } 86 | } 87 | } 88 | 89 | #endif --------------------------------------------------------------------------------
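`acquire_cache` above reserves optimistically: it adds `size` with a relaxed `fetch_add` and rolls the addition back if the budget was overshot, avoiding a compare-and-swap loop; `acquire_reserve_cache` additionally refuses once a third or more of the budget is in use. The same accounting in a standalone form (toy budget of 100):

```cpp
#include <atomic>
#include <cstdio>

std::atomic<size_t> cache_size{100};
std::atomic<size_t> acquired{0};

bool acquire_cache(size_t size) {
    // Optimistically reserve; undo the reservation if it overshoots the budget.
    if (acquired.fetch_add(size, std::memory_order_relaxed) + size >
            cache_size.load(std::memory_order_relaxed)) {
        acquired.fetch_sub(size, std::memory_order_relaxed);
        return false;
    }
    return true;
}

int main() {
    std::printf("%d\n", (int)acquire_cache(60));   // 1: 60 <= 100
    std::printf("%d\n", (int)acquire_cache(60));   // 0: would reach 120, rolled back
    std::printf("%d\n", (int)acquire_cache(40));   // 1: exactly fills the budget
}
```

The transient overshoot between the fetch_add and the fetch_sub is harmless here because `acquired` is only an admission counter, not a memory allocator.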
/openembedding/variable/VariableAsyncTask.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_ASYNC_OPERATOR_THREAD_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_ASYNC_OPERATOR_THREAD_POOL_H 3 | 4 | #include <atomic> 5 | #include <functional> 6 | #include <thread> 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class VariableAsyncTask { 13 | public: 14 | static void wait(std::atomic<size_t>& _counter) { 15 | for (int tests = 0; unlikely(_counter.load(std::memory_order_acquire)); ++tests) { 16 | if (tests < 128) { 17 | cpu_relax(); 18 | } else { 19 | static constexpr std::chrono::microseconds us0{0}; 20 | std::this_thread::sleep_for(us0); 21 | } 22 | } 23 | } 24 | 25 | VariableAsyncTask() {} 26 | VariableAsyncTask(int thread_id, std::atomic<size_t>& counter, core::RWSpinLock& shard_lock) 27 | : _thread_id(thread_id), _counter(&counter), _shard_lock(&shard_lock) {} 28 | VariableAsyncTask(const VariableAsyncTask&) = delete; 29 | VariableAsyncTask(VariableAsyncTask&& other) = default; 30 | 31 | VariableAsyncTask& operator=(VariableAsyncTask other) { 32 | SCHECK(_done == nullptr); 33 | new (this) VariableAsyncTask(std::move(other)); 34 | return *this; 35 | } 36 | 37 | ~VariableAsyncTask() {} 38 | 39 | explicit operator bool() { 40 | return _done.operator bool(); 41 | } 42 | 43 | int thread_id() { 44 | return _thread_id; 45 | } 46 | 47 | void done() { 48 | SCHECK(_done); 49 | if (_shard_lock) { 50 | core::lock_guard guard(*_shard_lock); 51 | _done(); 52 | } else { 53 | _done(); 54 | } 55 | _entity = nullptr; 56 | _done = nullptr; 57 | _counter->fetch_sub(1, std::memory_order_relaxed); 58 | } 59 | 60 | void set_done(std::function<void()>&& done) { 61 | SCHECK(_done == nullptr && _counter); 62 | if (done) { 63 | _counter->fetch_add(1, std::memory_order_relaxed); 64 | _done = std::move(done); 65 | } 66 | } 67 | 68 | void hold_entity(const std::shared_ptr<void>& entity) { 69 | _entity = entity; 70 | } 71 | 72 | private: 73 | size_t _thread_id = 0; 74 | std::atomic<size_t>* _counter = nullptr; 75 | core::RWSpinLock* _shard_lock = nullptr; 76 | std::shared_ptr<void> _entity = nullptr; 77 | std::function<void()> _done; 78 | }; 79 | 80 | class VariableAsyncTaskThreadPool { 81 | public: 82 | static VariableAsyncTaskThreadPool& singleton() { 83 | static VariableAsyncTaskThreadPool pool; 84 | return pool; 85 | } 86 | 87 | void submit(VariableAsyncTask&& async_task) { 88 | SCHECK(_initialized); 89 | core::lock_guard guard(_lock); 90 | size_t num_tasks = _num_tasks.load(std::memory_order_relaxed) + 1; 91 | _num_tasks.store(num_tasks, std::memory_order_relaxed); 92 | _tasks.push_back(std::move(async_task)); 93 | if (_tasks.size() >= _batch_num_tasks) { 94 | for (VariableAsyncTask& task: _tasks) { 95 | if (task) { 96 | _channels[task.thread_id() % _threads.size()]->send(std::move(task)); 97 | } 98 | } 99 | _tasks.clear(); 100 | } 101 | } 102 | 103 | // very ill-formed! TODO: remove 104 | void initialize_batch_task() { 105 | if (_batch_num_tasks.load(std::memory_order_relaxed) == 0 && 106 | _num_tasks.load(std::memory_order_relaxed) != 0) { 107 | core::lock_guard guard(_lock); 108 | if (_batch_num_tasks.load() == 0) { 109 | SLOG(INFO) << "set batch num tasks " << _num_tasks.load(); 110 | _batch_num_tasks.store(_num_tasks); 111 | } 112 | } 113 | } 114 | 115 | void initialize(size_t thread_num) { 116 | SCHECK(!_initialized); 117 | _initialized = true; 118 | _num_tasks.store(0); 119 | _batch_num_tasks.store(0); 120 | _threads.resize(thread_num); 121 | _channels.resize(thread_num); 122 | for (size_t i = 0; i < _threads.size(); ++i) { 123 | _channels[i] = std::make_unique<core::RpcChannel<VariableAsyncTask>>(); 124 | _threads[i] = std::thread(&VariableAsyncTaskThreadPool::running, this, i); 125 | } 126 | } 127 | 128 | void finalize() { 129 | SCHECK(_initialized); 130 | for (size_t i = 0; i < _threads.size(); ++i) { 131 | _channels[i]->terminate(); 132 | _threads[i].join(); 133 | } 134 | _initialized = false; 135 | } 136 | 137 | private: 138 | void running(size_t i) { 139 | VariableAsyncTask task; 140 | while (_channels[i]->recv(task, -1)) { 141 | // must finalize the task inside the loop 142 | VariableAsyncTask done = std::move(task); 143 | done.done(); 144 | } 145 | } 146 | 147 | bool _initialized = false; 148 | std::vector<std::thread> _threads; 149 | std::vector<std::unique_ptr<core::RpcChannel<VariableAsyncTask>>> _channels; 150 | 151 | core::RWSpinLock _lock; 152 | std::atomic<size_t> _num_tasks = {0}; 153 | std::atomic<size_t> _batch_num_tasks = {0}; 154 | std::vector<VariableAsyncTask> _tasks; 155 | }; 156 | 157 | 158 | } 159 | } 160 | } 161 | 162 | #endif 163 | --------------------------------------------------------------------------------
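`VariableAsyncTask::wait` spins for the first ~128 probes and then backs off with `sleep_for(0)`, so a counter that is about to hit zero is caught immediately without burning a core on long waits. A portable rendering of that backoff (`std::this_thread::yield()` stands in for `cpu_relax()`, which in pico-core is presumably a CPU pause hint; this sketch is not the repo's code):

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

void wait(std::atomic<size_t>& counter) {
    for (int tests = 0; counter.load(std::memory_order_acquire); ++tests) {
        if (tests < 128) {
            std::this_thread::yield();                                     // cheap spin phase
        } else {
            std::this_thread::sleep_for(std::chrono::microseconds(0));    // back off to the scheduler
        }
    }
}

int main() {
    std::atomic<size_t> pending{1};
    std::thread worker([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        pending.fetch_sub(1, std::memory_order_release);                   // async task done
    });
    wait(pending);
    std::printf("all async tasks finished\n");
    worker.join();
}
```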
/openembedding/variable/pmem_embedding_table_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "PmemEmbeddingTable.h" 3 | #include <string> 4 | 5 | namespace paradigm4 { 6 | namespace pico { 7 | namespace embedding { 8 | 9 | std::string pmem_pool_root_path = "/mnt/pmem0/tmp/exb_pmem_test"; 10 | TEST(PmemEmbeddingTable, MultipleGetAndSet) { 11 | PersistManager::singleton().initialize(pmem_pool_root_path); 12 | PmemEmbeddingArrayTable<double> pt(64, -1); 13 | PersistManager::singleton().dynamic_cache.set_cache_size(pt.cache_item_memory_cost()); 14 | 15 | size_t total_items = 5; 16 | for (size_t j = 0; j < total_items; ++j){ 17 | ASSERT_EQ(j, pt.work_id()); 18 | ASSERT_EQ(nullptr, pt.get_value(j)); 19 | double* value = pt.set_value(j); 20 | for(size_t i = 0; i < 64; ++i){ 21 | value[i] = i + j; 22 | } 23 | const double* get = pt.get_value(j); 24 | for(size_t i = 0; i < 64; ++i){ 25 | ASSERT_EQ(double(i + j), get[i]); 26 | } 27 | pt.next_work(); 28 | } 29 | ASSERT_EQ(total_items, pt.work_id()); 30 | 31 | for (size_t k = 0; k < total_items; ++k){ 32 | const double* tmp = pt.get_value(k); 33 | for(size_t i = 0; i < 64; ++i) { 34 | ASSERT_EQ(double(i + k), tmp[i]); 35 | } 36 | } 37 | 38 | pt.start_commit_checkpoint(); 39 | ASSERT_EQ(pt.checkpoints().size(), 0); 40 | pt.flush_committing_checkpoint(); 41 | ASSERT_EQ(pt.checkpoints().size(), 1); 42 | 43 | for (size_t j = 0; j < total_items; ++j){ 44 | const double* get = pt.get_value(j); 45 | for(size_t i = 0; i < 64; ++i){ 46 | ASSERT_EQ(double(i + j), get[i]); 47 | } 48 | double* value = pt.set_value(j); 49 | for(size_t i = 0; i < 64; ++i){ 50 | value[i] = i + j; 51 | } 52 | pt.next_work(); 53 | } 54 | core::FileSystem::rmrf(pmem_pool_root_path); 55 | } 56 | 57 | TEST(PmemEmbeddingTable, SingleCheckpoint) { 58 | PersistManager::singleton().initialize(pmem_pool_root_path); 59 | PmemEmbeddingHashTable<double> pt(64, -1); 60 | PersistManager::singleton().dynamic_cache.set_cache_size(pt.cache_item_memory_cost() * 5); 61 | 62 | double* tmp; 63 | EXPECT_EQ(0, pt.work_id()); 64 | EXPECT_EQ(0, pt.checkpoints().size()); 65 | 66 | for(size_t j=0; j<5; ++j){ 67 | EXPECT_EQ(j, pt.work_id()); 68 | EXPECT_EQ(nullptr, pt.get_value(j)); 69 | tmp = pt.set_value(j); 70 | for(size_t i=0; i<64; ++i){ 71 | *tmp = double(i+j); 72 | ++tmp; 73 | } 74 | tmp = (double *)pt.get_value(j); 75 | for(size_t i=0; i<64; ++i){ 76 | EXPECT_EQ(double(i+j), *tmp); 77 | ++tmp; 78 | } 79 | pt.next_work(); 80 | } 81 | EXPECT_EQ(5, pt.work_id()); 82 | pt.start_commit_checkpoint(); //_committing=5 83 | 84 | EXPECT_EQ(0, pt.checkpoints().size()); 85 | 86 | tmp = pt.set_value(0); 87 | for(size_t i=0; i<64; ++i){ 88 | *tmp = (*tmp) + 10; 89 | ++tmp; 90 | } 91 | EXPECT_EQ(5, pt.work_id()); 92 | EXPECT_EQ(0, pt.checkpoints().size()); 93 | 94 | for(int k=1; k<5; ++k){ 95 | tmp = pt.set_value(k); 96 | for(size_t i=0; i<64; ++i){ 97 | *tmp = (*tmp) + 10; 98 | ++tmp; 99 | } 100 | } 101 | pt.next_work(); 102 | EXPECT_EQ(6, pt.work_id()); 103 | EXPECT_EQ(1, pt.checkpoints().size()); 104 | 105 | tmp = pt.set_value(0); 106 | for(size_t i=0; i<64; ++i){ 107 | *tmp = (*tmp) + 10; 108 | ++tmp; 109 | } 110 | pt.next_work(); 111 | EXPECT_EQ(7, pt.work_id()); 112 | EXPECT_EQ(1, pt.checkpoints().size()); 113 | 114 | for(size_t k=0; k<100; ++k){ 115 | tmp = pt.set_value(0); 116 | for(size_t i=0; i<64; ++i){ 117 | *tmp = (*tmp) + 10; 118 | ++tmp; 119 | } 120 | //pt.next_work(); 121 | } 122 | pt.next_work(); 123 | EXPECT_EQ(8, pt.work_id()); 124 | EXPECT_EQ(1, pt.checkpoints().size()); 125 | 126 | for(int k=5; k>=0; --k){ 127 | tmp = pt.set_value(k); 128 | for(size_t i=0; i<64; ++i){ 129 | *tmp = (*tmp) + 10; 130 | ++tmp; 131 | } 132 | pt.next_work(); 133 | } 134 | EXPECT_EQ(14, pt.work_id()); 135 | EXPECT_EQ(1, pt.checkpoints().size()); 136 | 137 | if(pt.checkpoints().size()>=2){ 138 | pt.pop_checkpoint(); 139 | } 140 | pt.next_work(); 141 | EXPECT_EQ(15, pt.work_id()); 142 | EXPECT_EQ(1, pt.checkpoints().size()); 143 | 144 | core::FileSystem::rmrf(pmem_pool_root_path); 145 | } 146 | 147 | 148 | 149 | } 150 | } 151 | } 152 | 153 | int main(int argc, char* argv[]) { 154 | testing::InitGoogleTest(&argc, argv); 155 | int ret = RUN_ALL_TESTS(); 156 | return ret; 157 | } 158 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import setuptools 6 | import setuptools.command.build_ext 7 | import distutils.errors 8 | import distutils.sysconfig 9 | import openembedding_setup 10 | 11 | 12 | work_path = os.path.dirname(os.path.realpath(__file__)) + '/' 13 | cpp_flags = ['--std=c++14', '-Wall', '-Wextra', '-frecord-gcc-switches', '-fPIC'] 14 | link_flags = ['-lcexb_pack', '-L' + work_path + 'openembedding'] 15 | libexb = setuptools.Extension('openembedding.libexb', []) 16 | tensorflow_exb_ops = setuptools.Extension('openembedding.tensorflow.exb_ops', []) 17 | 18 | 19 | class custom_build_ext(setuptools.command.build_ext.build_ext): 20 | def build_extensions(self): 21 | self.build_core_extension() 22 | self.build_tensorflow_extension() 23 | 24 | def build_core_extension(self): 25 | import pybind11 26 | libexb.sources = ['openembedding/entry/py_api.cc'] 27 | libexb.extra_compile_args = cpp_flags + ['-I' + pybind11.get_include()] 28 | libexb.extra_link_args = link_flags 29 | distutils.sysconfig.customize_compiler(self.compiler) 30 | self.build_extension(libexb) 31 | 32 | def build_tensorflow_extension(self): 33 | import tensorflow as tf 34 | tensorflow_exb_ops.sources = ['openembedding/tensorflow/exb_ops.cpp'] 35 | tensorflow_exb_ops.extra_compile_args = cpp_flags + tf.sysconfig.get_compile_flags() 36 | tensorflow_exb_ops.extra_link_args = link_flags + tf.sysconfig.get_link_flags() 37 | distutils.sysconfig.customize_compiler(self.compiler) 38 | self.build_extension(tensorflow_exb_ops) 39 | 40 | 41 | import textwrap 42 | setuptools.setup( 43 | name='openembedding', 44 | version=openembedding_setup.__version__, 45 | description='Distributed framework to accelerate training and support serving.', 46 | author='4paradigm', 47 | author_email='opensource@4paradigm.com', 48 | long_description=textwrap.dedent('''\ 49 | OpenEmbedding is a distributed framework to accelerate TensorFlow training and 50 | support TensorFlow Serving. It uses the parameter server architecture to store 51 | the Embedding Layer, so that the memory of a single machine does not limit the model size. 52 | OpenEmbedding can cooperate with all-reduce frameworks to support both data parallelism 53 | and model parallelism.'''), 54 | url='https://github.com/4paradigm/OpenEmbedding', 55 | keywords=['deep learning', 'tensorflow', 'keras', 'AI'], 56 | classifiers=[ 57 | 'Programming Language :: Python :: 3', 58 | 'Development Status :: 2 - Pre-Alpha', 59 | 'Operating System :: POSIX :: Linux', 60 | 'License :: OSI Approved :: Apache Software License'], 61 | python_requires='>=3.6', 62 | setup_requires=['pybind11'], 63 | extras_require={'tensorflow':['tensorflow']}, 64 | packages=setuptools.find_packages(), 65 | package_data={'': [work_path + 'openembedding/libcexb_pack.so']}, 66 | ext_modules=[libexb, tensorflow_exb_ops], 67 | cmdclass={'build_ext': custom_build_ext}) 68 | -------------------------------------------------------------------------------- /test/benchmark/criteo_deepctr_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas 3 | import torch 4 | import horovod.torch as hvd 5 | from deepctr_torch.inputs import SparseFeat, DenseFeat 6 | from deepctr_torch.models import WDL, DeepFM, xDeepFM 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--data', required=True) 11 | parser.add_argument('--optimizer', default='Adagrad', choices=['Adagrad']) 12 | parser.add_argument('--model', default="DeepFM", choices=["WDL", 'DeepFM', 'xDeepFM']) 13 | parser.add_argument('--embedding_dim', default=9, type=int) 14 | parser.add_argument('--batch_size', default=4096, type=int) 15 | parser.add_argument('--epochs', default=2, type=int) 16 | parser.add_argument('--cpu', action='store_true') 17 | args = parser.parse_args() 18 | hvd.init() 19 | if args.cpu: 20 | device = 'cpu' 21 | else: 22 | #torch.cuda.set_device(hvd.local_rank()) 23 | device = 'cuda:{}'.format(hvd.local_rank()) 24 | 25 | if __name__ == "__main__": 26 | data = pandas.read_csv(args.data) 27 | num_lines = data.shape[0] 28 | num_local_lines = int(num_lines / hvd.size()) // args.batch_size * args.batch_size 29 | local_start = hvd.local_rank() * num_local_lines 30 | local_end = local_start + num_local_lines 31 | print("num_lines:%d, num_local_lines:%d" % (num_lines, num_local_lines)) 32 | print("local_start:%d, local_end:%d" % (local_start, local_end)) 33 | 34 | target = ['label'] 35 | dense_features = ['I' + str(i) for i in range(1, 14)] 36 | sparse_features = ['C' + str(i) for i in range(1, 27)] 37 | print(data.columns) 38 | 39 | feature_columns = [] 40 | for name in sparse_features: 41 | feature_columns.append(SparseFeat(name, data[name].max() + 1, dtype='int64')) 42 | for name in dense_features: 43 | feature_columns.append(DenseFeat(name, 1, dtype='float32')) 44 | train = data.iloc[local_start:local_end] 45 | train_model_input = {name:train[name] for name in sparse_features + dense_features} 46 | 47 | if args.model == 'WDL': 48 | fc_sizes = (512, 256, 128, 32) 49 | elif args.model in {'DeepFM', 'xDeepFM'}: 50 | fc_sizes = (400, 400, 400) 51 | else: 52 | raise ValueError('unknown model ' + args.model) 53 | model = eval(args.model)(feature_columns, feature_columns, device=device, 54 | task='binary', dnn_hidden_units=fc_sizes, l2_reg_linear=0, l2_reg_embedding=0) 55 | 56 | optimizer = torch.optim.Adagrad(model.parameters()) 57 | if hvd.size() > 1: 58 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), op=hvd.Sum) 59 | hvd.broadcast_optimizer_state(optimizer, root_rank=0) 60 |
hvd.broadcast_parameters(model.state_dict(), root_rank=0) 61 | model.compile(optimizer, "binary_crossentropy", metrics=["binary_crossentropy", "auc"]) 62 | history = model.fit(train_model_input, train[target].values, 63 | batch_size=args.batch_size, epochs=args.epochs, verbose=2) 64 | -------------------------------------------------------------------------------- /test/benchmark/criteo_tfrecord.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas 3 | import tensorflow as tf 4 | 5 | if len(sys.argv) < 4: 6 | print('usage: criteo_tfrecord.py input pid np') 7 | sys.exit(1) 8 | def serialize_example(train, j): 9 | fea_desc = {} 10 | for name, column in train.items(): 11 | if name[0] == 'I': 12 | # dense feature 13 | fea_desc[name] = tf.train.Feature(float_list=tf.train.FloatList(value=[float(column[j])])) 14 | else: 15 | # label or sparse feature 16 | fea_desc[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(column[j])])) 17 | example_proto = tf.train.Example(features=tf.train.Features(feature=fea_desc)) 18 | return example_proto.SerializeToString() 19 | 20 | data = pandas.read_csv(sys.argv[1]) 21 | pid = int(sys.argv[2]) 22 | np = int(sys.argv[3]) 23 | 24 | target = ['label'] 25 | dense_features = ['I' + str(i) for i in range(1, 14)] 26 | sparse_features = ['C' + str(i) for i in range(1, 27)] 27 | columns = target + dense_features + sparse_features 28 | train = {name:data[name] for name in columns} 29 | 30 | count = 1000000 31 | for start in range(count * pid, data.shape[0], count * np): 32 | end = start + count 33 | if end > data.shape[0]: 34 | end = data.shape[0] 35 | name = str(start // count + 1) 36 | while len(name) < 5: 37 | name = '0' + name 38 | with tf.io.TFRecordWriter("./tfrecord/tf-part.{}".format(name)) as writer: 39 | for j in range(start, end): 40 | example = serialize_example(train, j) 41 | writer.write(example) 42 | 43 | if pid == 0: 44 | with open('./tfrecord/meta', 'w') as writer: 45 | for name in sparse_features: 46 | writer.write('{} {}\n'.format(name, data[name].max() + 1)) 47 | -------------------------------------------------------------------------------- /test/benchmark/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import psutil 5 | from threading import Thread 6 | import openembedding as embed 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--bind_ip', default='') 11 | parser.add_argument('--server_concurrency', default=28, type=int) 12 | 13 | # For the paper experiments 14 | parser.add_argument('--pmem', default='') 15 | parser.add_argument('--cache_size', default=1000, type=int) 16 | 17 | args = parser.parse_args() 18 | if args.pmem: 19 | embed.flags.config = ('{"server":{"server_concurrency":%d' 20 | ',"pmem_pool_root_path":"%s", "cache_size":%d } }') % ( 21 | args.server_concurrency, args.pmem, args.cache_size) 22 | else: 23 | embed.flags.config = '{"server":{"server_concurrency":%d } }' % ( 24 | args.server_concurrency) 25 | 26 | 27 | def print_rss(): 28 | print(psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024, 'GB', flush=True) 29 | 30 | 31 | def start(): 32 | if len(sys.argv) > 1: 33 | embed.flags.bind_ip = args.bind_ip 34 | _master = embed.Master() 35 | print(_master.endpoint) 36 | embed.flags.bind_ip = embed.flags.bind_ip[:embed.flags.bind_ip.find(':')] 37 | embed.flags.master_endpoint = _master.endpoint 38 | _server = embed.Server() 39 | _server.join() 40 | 41 | 42 | i = 0 43 | print_rss() 44 | th = Thread(target=start, args=[]) 45 | th.start() 46 | while th.is_alive(): 47 | i += 1 48 | th.join(0.1) 49 | if i % 100 == 0: 50 | print_rss() 51 | print_rss() -------------------------------------------------------------------------------- /test/criteo_preprocess.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <gflags/gflags.h> 3 | #include <limits> 4 | 5 | namespace paradigm4 { 6 | namespace pico { 7 | 8 | class LabelEncoder { 9 | public: 10 | LabelEncoder(): _encoder(-1) {} 11 | size_t encode(int64_t key) { 12 | if (key == -1) { 13 | key = std::numeric_limits<int64_t>::max(); 14 | } 15 | return _encoder.try_emplace(key, _encoder.size()).first->second; 16 | } 17 | 18 | size_t unique_count() { 19 | return _encoder.size(); 20 | } 21 | private: 22 | EasyHashMap<int64_t, size_t> _encoder; 23 | }; 24 | 25 | class TSVProcesser { 26 | public: 27 | TSVProcesser(size_t dense_features, size_t sparse_features, size_t repeat) 28 | : _dense_features(dense_features), _sparse_features(sparse_features), _repeat(repeat), 29 | _encoders(sparse_features), _key_labels(sparse_features), 30 | _buffer(64 * (sparse_features + dense_features + 1)), _out_buffer(_buffer.size()) {} 31 | 32 | size_t process(FILE* in, FILE* out) { 33 | if (!fgets(_buffer.data(), _buffer.size(), in)) return 0; 34 | size_t n = strlen(_buffer.data()); 35 | if (n == 0) { 36 | return 0; 37 | } 38 | size_t i = 0; 39 | for (size_t k = 0; k < _dense_features + 1; ++k) { 40 | skip_dense(_buffer.data(), i); 41 | ++i; 42 | } 43 | size_t sparse_start = i; 44 | memcpy(_out_buffer.data(), _buffer.data(), sparse_start); 45 | for (size_t k = 0; k < _sparse_features; ++k) { 46 | uint64_t key = parse_sparse(_buffer.data(), i); 47 | _key_labels[k] = _encoders[k].encode(key); 48 | ++i; 49 | } 50 | for (size_t row = 0; row < _repeat; ++row) { 51 | size_t i = sparse_start; 52 | for (size_t k = 0; k < _sparse_features; ++k) { 53 | output_sparse(_out_buffer.data(), i, _key_labels[k] * _repeat + row); 54 | _out_buffer[i] = k == _sparse_features - 1 ? '\0' : '\t'; 55 | ++i; 56 | } 57 | fprintf(out, "%s\n", _out_buffer.data()); 58 | } 59 | return _repeat; 60 | } 61 | 62 | void skip_dense(char* buffer, size_t& i) { 63 | while (buffer[i] && buffer[i] != '\t') ++i; 64 | } 65 | 66 | int64_t parse_sparse(char* buffer, size_t& i) { 67 | if (buffer[i] == '\0' || buffer[i] == '\t') { 68 | return -1; 69 | } 70 | int64_t result = 0; 71 | while (buffer[i] && buffer[i] != '\t') { 72 | int val = buffer[i] <= '9' && buffer[i] >= '0' ? buffer[i] - '0' : buffer[i] - 'a' + 10; 73 | result = result * 16 + val; 74 | ++i; 75 | } 76 | return result; 77 | } 78 | 79 | void output_sparse(char* buffer, size_t& i, size_t key) { 80 | size_t p = i; 81 | do { 82 | buffer[i] = '0' + key % 10; 83 | key /= 10; 84 | ++i; 85 | } while (key != 0); 86 | std::reverse(buffer + p, buffer + i); 87 | } 88 | 89 | size_t unique_count(size_t sparse_feature) { 90 | return _encoders[sparse_feature].unique_count() * _repeat; 91 | } 92 | private: 93 | size_t _dense_features = 0; 94 | size_t _sparse_features = 0; 95 | size_t _repeat = 0; 96 | 97 | std::vector<LabelEncoder> _encoders; 98 | std::vector<size_t> _key_labels; 99 | 100 | std::vector<char> _buffer; 101 | std::vector<char> _out_buffer; 102 | 103 | }; 104 | 105 | 106 | void process(std::string input_dir, std::string output_dir, size_t file_lines, size_t repeat) { 107 | int day = 1; 108 | size_t lines = 0; 109 | auto fout = core::ShellUtility::open(output_dir + "/day_" + std::to_string(day), "w"); 110 | TSVProcesser processer(13, 26, repeat); 111 | for (std::string input_file: FileSystem::get_file_list(input_dir, "")) { 112 | SLOG(INFO) << input_file; 113 | auto fin = core::ShellUtility::open(input_file, "r"); 114 | while (!feof(fin.get())) { 115 | if (lines >= file_lines) { 116 | SLOG(INFO) << "day_" << day << " generated"; 117 | ++day; 118 | lines = 0; 119 | fout = core::ShellUtility::open(output_dir + "/day_" + std::to_string(day), "w"); 120 | } 121 | lines += processer.process(fin.get(), fout.get()); 122 | } 123 | } 124 | SLOG(INFO) << "day_" << day << " generated"; 125 | 126 | fout = core::ShellUtility::open(output_dir + "/meta", "w"); 127 | for (int i = 0; i < 26; ++i) { 128 | std::string str = pico_lexical_cast<std::string>(processer.unique_count(i)); 129 | fprintf(fout.get(), "C%d %s\n", i + 1, str.c_str()); 130 | } 131 | } 132 | 133 | } 134 | } // namespace paradigm4 135 | 136 | DEFINE_string(output, "", ""); 137 | DEFINE_string(input, "", ""); 138 | DEFINE_int32(file_lines, 10000000, ""); 139 | DEFINE_int32(repeat, 2, ""); 140 | 141 | int main(int argc, char* argv[]) { 142 | google::ParseCommandLineFlags(&argc, &argv, false); 143 | paradigm4::pico::process(FLAGS_input, FLAGS_output, FLAGS_file_lines, FLAGS_repeat); 144 | return 0; 145 | } --------------------------------------------------------------------------------
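The `LabelEncoder` above assigns each distinct (hash-valued) Criteo feature a dense id in first-seen order, remapping -1 away because it is the hash map's reserved empty key. The same idea standalone, with `std::unordered_map` in place of `EasyHashMap`:

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>
#include <unordered_map>

class LabelEncoder {
public:
    size_t encode(int64_t key) {
        if (key == -1) {
            key = std::numeric_limits<int64_t>::max();  // keep -1 usable as a sentinel
        }
        // First occurrence gets the next dense id; repeats return the stored id.
        return _encoder.emplace(key, _encoder.size()).first->second;
    }
    size_t unique_count() const { return _encoder.size(); }
private:
    std::unordered_map<int64_t, size_t> _encoder;
};

int main() {
    LabelEncoder enc;
    std::printf("%zu %zu %zu %zu\n",
            enc.encode(0xabc), enc.encode(-1), enc.encode(0xabc), enc.encode(42));
    // prints: 0 1 0 2
}
```

Dense first-seen ids are exactly what the preprocessor needs, since downstream the embedding table is sized by the per-column `unique_count`.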
/test/optimizer_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | import openembedding.tensorflow as embed 4 | 5 | 6 | def run_tf_optimizer(optimizer, gradients): 7 | optimizer = optimizer.__class__.from_config(optimizer.get_config()) 8 | var = tf.Variable(tf.ones(gradients[0].shape, gradients[0].dtype)) 9 | for grad in gradients: 10 | optimizer.apply_gradients([(grad, var)]) 11 | return var.read_value() 12 | 13 | 14 | def run_my_optimizer(optimizer, gradients): 15 | var = embed.Embedding(gradients[0].shape[0], gradients[0].shape[1], 16 | tf.keras.initializers.Constant(1.0), dtype=gradients[0].dtype) 17 | indices = tf.range(var.input_dim) 18 | var.build(indices.shape) 19 | var.variable.set_server_optimizer(optimizer) 20 | for grad in gradients: 21 | fakegrad = var.variable.push_gradients(indices, grad) 22 | var.variable.update_weights(fakegrad) 23 | return var.variable.sparse_read(indices) 24 | 25 | from tensorflow.keras.optimizers import * 26 | 27 | gradients1d = [ tf.ones([1, 1], dtype=tf.float64) ] 28 | gradients10d = [ tf.random.uniform([111, 11], -1, 1, dtype=tf.float64) for i in range(10) ] 29 | gradients100d = [ tf.random.uniform([111, 11], -1, 1, dtype=tf.float64) for i in range(100) ] 30 | gradients1 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients1d ] 31 | gradients10 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients10d ] 32 | gradients100 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients100d ] 33 | optimizers = [ 34 | Adadelta(), Adadelta(0.1), Adadelta(0.1, rho=0.8), 35 | Adagrad(), Adagrad(0.1), Adagrad(0.1, 1000), 36 | Adam(), Adam(0.1), Adam(0.1, beta_1=0.8, beta_2=0.97), 37 | Adamax(), Adamax(0.1), Adamax(0.1, beta_1=0.8, beta_2=0.97), 38 | Ftrl(), Ftrl(0.1), 39 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.05, 'Ftrl', 0), 40 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.05, 'Ftrl', 0, 0.05), 41 | Ftrl(0.1, -0.5, 0.1, 0.00, 0.05, 'Ftrl', 0), 42 | Ftrl(0.1, -0.5, 0.1, 0.00, 0.05, 'Ftrl', 0, 0.1), 43 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.01, 'Ftrl', 0.05), 44 | Ftrl(0.1, -0.5, 0.1, 0.05, 0.00, 'Ftrl', 0), 45 | Ftrl(0.1, -0.5, 10, 0.00, 0.05, 'Ftrl', 0), 46 | Ftrl(0.1, -0.5, 10, 0.00, 0.05, 'Ftrl', 0, 0.5), 47 | Ftrl(0.1, -0.5, 10, 0.01, 0.01, 'Ftrl', 0.05), 48 | Ftrl(0.1, -0.5, 10, 0.05, 0.01, 'Ftrl', 0.05), 49 | RMSprop(), RMSprop(0.1, rho=0.8), RMSprop(0.1, momentum=0.5), RMSprop(rho=0.7, momentum=0.7), 50 | SGD(), SGD(0.1), SGD(momentum=0.5) 51 | ] 52 | 53 | 54 | all_results = [] 55 | for gradients in [gradients1, gradients1d, gradients10, gradients10d, gradients100, gradients100d]: 56 | results = [] 57 | for optimizer in optimizers: 58 | A = run_tf_optimizer(optimizer, gradients) 59 | B = run_my_optimizer(optimizer, gradients) 60 | row = A.shape[0] - 1 61 | col = A.shape[1] - 1 62 | error = tf.reduce_sum(tf.reduce_sum(tf.abs(A - B))) 63 | results.append((float(error), optimizer.get_config()['name'], 64 | float(A[0][0]), float(B[0][0]), float(A[row][col]), float(B[row][col]))) 65 | all_results.append(sorted(results, key=lambda x: x[0])) 66 | 67 | for results in all_results: 68 | for result in results: 69 | if result[0] > 10.0: 70 | print("error! ", result, file=sys.stderr) 71 | sys.exit(1) 72 | print(result) 73 | print() 74 | --------------------------------------------------------------------------------