├── .dockerignore ├── .github └── workflows │ ├── build.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── README_cn.md ├── build.sh ├── docker ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.build ├── Dockerfile.nccl └── docker_build.sh ├── documents ├── cn │ ├── benchmark.md │ ├── serving.md │ └── training.md ├── en │ ├── benchmark.md │ ├── pmem.md │ ├── serving.md │ ├── train.md │ └── training.md ├── images │ ├── benchmark-server.png │ ├── benchmark.png │ ├── pmem_vs_dram_oe.png │ ├── serving.drawio.png │ ├── standalone.drawio.png │ └── training.drawio.png └── papers │ └── openembedding_icde2023.pdf ├── examples ├── criteo_deepctr_hook.py ├── criteo_deepctr_network.py ├── criteo_deepctr_network_mirrored.py ├── criteo_deepctr_network_mpi.py ├── criteo_lr_subclass.py ├── criteo_preprocess.py ├── run │ ├── criteo_deepctr_checkpoint.sh │ ├── criteo_deepctr_horovod.sh │ ├── criteo_deepctr_mirrored.sh │ ├── criteo_deepctr_mpi.sh │ ├── criteo_deepctr_restful.sh │ ├── criteo_deepctr_standalone.sh │ └── criteo_preprocess.sh ├── tensorflow_serving_client.py ├── tensorflow_serving_restful.py ├── train100.csv └── wide100.csv ├── laboratory ├── benchmark │ ├── Dockerfile │ ├── analyze.py │ ├── benchmark.Dockerfile │ ├── benchmark.py │ ├── criteo_sample.txt │ ├── deepctr_criteo_model.py │ ├── parse_tensor_board.py │ ├── summary.py │ └── tensornet.Dockerfile ├── inject │ ├── Dockerfile │ ├── inject.sh │ ├── network_model.py │ ├── openembedding_inject_tensorflow.py │ ├── python │ └── sitecustomize.py ├── onnx │ └── criteo_deepctr_torch.py ├── publish-serving.sh └── strangedemo │ ├── Dockerfile.criteo │ ├── Dockerfile.push │ ├── criteo_deepctr │ ├── criteo_deepctr.ipynb │ └── criteo_deepctr.py │ ├── criteo_deepctr_np │ ├── criteo_deepctr.py │ ├── criteo_deepctr_np.ipynb │ └── horovod_criteo_deepctr.py │ ├── criteo_lr │ ├── criteo_lr.ipynb │ └── criteo_lr.py │ ├── criteo_predict.py │ └── hook │ ├── Dockerfile │ ├── install.sh │ ├── mlcompile │ ├── mlrun │ └── openembedding_hook_tensorflow.py ├── openembedding ├── CMakeLists.txt ├── __init__.py ├── client │ ├── Communication.cpp │ ├── Communication.h │ ├── Connection.cpp │ ├── Connection.h │ ├── EmbeddingVariableHandle.cpp │ ├── EmbeddingVariableHandle.h │ ├── EnvConfig.cpp │ ├── EnvConfig.h │ ├── Model.cpp │ ├── Model.h │ ├── ModelController.cpp │ ├── ModelController.h │ ├── ObjectPool.h │ ├── WorkerContext.cpp │ └── WorkerContext.h ├── entry │ ├── c_api.cc │ ├── c_api.h │ ├── c_api_ha_test.cpp │ ├── c_api_test.cpp │ ├── c_api_test.h │ ├── controller.cc │ ├── controller.proto │ ├── masterd.cc │ ├── pmem_c_api_test.cpp │ ├── py_api.cc │ └── server.cc ├── server │ ├── EmbeddingDumpOperator.cpp │ ├── EmbeddingDumpOperator.h │ ├── EmbeddingInitOperator.cpp │ ├── EmbeddingInitOperator.h │ ├── EmbeddingLoadOperator.cpp │ ├── EmbeddingLoadOperator.h │ ├── EmbeddingPullOperator.cpp │ ├── EmbeddingPullOperator.h │ ├── EmbeddingPushOperator.cpp │ ├── EmbeddingPushOperator.h │ ├── EmbeddingRestoreOperator.cpp │ ├── EmbeddingRestoreOperator.h │ ├── EmbeddingShardFile.h │ ├── EmbeddingStorage.h │ ├── EmbeddingStoreOperator.cpp │ ├── EmbeddingStoreOperator.h │ └── RpcView.h ├── tensorflow │ ├── CMakeLists.txt │ ├── Prefetch.h │ ├── ThreadPool.h │ ├── __init__.py │ ├── exb.py │ └── exb_ops.cpp └── variable │ ├── DataType.h │ ├── EmbeddingInitializer.h │ ├── EmbeddingItemPool.h │ ├── EmbeddingOptimizer.h │ ├── EmbeddingOptimizerVariable.h │ ├── EmbeddingTable.h │ ├── EmbeddingVariable.cpp │ ├── 
EmbeddingVariable.h │ ├── Factory.h │ ├── Meta.h │ ├── MpscGradientReducer.h │ ├── PersistManager.h │ ├── PmemEmbeddingItemPool.h │ ├── PmemEmbeddingOptimizerVariable.h │ ├── PmemEmbeddingTable.h │ ├── VariableAsyncTask.h │ └── pmem_embedding_table_test.cpp ├── setup.py └── test ├── benchmark ├── criteo_deepctr.py ├── criteo_deepctr_torch.py ├── criteo_tfrecord.py └── server.py ├── criteo_preprocess.cpp └── optimizer_test.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | tmp 3 | build 4 | tools 5 | pico-ps/build 6 | pico-ps/pico-core/build 7 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | services: 13 | registry: 14 | image: registry:2 15 | ports: 16 | - 5000:5000 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v2 20 | with: 21 | submodules: 'recursive' 22 | token: ${{ secrets.CHECKOUT_TOKEN }} 23 | - name: Set up Docker Buildx 24 | id: buildx 25 | uses: docker/setup-buildx-action@v1 26 | - name: docker build 27 | run: | 28 | docker/docker_build.sh 29 | - name: docker image 30 | run: | 31 | docker/docker_build.sh image 32 | - name: docker test 33 | run: | 34 | docker/docker_build.sh test 35 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.[0-9]+.[0-9]+ 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v2 14 | with: 15 | submodules: 'recursive' 16 | token: ${{ secrets.CHECKOUT_TOKEN }} 17 | - name: Get branch name 18 | uses: nelonoel/branch-name@v1.0.1 19 | - name: Set up Docker Buildx 20 | id: buildx 21 | uses: docker/setup-buildx-action@v1 22 | - name: Login to DockerHub 23 | uses: docker/login-action@v1 24 | with: 25 | username: ${{ secrets.DOCKERHUB_USERNAME }} 26 | password: ${{ secrets.DOCKERHUB_TOKEN }} 27 | - name: docker build 28 | run: | 29 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh 30 | - name: docker image 31 | run: | 32 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh image 33 | - name: docker test 34 | run: | 35 | VERSION=${BRANCH_NAME:1} docker/docker_build.sh test 36 | - name: docker push 37 | run: | 38 | docker push 4pdosc/openembedding:${BRANCH_NAME:1} 39 | docker tag 4pdosc/openembedding:${BRANCH_NAME:1} 4pdosc/openembedding:latest 40 | docker push 4pdosc/openembedding:latest 41 | - name: pypi upload 42 | env: 43 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 44 | TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} 45 | run: | 46 | pip3 install twine 47 | twine upload output/dist/openembedding-${BRANCH_NAME:1}.tar.gz 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## General 2 | 3 | # Compiled Object files 4 | *.slo 5 | *.lo 6 | *.o 7 | *.cuo 8 | 9 | # Compiled Dynamic libraries 10 | *.so 11 | *.dylib 12 | *.pyd 13 | 14 | # Compiled Static libraries 15 | *.lai 16 | *.la 17 | *.a 18 | 19 | # Compiled protocol buffers 20 | *.pb.h 21 | *.pb.cc 22 | *_pb2.py 23 | *_pb.py 24 | *_pb2.pyi 25 | 
*_pb.pyi
26 | 
27 | # Compiled python
28 | *.pyc
29 | 
30 | # Compiled MATLAB
31 | *.mex*
32 | 
33 | # IPython notebook checkpoints
34 | .ipynb_checkpoints
35 | 
36 | # Editor temporaries
37 | *.swn
38 | *.swo
39 | *.swp
40 | *~
41 | 
42 | # Sublime Text settings
43 | *.sublime-workspace
44 | *.sublime-project
45 | 
46 | # Eclipse Project settings
47 | *.*project
48 | .settings
49 | 
50 | # QtCreator files
51 | *.user
52 | 
53 | # PyCharm files
54 | .idea
55 | 
56 | # Visual Studio Code files
57 | .vscode
58 | 
59 | # OSX dir files
60 | .DS_Store
61 | 
62 | 
63 | 
64 | CMakeCache.txt
65 | CMakeFiles
66 | build.config
67 | build
68 | build_*
69 | build-debug
70 | build-release
71 | core.*
72 | lib
73 | output
74 | tmp
75 | tools
76 | 
77 | *.log
78 | *.tar.gz
79 | 
80 | .build_debug/*
81 | .build_release/*
82 | .setuptools-cmake-build/*
83 | .unittest_tmp/
84 | .ycm_extra_conf.py
85 | 
86 | virtualenv
87 | venv
88 | .envrc
89 | .psenvrc
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "parameter-server"]
2 | 	path = pico-ps
3 | 	url = https://github.com/4paradigm/parameter-server.git
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.10)
2 | 
3 | if (NOT OPENEMBEDDING_VERSION)
4 |     set(OPENEMBEDDING_VERSION 0.0.0)
5 | endif()
6 | 
7 | project(openembedding VERSION ${OPENEMBEDDING_VERSION})
8 | set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${PROJECT_SOURCE_DIR}/pico-ps/cmake ${PROJECT_SOURCE_DIR}/pico-ps/pico-core/cmake)
9 | 
10 | if (NOT PYTHON)
11 |     set(PYTHON "python3")
12 | endif()
13 | 
14 | if(THIRD_PARTY)
15 |     set(CMAKE_PREFIX_PATH "${THIRD_PARTY}")
16 |     message(STATUS "THIRD_PARTY=${THIRD_PARTY}")
17 |     include_directories(SYSTEM ${THIRD_PARTY}/include)
18 |     link_directories(${THIRD_PARTY}/lib ${THIRD_PARTY}/lib64)
19 |     set(OPENSSL_ROOT_DIR ${THIRD_PARTY}/lib64)
20 | endif()
21 | execute_process(COMMAND ${PYTHON} -c "import sysconfig; print(sysconfig.get_paths()['include'], end='')" OUTPUT_VARIABLE PYTHON_INCLUDE)
22 | include_directories(SYSTEM ${PYTHON_INCLUDE})
23 | message(STATUS "PYTHON_INCLUDE=${PYTHON_INCLUDE}")
24 | 
25 | # check gcc version
26 | if(CMAKE_COMPILER_IS_GNUCXX)
27 |     execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
28 |     message(STATUS "gcc ${GCC_VERSION}")
29 |     if(GCC_VERSION VERSION_GREATER 7 OR GCC_VERSION VERSION_EQUAL 7)
30 |         message(STATUS "C++14 activated.")
31 |     else()
32 |         message(FATAL_ERROR "gcc version should be compatible with tensorflow")
33 |     endif()
34 | else()
35 |     message(FATAL_ERROR "only gcc supported")
36 | endif()
37 | 
38 | add_definitions(--std=c++14 -Wall -Wextra -Wno-deprecated-declarations -Werror -frecord-gcc-switches -fPIC)
39 | include_directories(${PROJECT_SOURCE_DIR})
40 | 
41 | option(USE_RDMA "whether build with rdma support" OFF)
42 | if (USE_RDMA)
43 |     add_definitions(-DUSE_RDMA)
44 |     set(RDMA_LIBRARIES rdmacm ibverbs)
45 |     message(STATUS "RDMA enabled")
46 | else()
47 |     message(STATUS "RDMA disabled")
48 |     set(RDMA_LIBRARIES )
49 | endif()
50 | 
51 | option(USE_DCPMM "whether build with dcpmm support" OFF)
52 | if (USE_DCPMM)
53 |     add_definitions(-DUSE_DCPMM)
54 |     find_package(PMEM REQUIRED)
55 |     message(STATUS "DCPMM enabled")
56 | else()
57 |     message(STATUS "DCPMM disabled")
58 | endif()
59 | 
60 | 
61 | if (DEBUG)
62 |     add_definitions(-O0 -g)
63 | else()
64 |     #add_definitions(-O0 -g)
65 |     #add_definitions(-O3 -DNDEBUG)
66 |     add_definitions(-O3 -g -DNDEBUG -DEIGEN_NO_DEBUG) #perf
67 | endif()
68 | 
69 | set(CMAKE_SHARED_LINKER_FLAGS "-pthread -Wl,--whole-archive -lrt -Wl,--no-whole-archive")
70 | set(CMAKE_EXE_LINKER_FLAGS "-pthread -Wl,--whole-archive -lrt -Wl,--no-whole-archive")
71 | 
72 | add_definitions(-DOPENEMBEDDING_VERSION="${PROJECT_VERSION}")
73 | 
74 | find_package(Jemalloc REQUIRED)
75 | find_package(PicoCoreDep REQUIRED)
76 | 
77 | enable_testing()
78 | add_subdirectory(openembedding)
79 | 
80 | file(GLOB_RECURSE WHL_SRC LICENSE README.md setup.py MANIFEST.in openembedding/tensorflow/*.py openembedding/*.py)
81 | set(HYPEREMBEDDING_OUT ${CMAKE_CURRENT_BINARY_DIR}/openembedding-${PROJECT_VERSION}.tar.gz)
82 | add_custom_command(
83 |     OUTPUT ${HYPEREMBEDDING_OUT}
84 |     DEPENDS ${WHL_SRC} cexb_pack
85 |     COMMAND rm -rf pypi
86 |     COMMAND mkdir -p pypi
87 |     COMMAND echo __version__ = \\\'${PROJECT_VERSION}\\\' > pypi/openembedding_setup.py
88 |     COMMAND cd ${PROJECT_SOURCE_DIR} && cp -r LICENSE README.md setup.py MANIFEST.in openembedding ${CMAKE_CURRENT_BINARY_DIR}/pypi
89 |     COMMAND cp openembedding/libcexb_pack.so pypi/openembedding
90 |     COMMAND cd pypi && ${PYTHON} setup.py sdist
91 |     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
92 | )
93 | add_custom_target(pip_package ALL DEPENDS ${HYPEREMBEDDING_OUT})
94 | 
95 | add_executable(criteo_preprocess test/criteo_preprocess.cpp)
96 | target_link_libraries(criteo_preprocess pico_core ${PicoCoreDep_LIBRARIES} ${Jemalloc_pic_LIBRARIES})
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE openembedding_setup.py
2 | recursive-include openembedding *.h *.cpp *.cc
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:2.5.1-gpu
2 | # remove tensorflow docker logo to avoid confusion
3 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake mpich vim wget curl
4 | RUN HOROVOD_WITHOUT_MPI=1 pip3 install mpi4py horovod
5 | RUN pip3 install pandas scikit-learn deepctr
6 | ADD . /openembedding
7 | RUN pip3 install /openembedding/output/dist/openembedding-*.tar.gz
8 | WORKDIR /openembedding
--------------------------------------------------------------------------------
/docker/Dockerfile.base:
--------------------------------------------------------------------------------
1 | # to use glibc 2.12 and dt7 which have the same system compatibility as tensorflow
2 | FROM tensorflow/tensorflow:2.3.0-custom-op-ubuntu16
3 | 
4 | RUN cd /dt7/usr/bin && ln -s gcc cc && cd /
5 | 
6 | # use glibc 2.12
7 | ADD pico-ps/pico-core/third_party /third_party
8 | 
9 | RUN third_party/prepare.sh build cmake
10 | RUN PATH=/dt7/usr/bin:$PATH prefix=/tools third_party/prepare.sh build \
11 |     gflags glog googletest sparsehash zlib snappy lz4 boost yaml jemalloc prometheus-cpp \
12 |     avro-cpp zookeeper protobuf leveldb openssl brpc && cd ..
--------------------------------------------------------------------------------
/docker/Dockerfile.build:
--------------------------------------------------------------------------------
1 | FROM 4pdosc/openembedding-base:0.1.0
2 | 
3 | # only c api is tested here
4 | ADD . /openembedding
5 | ARG VERSION=0.0.0
6 | RUN pip3 install pybind11
7 | RUN PATH=/dt7/usr/bin:$PATH prefix=/tools /openembedding/pico-ps/pico-core/third_party/prepare.sh build eigen
8 | RUN cd /openembedding && \
9 |     PATH=/dt7/usr/bin:$PATH SKIP_CHECK_WHEEL_SETUP=1 VERSION=${VERSION} THIRD_PARTY=/tools ./build.sh
10 | RUN cd /openembedding/build && make test
--------------------------------------------------------------------------------
/docker/Dockerfile.nccl:
--------------------------------------------------------------------------------
1 | FROM tensorflow/tensorflow:latest-gpu
2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake \
3 |     openssh-client openmpi-bin libopenmpi-dev vim wget curl \
4 |     build-essential devscripts debhelper fakeroot
5 | RUN NCCL=2.9.9-1 && mkdir nccl && cd nccl && \
6 |     wget https://github.com/NVIDIA/nccl/archive/v${NCCL}.tar.gz && tar -xzf v${NCCL}.tar.gz && \
7 |     cd nccl-${NCCL} && make src.build && make pkg.debian.build && \
8 |     apt-get -y install ./build/pkg/deb/libnccl2_*_amd64.deb ./build/pkg/deb/libnccl-dev_*_amd64.deb
9 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip3 install horovod
10 | RUN pip3 install pandas scikit-learn deepctr
11 | ADD . /openembedding
12 | RUN pip3 install /openembedding/output/dist/openembedding-*.tar.gz
13 | WORKDIR /openembedding
--------------------------------------------------------------------------------
/docker/docker_build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | if [ "${VERSION}" == "" ]; then
4 |     VERSION=0.0.0
5 | fi
6 | 
7 | function build() {
8 |     IMAGE=4pdosc/openembedding:${VERSION}-build
9 |     docker build -t ${IMAGE} -f docker/Dockerfile.build --build-arg VERSION=${VERSION} .
10 |     docker run --name dockerbuild -itd ${IMAGE} /bin/bash
11 |     rm -rf output
12 |     mkdir -p output/dist
13 |     docker cp dockerbuild:/openembedding/build/pypi/dist/openembedding-${VERSION}.tar.gz output/dist
14 |     docker stop dockerbuild
15 |     docker rm dockerbuild
16 |     docker rmi ${IMAGE}
17 | }
18 | 
19 | function image() {
20 |     IMAGE=4pdosc/openembedding:${VERSION}
21 |     docker build -t ${IMAGE} -f docker/Dockerfile .
22 | }
23 | 
24 | function image_test() {
25 |     mkdir -p tmp
26 |     IMAGE=4pdosc/openembedding:${VERSION}
27 |     echo '{' > tmp/daemon.json
28 |     echo '    "storage-driver": "vfs"' >> tmp/daemon.json
29 |     echo '}' >> tmp/daemon.json
30 | 
31 |     echo 'set -e' > tmp/test.sh
32 |     echo 'curl -fsSL https://get.docker.com | sh' >> tmp/test.sh
33 |     echo 'mkdir -p /etc/docker' >> tmp/test.sh
34 |     echo 'cp tmp/daemon.json /etc/docker' >> tmp/test.sh
35 |     echo 'service docker start' >> tmp/test.sh
36 |     echo './build.sh test' >> tmp/test.sh
37 | 
38 |     docker run --privileged --name image_test -v `pwd`/tmp:/openembedding/tmp ${IMAGE} bash tmp/test.sh
39 |     docker rm image_test
40 | }
41 | 
42 | case "$1" in
43 |     build|"")
44 |         build
45 |         ;;
46 |     image)
47 |         image
48 |         ;;
49 |     test)
50 |         image_test
51 |         ;;
52 |     *)
53 |         echo "unknown cmd"
54 |         exit 1
55 |         ;;
56 | esac
--------------------------------------------------------------------------------
/documents/cn/benchmark.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 | 
3 | ## Multi GPUs
4 | 
5 | In a single machine with multiple GPUs, compare the acceleration effects on TensorFlow of Horovod alone and of OpenEmbedding & Horovod.
6 | 
7 | | Option | Setting |
8 | | - | - |
9 | | CPU | 2 * CPU Xeon(R) Gold 5218 CPU @ 2.30GHz |
10 | | GPU | 8 * Tesla T4 |
11 | | Data | Criteo |
12 | | Data Format | TFRecord |
13 | | Model | WDL, DeepFM, XDeepFM |
14 | | Embedding Dimension | 9, 64 |
15 | | Optimizer | Adagrad |
16 | | Batch Size per GPU | 4096 |
17 | 
18 | ![benchmark](../images/benchmark.png)
19 | 
20 | As the number of GPUs increases, it is difficult to get further speedup with Horovod alone; for WDL 64 and DeepFM 64, where the sparse part accounts for a larger proportion, performance even decreases. For XDeepFM 9, since the model is computation-heavy and the sparse part is relatively small, Horovod still scales well, but as the number of GPUs grows the gap with OpenEmbedding & Horovod becomes larger and larger. XDeepFM 64 was not tested here because its computation cost is extremely high and it takes too long.
21 | 
22 | ## Remote Parameter Server
23 | 
24 | > In the previous section, OpenEmbedding & Horovod actually used the Cache Local setting of this section.
25 | 
26 | | Case | Setting |
27 | | - | - |
28 | | Local | Server runs locally |
29 | | Cache Local | Server runs locally; high-frequency Embedding parameters are treated as dense parameters and synchronized by all-reduce |
30 | | Remote 100G | Server runs remotely, connected to the workers through a 100G bit/s network |
31 | | Cache Remote 100G | Server runs remotely, connected to the workers through a 100G bit/s network; otherwise the same as Cache Local |
32 | 
33 | ![avatar](../images/benchmark-server.png)
34 | 
35 | In a 100G network, the communication between servers and workers does not affect performance significantly. In addition, the Cache cases usually get about 10% speedup.
36 | 
37 | ## Big Data
38 | 
39 | OpenEmbedding is able to handle very large-scale data. The sparse features in large-scale data are sometimes difficult to de-duplicate and re-number; in OpenEmbedding they can be hashed into the non-negative integer range of int64, and the servers will store the parameters in a hash table.
40 | 
41 | The performance test results on the 1TB Criteo data set are as follows.
42 | 
43 | | | |
44 | | - | - |
45 | | Model | DeepFM 9 |
46 | | Optimizer | Adagrad |
47 | | Case | Remote |
48 | | Data | Criteo1T |
49 | | Data Format | TSV |
50 | | Instances per Epoch | 3.3 G |
51 | | Training Speed | 692 kips |
52 | | Time per Epoch | 4763 s |
53 | | Checkpoint Time | 869 s |
54 | | Server Peak Memory | 1 * 175 GB |
55 | | Worker Peak Memory | 8 * 1.6 GB |
56 | | Checkpoint Size | 78 GB |
57 | | SavedModel Size | 45 GB |
58 | 
59 | # Run Steps
60 | 
61 | ## Multi GPUs
62 | 
63 | 1. Copy out test/benchmark and example/criteo_preprocess.py.
64 | 2. Download and decompress the Criteo data set to get `train.txt`, about 11 GB.
65 | 3. Preprocess: `python3 criteo_preprocess.py train.txt train.csv`.
66 | 4. Convert to the TFRecord format: `mkdir -p tfrecord && python3 criteo_tfrecord.py train.csv 0 1`. If the conversion is slow, this step can be run in parallel; see `criteo_tfrecord.py` and the sketch below.
67 | 5. Run the Horovod case on 2 GPUs: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord`.
68 | 6. Run the OpenEmbedding & Horovod case on 2 GPUs: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord --server`.
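
Step 4 can be parallelized when the conversion is slow. The sketch below is one way to do it; it assumes (an assumption, not confirmed here) that the two trailing arguments of `criteo_tfrecord.py` are the shard index and the shard count, as the single-shard call `criteo_tfrecord.py train.csv 0 1` suggests. Check `criteo_tfrecord.py` for the actual convention.

```python
# Hypothetical parallel driver for step 4: convert the CSV into TFRecord
# shards with one process per shard. Adjust SHARDS to the number of cores.
import subprocess

SHARDS = 8
procs = [subprocess.Popen(['python3', 'criteo_tfrecord.py', 'train.csv',
                           str(i), str(SHARDS)])
         for i in range(SHARDS)]
for p in procs:
    assert p.wait() == 0  # fail loudly if any shard conversion failed
```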
69 | 
70 | ## Remote Parameter Server
71 | 
72 | The IPs of the two machines are ip1 and ip2 respectively.
73 | 1. Start the server: `python3 server.py ip2:34567`.
74 | 2. Start the workers: `python3 criteo_deepctr.py --data tfrecord --server --cache --master_endpoint ip2:34567 --bind_ip ip1`.
75 | 
76 | ## Big Data
77 | 
78 | 1. Download and decompress the Criteo 1TB data into the criteo1T folder; the file paths should match criteo1T/day_*.
79 | 2. On another machine ip1, run `python3 server.py ip1:34567`.
80 | 3. Run `horovodrun -np 8 python3 criteo_deepctr.py --data criteo1T --server --master_endpoint ip1:34567`.
81 | 
82 | You can use `--checkpoint`, `--save` and other options to specify where to save the model. Note that all paths, including `--data`, should be shared paths; a distributed file system can be mounted to share paths between machines.
--------------------------------------------------------------------------------
/documents/cn/serving.md:
--------------------------------------------------------------------------------
1 | # Serving
2 | 
3 | ## Stand-alone Model
4 | 
5 | ![standalone](../images/standalone.drawio.png)
6 | 
7 | You can save a distributed model as a stand-alone SavedModel with `save_as_original_model`. The SavedModel contains the forward computation graph and all parameters, including the Embedding parameters, and can be loaded directly by TensorFlow Serving. This SavedModel cannot be used for training because it does not store the `Optimizer` states.
8 | 
9 | ## Distributed Model
10 | 
11 | ![serving](../images/serving.drawio.png)
12 | 
13 | A distributed model can only be loaded by a TensorFlow Serving build that includes the OpenEmbedding Operator. The startup process is as follows:
14 | 1. Start the parameter server cluster, including the ZooKeeper Master, the Servers, and the Controller.
15 | 2. Load the EmbeddingModel onto the parameter servers through the Controller.
16 | 3. Start TensorFlow Serving, load the SavedModel, and connect to the ZooKeeper Master of the parameter servers.
17 | 
18 | A UUID is stored in the SavedModel to maintain the correspondence between the SavedModel and the EmbeddingModel. If the corresponding EmbeddingModel is not found on the parameter servers, TensorFlow Serving returns "not found model" without raising other exceptions.
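
As a sanity check for step 3, you can ask TensorFlow Serving whether the SavedModel itself is loaded before sending traffic. The sketch below uses the standard TensorFlow Serving REST model-status endpoint; the host, port, and model name (`criteo`, as in the examples) are illustrative.

```python
# Query the model status endpoint of a running TensorFlow Serving instance.
import json
import urllib.request

with urllib.request.urlopen('http://127.0.0.1:8501/v1/models/criteo') as r:
    print(json.dumps(json.load(r), indent=2))
# A version state of "AVAILABLE" only means the SavedModel is loaded; whether
# the matching EmbeddingModel is present on the parameter servers is checked
# at prediction time, where a mismatch returns "not found model".
```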
--------------------------------------------------------------------------------
/documents/cn/training.md:
--------------------------------------------------------------------------------
1 | # Training
2 | 
3 | ## Data Parallel and Model Parallel
4 | 
5 | ![training](../images/training.drawio.png)
6 | 
7 | The parallel scheme of training is shown in the figure above. The dense part and the high-frequency `Embedding` parameters are replicated on every worker and synchronized by all-reduce, which implements data parallelism. The low-frequency `Embedding` parameters are divided into shards and stored on the servers, which implements model parallelism. Following the synchronous parameter server architecture, workers pull parameters from the servers and push gradients to them; after a server has collected the gradients of all workers for a mini-batch, it updates the parameters with the `Optimizer`.
--------------------------------------------------------------------------------
/documents/en/benchmark.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 | 
3 | ## Multi GPUs
4 | 
5 | Compare the acceleration effects on TensorFlow of Horovod alone and of OpenEmbedding & Horovod, in a single machine with multiple GPUs.
6 | 
7 | | Option | Setting |
8 | | - | - |
9 | | CPU | 2 * CPU Xeon(R) Gold 5218 CPU @ 2.30GHz |
10 | | GPU | 8 * Tesla T4 |
11 | | Data | Criteo |
12 | | Data Format | TFRecord |
13 | | Model | WDL, DeepFM, XDeepFM |
14 | | Embedding Dimension | 9, 64 |
15 | | Optimizer | Adagrad |
16 | | Batch Size per GPU | 4096 |
17 | 
18 | ![benchmark](../images/benchmark.png)
19 | 
20 | With the increase in the number of GPUs, it is difficult to speed up using the all-reduce-based framework Horovod alone; for WDL 64 and DeepFM 64, where the sparse part accounts for a larger proportion, the performance even decreases. For XDeepFM 9, Horovod can still get good acceleration due to the large amount of model computation and the relatively small proportion of the sparse part. However, when the number of GPUs increases, the gap with OpenEmbedding & Horovod becomes larger and larger. Since XDeepFM 64 has a huge amount of computation and takes too long, it was not tested here.
21 | 
22 | ## Remote Parameter Server
23 | 
24 | > In the previous section, OpenEmbedding & Horovod actually used the Cache Local setting of this section.
25 | 
26 | | Case | Setting |
27 | | - | - |
28 | | Local | Local server |
29 | | Cache Local | Local server; high-frequency `Embedding` parameters are updated as dense parameters and synchronized by the all-reduce operator |
30 | | Remote 100G | Remote server, connected to the workers through a 100G bit/s network |
31 | | Cache Remote 100G | Remote server, connected to the workers through a 100G bit/s network; `Embedding` handled the same as Cache Local |
32 | 
33 | ![avatar](../images/benchmark-server.png)
34 | 
35 | As shown in the figure, in a 100G network the communication between server and worker does not affect the performance significantly. In addition, the `Cache` test cases can usually get about 10% speedup.
36 | 
37 | ## Big Data
38 | 
39 | OpenEmbedding has the ability to handle large-scale data. The sparse features in large-scale data are sometimes difficult to de-duplicate and re-number. In OpenEmbedding they can be hashed to the non-negative integer range of int64, and the parameter servers will use a hash table to store the parameters.
40 | 
41 | The performance test results of the 1TB Criteo data set are as follows.
42 | 
43 | | | |
44 | | - | - |
45 | | Model | DeepFM 9 |
46 | | Optimizer | Adagrad |
47 | | Setting | Remote |
48 | | Data | Criteo1T |
49 | | Data Format | TSV |
50 | | Instances per Epoch | 3.3 G |
51 | | Training Speed | 692 kips |
52 | | Time per Epoch | 4763 s |
53 | | Checkpoint Time | 869 s |
54 | | Server Memory | 1 * 175 GB |
55 | | Worker Memory | 8 * 1.6 GB |
56 | | Checkpoint Size | 78 GB |
57 | | SavedModel Size | 45 GB |
58 | 
59 | # Run Steps
60 | 
61 | ## Multi GPUs
62 | 
63 | 1. Copy out test/benchmark and example/criteo_preprocess.py.
64 | 2. Download and decompress the Criteo data to get `train.txt`, about 11 GB.
65 | 3. Preprocess: `python3 criteo_preprocess.py train.txt train.csv`.
66 | 4. Transform the data to the TFRecord format: `mkdir -p tfrecord && python3 criteo_tfrecord.py train.csv 0 1`.
67 | 5. Run the benchmark case for Horovod: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord`.
68 | 6. Run the benchmark case for OpenEmbedding & Horovod: `horovodrun -np 2 python3 criteo_deepctr.py --data tfrecord --server`.
69 | 
70 | ## Remote Parameter Server
71 | 
72 | For two machines whose IPs are ip1 and ip2 respectively:
73 | 1. Run the server: `python3 server.py ip2:34567`.
74 | 2. Run the workers: `python3 criteo_deepctr.py --data tfrecord --server --cache --master_endpoint ip2:34567 --bind_ip ip1`.
75 | 
76 | ## Big Data
77 | 
78 | 1. Download and decompress the Criteo 1TB data to the `criteo1T` folder; the file paths should match criteo1T/day_*.
79 | 2. Run the server: `python3 server.py ip1:34567`.
80 | 3. Run the workers: `horovodrun -np 8 python3 criteo_deepctr.py --data criteo1T --server --master_endpoint ip1:34567`.
81 | 
82 | You can use `--checkpoint`, `--save` and other parameters to specify the model save path. Note that all paths, including `--data`, should be shared; a distributed file system can be mounted between the machines to share the paths.
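
The hash encoding mentioned in the Big Data section is what examples/criteo_deepctr_hook.py does to its categorical columns; a minimal sketch of the same idea:

```python
# Map raw categorical values into the non-negative int64 range so that the
# server-side hash table can be used instead of a densely re-numbered
# vocabulary (input_dim=-1). Mirrors examples/criteo_deepctr_hook.py.
import pandas

data = pandas.read_csv('train.csv')
for name in data.columns:
    if name[0] == 'C':  # categorical columns C1 .. C26
        # A per-column offset keeps equal raw values in different columns
        # from colliding; the modulus keeps keys inside [0, 2**63).
        data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63)
```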
--------------------------------------------------------------------------------
/documents/en/serving.md:
--------------------------------------------------------------------------------
1 | # Serving
2 | 
3 | ## Stand-alone Model
4 | 
5 | ![standalone](../images/standalone.drawio.png)
6 | 
7 | You can save the distributed model as a stand-alone SavedModel by `save_as_original_model`. The SavedModel contains the forward computation graph and all parameters including `Embedding`, and can be loaded directly by TensorFlow Serving. This SavedModel cannot be used for training because it does not store the `Optimizer` states.
8 | 
9 | ## Distributed Model
10 | 
11 | ![serving](../images/serving.drawio.png)
12 | 
13 | The distributed model needs to be loaded with a TensorFlow Serving build that includes the OpenEmbedding Operator. The startup process is as follows:
14 | 1. Start the parameter server cluster, including the ZooKeeper Master, the Servers, and the Controller.
15 | 2. Load the EmbeddingModel onto the parameter server cluster through the Controller.
16 | 3. Start TensorFlow Serving, load the SavedModel, and connect to the ZooKeeper Master of the parameter servers.
17 | 
18 | A UUID is stored in the SavedModel to maintain the correspondence with the EmbeddingModel. If the corresponding EmbeddingModel is not found on the parameter servers, TensorFlow Serving will return "not found model" without causing other exceptions.
--------------------------------------------------------------------------------
/documents/en/train.md:
--------------------------------------------------------------------------------
1 | # Start
2 | 
3 | ## Parameter Server in Process
4 | ```python
5 | import openembedding.tensorflow as embed
6 | ```
7 | 
8 | ## Remote Parameter Server
9 | 
10 | ### Master
11 | ```python
12 | import time
13 | import openembedding as embed
14 | master = embed.Master()
15 | time.sleep(10) # Wait
16 | ```
17 | 
18 | ### Parameter Server
19 | ```python
20 | import openembedding as embed
21 | embed.flags.master_endpoint = '{ip}:{port}'
22 | _server = embed.Server()
23 | _server.join()
24 | ```
25 | 
26 | ### Worker
27 | ```python
28 | import openembedding.tensorflow as embed
29 | embed.flags.master_endpoint = '{ip}:{port}'
30 | embed.flags.wait_num_servers = num_servers
31 | ```
--------------------------------------------------------------------------------
/documents/en/training.md:
--------------------------------------------------------------------------------
1 | # Training
2 | 
3 | ## Model parallel and data parallel
4 | 
5 | ![training](../images/training.drawio.png)
6 | 
7 | The parallel mode of training is shown in the figure above. The dense part and the high-frequency `Embedding` parameters are stored in mirrored mode on all workers and synchronized by the all-reduce operator, which implements data parallelism. The low-frequency `Embedding` parameters are divided into shards and stored on the servers, which implements model parallelism. Based on the synchronous training mode of the parameter server architecture, workers pull parameters from the servers and push gradients to the servers; each server collects the gradients of all workers in a mini-batch and then updates the parameters with the `Optimizer`.
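
A toy, self-contained illustration of this scheme: the dense parameter is replicated and updated from the combined per-worker gradients (standing in for all-reduce), while sparse embedding rows live in a server-side table that workers pull from and whose gradients the server applies only after collecting the whole mini-batch. Real training goes through `embed.Embedding` and `embed.distributed_optimizer`; every name below is illustrative only.

```python
import numpy as np

DIM, LR = 4, 0.1
table = {}                               # server-side embedding shard
dense_w = np.zeros(DIM)                  # dense parameter, replicated

def pull(keys):
    # workers pull the current embedding rows for their mini-batch
    return {k: table.setdefault(k, np.zeros(DIM)).copy() for k in keys}

batches = [[1, 5], [5, 7]]               # sparse keys seen by worker 0 and 1
sparse_grads, dense_grads = {}, []
for keys in batches:                     # each loop body is one "worker"
    rows = pull(keys)                    # model parallel: pull from server
    for k in rows:                       # np.ones stands in for backprop
        sparse_grads[k] = sparse_grads.get(k, 0) + np.ones(DIM)
    dense_grads.append(np.ones(DIM))

for k, g in sparse_grads.items():        # server updates only after collecting
    table[k] -= LR * g                   # all pushed gradients (plain SGD)
dense_w -= LR * sum(dense_grads)         # data parallel: all-reduce then update
```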
--------------------------------------------------------------------------------
/documents/images/benchmark-server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/benchmark-server.png
--------------------------------------------------------------------------------
/documents/images/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/benchmark.png
--------------------------------------------------------------------------------
/documents/images/pmem_vs_dram_oe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/pmem_vs_dram_oe.png
--------------------------------------------------------------------------------
/documents/images/serving.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/serving.drawio.png
--------------------------------------------------------------------------------
/documents/images/standalone.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/standalone.drawio.png
--------------------------------------------------------------------------------
/documents/images/training.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/images/training.drawio.png
--------------------------------------------------------------------------------
/documents/papers/openembedding_icde2023.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/4paradigm/OpenEmbedding/1e540f5c0e458ac51193f2008c07894100a71bdd/documents/papers/openembedding_icde2023.pdf
--------------------------------------------------------------------------------
/examples/criteo_deepctr_hook.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import deepctr.models
5 | import deepctr.feature_column
6 | import horovod.tensorflow.keras as hvd
7 | import openembedding.tensorflow as embed
8 | print('OpenEmbedding', embed.__version__)
9 | 
10 | 
11 | import argparse
12 | parser = argparse.ArgumentParser()
13 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
14 | parser.add_argument('--data', default=default_data)
15 | parser.add_argument('--optimizer', default='Adam')
16 | parser.add_argument('--model', default='DeepFM')
17 | parser.add_argument('--checkpoint', default='') # include optimizer
18 | parser.add_argument('--load', default='') # include optimizer
19 | parser.add_argument('--save', default='') # not include optimizer
20 | 
21 | parser.add_argument('--batch_size', default=8, type=int)
22 | # Because the example uses a hash table to store the data,
23 | # it does not support exporting to a tensorflow original model.
24 | # parser.add_argument('--export', default='') # not include optimizer
25 | args = parser.parse_args()
26 | if not args.optimizer.endswith(')'):
27 |     args.optimizer += '()' # auto call args.optimizer
28 | 
29 | 
30 | # Hook deepctr.inputs.Embedding.
31 | class HookEmbedding(embed.Embedding):
32 |     def __init__(self, input_dim=-1, output_dim=9,
33 |             embeddings_initializer=None, embeddings_regularizer=None, **kwargs):
34 |         # input_dim = -1 means that the input range is the natural number range of int64 [0, 2**63-1].
35 |         # If input_dim = -1, the server will use a hash table to store the Embedding layer;
36 |         # the server does not support embeddings_regularizer.
37 |         # You can specify the number of global shards by num_shards;
38 |         # num_shards is equal to the number of servers by default.
39 |         super(HookEmbedding, self).__init__(input_dim, output_dim,
40 |             embeddings_initializer=embeddings_initializer,
41 |             activity_regularizer=embeddings_regularizer,
42 |             num_shards=1,
43 |             **kwargs)
44 | import deepctr.inputs
45 | deepctr.inputs.Embedding = HookEmbedding
46 | 
47 | 
48 | # Assign GPU according to rank.
49 | hvd.init()
50 | gpus = tf.config.experimental.list_physical_devices('GPU')
51 | if gpus:
52 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
53 |     tf.config.experimental.set_memory_growth(gpus[hvd.local_rank()], True)
54 | 
55 | 
56 | # Process data.
57 | data = pandas.read_csv(args.data)
58 | n = data.shape[0] // hvd.size() * hvd.size()
59 | data = data.iloc[hvd.rank():n:hvd.size()]
60 | inputs = dict()
61 | feature_columns = list()
62 | for name in data.columns:
63 |     if name[0] == 'C':
64 |         inputs[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) # hash encoding
65 |         feature_columns.append(deepctr.feature_column.SparseFeat(name,
66 |             vocabulary_size=-1, embedding_dim=9, dtype='int64'))
67 |     elif name[0] == 'I':
68 |         inputs[name] = data[name]
69 |         feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32'))
70 | 
71 | 
72 | # Compile distributed model.
73 | optimizer = eval("tf.keras.optimizers." + args.optimizer)
74 | optimizer = embed.distributed_optimizer(optimizer)
75 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum)
76 | model = eval("deepctr.models."
+ args.model)(feature_columns, feature_columns, task='binary') 77 | model = embed.distributed_model(model) 78 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 79 | 80 | 81 | # load --> fit --> save 82 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 83 | hvd.callbacks.MetricAverageCallback() ] 84 | if args.checkpoint and hvd.rank() == 0: 85 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 86 | if args.load: 87 | model.load_weights(args.load) 88 | 89 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2) 90 | 91 | if args.save and hvd.rank() == 0: 92 | model.save(args.save, include_optimizer=False) 93 | -------------------------------------------------------------------------------- /examples/criteo_deepctr_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import deepctr.models 5 | import deepctr.feature_column 6 | import horovod.tensorflow.keras as hvd 7 | import openembedding.tensorflow as embed 8 | print('OpenEmbedding', embed.__version__) 9 | 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser() 13 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 14 | parser.add_argument('--data', default=default_data) 15 | parser.add_argument('--batch_size', default=8, type=int) 16 | parser.add_argument('--optimizer', default='Adam') 17 | parser.add_argument('--model', default='DeepFM') 18 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer 19 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer 20 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer 21 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer 22 | args = parser.parse_args() 23 | if not args.optimizer.endswith(')'): 24 | args.optimizer += '()' # auto call args.optimizer 25 | 26 | 27 | # Assign GPU according to rank. 28 | hvd.init() 29 | gpus = tf.config.experimental.list_physical_devices('GPU') 30 | if gpus: 31 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') 32 | tf.config.experimental.set_memory_growth(gpus[hvd.local_rank()], True) 33 | 34 | 35 | # Process data. 36 | data = pandas.read_csv(args.data) 37 | n = data.shape[0] // hvd.size() * hvd.size() 38 | data = data.iloc[hvd.rank():n:hvd.size()] 39 | inputs = dict() 40 | feature_columns = list() 41 | for name in data.columns: 42 | if name[0] == 'C': 43 | inputs[name] = data[name] % 65536 44 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 45 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 46 | elif name[0] == 'I': 47 | inputs[name] = data[name] 48 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 49 | 50 | 51 | # Compile distributed model. 52 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 53 | optimizer = embed.distributed_optimizer(optimizer) 54 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 55 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary') 56 | model = embed.distributed_model(model) 57 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 58 | 59 | 60 | # load --> fit --> save 61 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 62 | hvd.callbacks.MetricAverageCallback() ] 63 | if args.checkpoint and hvd.rank() == 0: 64 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 65 | if args.load: 66 | model.load_weights(args.load) 67 | 68 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2) 69 | 70 | if args.save and hvd.rank() == 0: 71 | model.save(args.save, include_optimizer=False) 72 | if args.export and hvd.rank() == 0: 73 | model.save_as_original_model(args.export, include_optimizer=False) 74 | -------------------------------------------------------------------------------- /examples/criteo_deepctr_network_mirrored.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import deepctr.models 5 | import deepctr.feature_column 6 | import openembedding.tensorflow as embed 7 | print('OpenEmbedding', embed.__version__) 8 | 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 13 | parser.add_argument('--data', default=default_data) 14 | parser.add_argument('--batch_size', default=8, type=int) 15 | # Currently, MirroredStrategy does not support this. 16 | # parser.add_argument('--prefetch', action='store_true') 17 | parser.add_argument('--optimizer', default='Adam') 18 | parser.add_argument('--model', default='DeepFM') 19 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer 20 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer 21 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer 22 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer 23 | args = parser.parse_args() 24 | if not args.optimizer.endswith(')'): 25 | args.optimizer += '()' # auto call args.optimizer 26 | 27 | # Process data 28 | data = pandas.read_csv(args.data) 29 | data = data.iloc[:data.shape[0] // args.batch_size * args.batch_size] 30 | inputs = dict() 31 | feature_columns = list() 32 | for name in data.columns: 33 | inputs[name] = tf.reshape(data[name], [-1, args.batch_size, 1]) 34 | if name[0] == 'C': 35 | inputs[name] = tf.cast(inputs[name] % 65536, dtype=tf.int64) 36 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 37 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 38 | elif name[0] == 'I': 39 | inputs[name] = tf.cast(inputs[name], dtype=tf.float32) 40 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 41 | train_batch_target = tf.reshape(data['label'], [-1, args.batch_size]) 42 | dataset = tf.data.Dataset.from_tensor_slices((inputs, train_batch_target)) 43 | 44 | 45 | # Compile distributed model 46 | strategy = tf.distribute.MirroredStrategy() 47 | with strategy.scope(): 48 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 49 | optimizer = embed.distributed_optimizer(optimizer) 50 | 51 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary')
52 |     model = embed.distributed_model(model, sparse_as_dense_size=args.batch_size)
53 |     model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
54 | 
55 | 
56 | # load --> fit --> save
57 | callbacks = list()
58 | if args.checkpoint:
59 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
60 | if args.load:
61 |     model.load_weights(args.load)
62 | 
63 | 
64 | # Currently, MirroredStrategy does not support this.
65 | # if args.prefetch:
66 | #     dataset = embed.pulling(dataset, model).prefetch(4)
67 | model.fit(dataset, batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2)
68 | 
69 | if args.save:
70 |     model.save(args.save, include_optimizer=False)
71 | if args.export:
72 |     model.save_as_original_model(args.export, include_optimizer=False)
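
# Note (added comment, not part of the original script): unlike the horovodrun
# examples, tf.distribute.MirroredStrategy runs in a single process and uses
# all GPUs visible to it, so this script is launched directly, e.g.:
#     python3 examples/criteo_deepctr_network_mirrored.py --export tmp/criteo/1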
--------------------------------------------------------------------------------
/examples/criteo_deepctr_network_mpi.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import deepctr.models
5 | import deepctr.feature_column
6 | import openembedding.tensorflow as embed
7 | print('OpenEmbedding', embed.__version__)
8 | 
9 | 
10 | import argparse
11 | parser = argparse.ArgumentParser()
12 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
13 | parser.add_argument('--data', default=default_data)
14 | parser.add_argument('--batch_size', default=8, type=int)
15 | # Currently, MultiWorkerMirroredStrategy does not support this.
16 | # parser.add_argument('--prefetch', action='store_true')
17 | parser.add_argument('--optimizer', default='Adam')
18 | parser.add_argument('--model', default='DeepFM')
19 | parser.add_argument('--checkpoint', default='', help='checkpoint save path') # include optimizer
20 | parser.add_argument('--load', default='', help='checkpoint path to restore') # include optimizer
21 | parser.add_argument('--save', default='', help='distributed serving model save path') # not include optimizer
22 | parser.add_argument('--export', default='', help='standalone serving model save path') # not include optimizer
23 | parser.add_argument('--port', default=50000, type=int)
24 | args = parser.parse_args()
25 | if not args.optimizer.endswith(')'):
26 |     args.optimizer += '()' # auto call args.optimizer
27 | 
28 | 
29 | gpus = tf.config.experimental.list_physical_devices('GPU')
30 | if gpus:
31 |     for gpu in gpus:
32 |         tf.config.experimental.set_memory_growth(gpu, True)
33 | 
34 | 
35 | # Synchronizing distributed configurations using MPI.
36 | import json
37 | import socket
38 | from mpi4py import MPI
39 | comm_rank = MPI.COMM_WORLD.Get_rank()
40 | comm_size = MPI.COMM_WORLD.Get_size()
41 | ip = str(socket.gethostbyname(socket.gethostname()))
42 | ip_port = ip + ':' + str(args.port + comm_rank)
43 | os.environ['TF_CONFIG'] = json.dumps({
44 |     'cluster': { 'worker': MPI.COMM_WORLD.allgather(ip_port) },
45 |     'task': { 'type': 'worker', 'index': comm_rank }
46 | })
47 | strategy = tf.distribute.MultiWorkerMirroredStrategy()
48 | 
49 | 
50 | # Process data.
51 | data = pandas.read_csv(args.data)
52 | data = data.iloc[:data.shape[0] // args.batch_size * args.batch_size]
53 | inputs = dict()
54 | feature_columns = list()
55 | for name in data.columns:
56 |     inputs[name] = tf.reshape(data[name], [-1, args.batch_size, 1])
57 |     if name[0] == 'C':
58 |         inputs[name] = tf.cast(inputs[name] % 65536, dtype=tf.int64)
59 |         feature_columns.append(deepctr.feature_column.SparseFeat(name,
60 |             vocabulary_size=65536, embedding_dim=9, dtype='int64'))
61 |     elif name[0] == 'I':
62 |         inputs[name] = tf.cast(inputs[name], dtype=tf.float32)
63 |         feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32'))
64 | train_batch_target = tf.reshape(data['label'], [-1, args.batch_size])
65 | dataset = tf.data.Dataset.from_tensor_slices((inputs, train_batch_target))
66 | 
67 | 
68 | # Compile distributed model.
69 | with strategy.scope():
70 |     optimizer = eval("tf.keras.optimizers." + args.optimizer)
71 |     optimizer = embed.distributed_optimizer(optimizer)
72 | 
73 |     model = eval("deepctr.models." + args.model)(feature_columns, feature_columns, task='binary')
74 |     model = embed.distributed_model(model, sparse_as_dense_size=args.batch_size)
75 |     model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
76 | 
77 | 
78 | # load --> fit --> save
79 | callbacks = list()
80 | if args.checkpoint:
81 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
82 | if args.load:
83 |     model.load_weights(args.load)
84 | 
85 | 
86 | # Currently, MultiWorkerMirroredStrategy does not support this.
87 | # if args.prefetch:
88 | #     dataset = embed.pulling(dataset, model).prefetch(4)
89 | model.fit(dataset, batch_size=args.batch_size, epochs=5, callbacks=callbacks, verbose=2)
90 | 
91 | if args.save:
92 |     model.save(args.save, include_optimizer=False)
93 | if args.export:
94 |     model.save_as_original_model(args.export, include_optimizer=False)
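
# For reference (added comment, illustrative addresses): with two MPI processes
# on hosts 10.0.0.1 and 10.0.0.2 and the default --port 50000, the TF_CONFIG
# assembled above becomes, on rank 0,
#     {"cluster": {"worker": ["10.0.0.1:50000", "10.0.0.2:50001"]},
#      "task": {"type": "worker", "index": 0}}
# and the same cluster with index 1 on rank 1. Each rank offsets the port by
# its rank so that several workers can share one host.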
--------------------------------------------------------------------------------
/examples/criteo_lr_subclass.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas
3 | import tensorflow as tf
4 | import openembedding.tensorflow as embed
5 | print('OpenEmbedding', embed.__version__)
6 | 
7 | 
8 | class CriteoLR(tf.keras.Model):
9 |     def __init__(self):
10 |         super(CriteoLR, self).__init__()
11 |         # input_dim = -1 means that the input range is the natural number range of int64 [0, 2**63-1].
12 |         # If input_dim = -1, the server will use a hash table to store the Embedding layer.
13 |         self.embeddings = embed.Embedding(input_dim=-1, output_dim=1,
14 |             embeddings_initializer=tf.keras.initializers.Zeros(), num_shards=16)
15 |         self.concatenate = tf.keras.layers.Concatenate()
16 |         self.sigmoid = tf.keras.layers.Dense(1, activation='sigmoid')
17 | 
18 |     def call(self, inputs):
19 |         fields = []
20 |         for name, tensor in inputs.items():
21 |             if name[0] == 'C':
22 |                 fields.append(self.embeddings(tensor))
23 |             else:
24 |                 fields.append(tf.reshape(tensor, [-1, 1, 1]))
25 |         return self.sigmoid(self.concatenate(fields))
26 | 
27 | 
28 | import argparse
29 | parser = argparse.ArgumentParser()
30 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv'
31 | parser.add_argument('--data', default=default_data)
32 | parser.add_argument('--checkpoint', default='') # include optimizer
33 | parser.add_argument('--load', default='') # include optimizer
34 | parser.add_argument('--save', default='') # not include optimizer
35 | # subclass models do not support exporting to a tensorflow original model
36 | # parser.add_argument('--export', default='') # not include optimizer
37 | args = parser.parse_args()
38 | 
39 | 
40 | # Process data
41 | data = pandas.read_csv(args.data)
42 | inputs = dict()
43 | for name in data.columns:
44 |     if name[0] == 'C':
45 |         inputs[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) # hash encoding
46 |     elif name[0] == 'I':
47 |         inputs[name] = data[name]
48 | 
49 | 
50 | # Compile distributed model
51 | optimizer = tf.keras.optimizers.Adam()
52 | optimizer = embed.distributed_optimizer(optimizer)
53 | model = CriteoLR()
54 | model = embed.distributed_model(model)
55 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False)
56 | 
57 | 
58 | # load --> fit --> save
59 | callbacks = list()
60 | if args.checkpoint:
61 |     callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}'))
62 | if args.load:
63 |     model.load_weights(args.load)
64 | 
65 | model.fit(inputs, data['label'], batch_size=8, epochs=5, callbacks=callbacks, verbose=2)
66 | if args.save:
67 |     model.save(args.save, include_optimizer=False)
--------------------------------------------------------------------------------
/examples/criteo_preprocess.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import pandas
3 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler
4 | 
5 | if len(sys.argv) < 3:
6 |     print("usage: criteo_preprocess.py input_file output_file")
7 |     sys.exit(1)
8 | 
9 | data = pandas.read_csv(sys.argv[1], sep='\t', header=None)
10 | target = ['label']
11 | dense_features = ['I' + str(i) for i in range(1, 14)]
12 | sparse_features = ['C' + str(i) for i in range(1, 27)]
13 | data.columns = target + dense_features + sparse_features
14 | 
15 | data[sparse_features] = data[sparse_features].fillna('-1', )
16 | data[dense_features] = data[dense_features].fillna(0, )
17 | 
18 | for feat in dense_features:
19 |     print(feat, data[feat].min(), data[feat].max())
20 | mms = MinMaxScaler(feature_range=(0, 1))
21 | data[dense_features] = mms.fit_transform(data[dense_features])
22 | for feat in sparse_features:
23 |     lbe = LabelEncoder()
24 |     data[feat] = lbe.fit_transform(data[feat])
25 | 
26 | data.to_csv(sys.argv[2], float_format='%.6f')
--------------------------------------------------------------------------------
/examples/run/criteo_deepctr_checkpoint.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --checkpoint tmp/epoch 4 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --load tmp/epoch4/variables/variables 5 | -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_horovod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | horovodrun -np 2 python3 examples/criteo_deepctr_network.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_mirrored.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/criteo_deepctr_network_mirrored.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_mpi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | mpirun -np 2 python3 examples/criteo_deepctr_network_mpi.py --export tmp/criteo/1 -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_restful.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/tensorflow_serving_restful.py 4 | python3 examples/tensorflow_serving_restful.py --rows 1 5 | -------------------------------------------------------------------------------- /examples/run/criteo_deepctr_standalone.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | python3 examples/criteo_deepctr_network.py --export tmp/criteo/1 4 | -------------------------------------------------------------------------------- /examples/run/criteo_preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | wget https://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz -O tmp/dac_sample.tar.gz 4 | tar -xzf tmp/dac_sample.tar.gz -C tmp 5 | python3 examples/criteo_preprocess.py tmp/dac_sample.txt tmp/dac_sample.csv 6 | python3 examples/criteo_deepctr_hook.py --data tmp/dac_sample.csv --batch_size 256 7 | -------------------------------------------------------------------------------- /examples/tensorflow_serving_client.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import grpc 4 | import numpy 5 | import pandas 6 | import threading 7 | import tensorflow as tf 8 | 9 | 10 | from tensorflow_serving.apis import predict_pb2 11 | from tensorflow_serving.apis import prediction_service_pb2_grpc 12 | 13 | 14 | class _ResultCounter(object): 15 | def __init__(self, num_tests, concurrency): 16 | self._num_tests = num_tests 17 | self._concurrency = concurrency 18 | self._error = 0 19 | self._done = 0 20 | self._active = 0 21 | self._condition = threading.Condition() 22 | 23 | def inc_error(self): 24 | with self._condition: 25 | self._error += 1 26 | 27 | def inc_done(self): 28 | with self._condition: 29 | self._done += 1 30 | self._condition.notify() 31 | 32 | def dec_active(self): 33 | with self._condition: 34 | self._active -= 1 35 | self._condition.notify() 36 | 37 | def get_error_rate(self): 38 | with self._condition: 39 | while self._done != 
self._num_tests: 40 | self._condition.wait() 41 | return self._error / float(self._num_tests) 42 | 43 | def throttle(self): 44 | with self._condition: 45 | while self._active == self._concurrency: 46 | self._condition.wait() 47 | self._active += 1 48 | 49 | 50 | def _create_rpc_callback(label, result_counter): 51 | def _callback(result_future): 52 | exception = result_future.exception() 53 | if exception: 54 | result_counter.inc_error() 55 | print(exception) 56 | else: 57 | predict = numpy.array(result_future.result().outputs['prediction_layer'].float_val) 58 | print('label = ', label, ', predict = ', int(predict[0] + 0.5), ' ' ,predict[0]) 59 | if label != int(predict[0] + 0.5): 60 | result_counter.inc_error() 61 | result_counter.inc_done() 62 | result_counter.dec_active() 63 | return _callback 64 | 65 | 66 | import argparse 67 | parser = argparse.ArgumentParser() 68 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 69 | parser.add_argument('--data', default=default_data) 70 | parser.add_argument('--hash', action='store_true') 71 | parser.add_argument('--grpc', default='127.0.0.1:8500') 72 | parser.add_argument('--model', default='criteo') 73 | args = parser.parse_args() 74 | 75 | 76 | # process data 77 | data = pandas.read_csv(args.data) 78 | feature_names = list() 79 | for name in data.columns: 80 | if name[0] == 'C': 81 | if args.hash: 82 | data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) 83 | else: 84 | data[name] = data[name] % 65536 85 | feature_names.append(name) 86 | elif name[0] == 'I': 87 | feature_names.append(name) 88 | 89 | 90 | # use TensorFlow Serving 91 | channel = grpc.insecure_channel(args.grpc) 92 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 93 | result_counter = _ResultCounter(data.shape[0], 4) 94 | for i in range(data.shape[0]): 95 | request = predict_pb2.PredictRequest() 96 | request.model_spec.name = args.model 97 | for name in feature_names: 98 | dtype = tf.float32 99 | if name.startswith('C'): 100 | dtype = tf.int64 101 | request.inputs[name].CopyFrom(tf.make_tensor_proto(data[name][i], dtype=dtype, shape=[1, 1])) 102 | result_counter.throttle() 103 | result_future = stub.Predict.future(request, 5.0) # 5 seconds 104 | result_future.add_done_callback( 105 | _create_rpc_callback(data['label'][i], result_counter)) 106 | print('error rate: ', result_counter.get_error_rate()) 107 | -------------------------------------------------------------------------------- /examples/tensorflow_serving_restful.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 7 | parser.add_argument('--data', default=default_data) 8 | parser.add_argument('--rows', type=int, default=None) 9 | parser.add_argument('--hash', action='store_true') 10 | parser.add_argument('--host', default='127.0.0.1:8501') 11 | parser.add_argument('--model', default='criteo') 12 | args = parser.parse_args() 13 | 14 | 15 | # process data 16 | data = pandas.read_csv(args.data, nrows=args.rows) 17 | feature_names = list() 18 | for name in data.columns: 19 | if name[0] == 'C': 20 | if args.hash: 21 | data[name] = (data[name] + int(name[1:]) * 1000000007) % (2**63) 22 | else: 23 | data[name] = data[name] % 65536 24 | feature_names.append(name) 25 | elif name[0] == 'I': 26 | feature_names.append(name) 27 | 28 | inputs = dict() 29 | for name 
in data.columns: 30 | if name[0] == 'C': 31 | inputs[name] = [[int(value)] for value in data[name]] 32 | elif name[0] == 'I': 33 | inputs[name] = [[float(value)] for value in data[name]] 34 | post = json.dumps({'inputs':inputs}) 35 | command = f"curl -d '{post}' {args.host}/v1/models/{args.model}:predict" 36 | print(command) 37 | result = json.load(os.popen(command)) 38 | print(json.dumps(result)) 39 | 40 | if "outputs" not in result or len(result["outputs"]) != data.shape[0]: 41 | print("get error result!") 42 | exit(1) 43 | -------------------------------------------------------------------------------- /laboratory/benchmark/Dockerfile: -------------------------------------------------------------------------------- 1 | RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \ 2 | python3.7 get-pip.py && \ 3 | rm get-pip.py 4 | 5 | RUN apt-get update && apt-get install -y python3.7-dev 6 | 7 | RUN pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir \ 8 | future \ 9 | grpcio \ 10 | h5py \ 11 | mock \ 12 | numpy \ 13 | requests \ 14 | pandas \ 15 | sklearn \ 16 | deepctr \ 17 | tensorflow==2.2 18 | 19 | RUN apt-get update && apt-get install -y cmake build-essential devscripts debhelper fakeroot 20 | RUN wget https://github.com/NVIDIA/nccl/archive/v2.8.3-1.tar.gz && tar -xzf v2.8.3-1.tar.gz && \ 21 | cd nccl-2.8.3-1 && make -j src.build && make pkg.debian.build 22 | RUN apt-get -y install ./nccl-2.8.3-1/build/pkg/deb/libnccl2_2.8.3-1+cuda10.1_amd64.deb ./nccl-2.8.3-1/build/pkg/deb/libnccl-dev_2.8.3-1+cuda10.1_amd64.deb 23 | RUN HOROVOD_GPU_OPERATIONS=NCCL pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir horovod 24 | 25 | WORKDIR /root 26 | RUN apt-get -y install libnuma-dev librdmacm-dev libibverbs-dev 27 | 28 | RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.5.tar.gz && \ 29 | tar -xzf openmpi-4.0.5.tar.gz && cd openmpi-4.0.5 && \ 30 | ./configure --prefix=/usr/local/openmpi CFLAGS="-fPIC" CXXFlAGS="-fPIC" --enable-static && \ 31 | make -j && make install 32 | 33 | RUN apt-get update && apt-get install -y gawk vim libssl-dev tsocks privoxy ssh patchelf 34 | 35 | RUN rm /usr/bin/python && rm /usr/bin/python3 && rm /usr/local/bin/pip && rm /usr/local/bin/pip3 && \ 36 | ln -s /usr/bin/python3.7 /usr/bin/python && \ 37 | ln -s /usr/bin/python3.7 /usr/bin/python3 && \ 38 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip && \ 39 | ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip3 40 | 41 | RUN pip3.7 install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html 42 | RUN pip3.7 uninstall -y horovod && HOROVOD_GPU_OPERATIONS=NCCL pip3.7 install -i https://mirrors.aliyun.com/pypi/simple/ --no-cache-dir --upgrade horovod 43 | 44 | ENV THRID_PARTY /usr/local 45 | -------------------------------------------------------------------------------- /laboratory/benchmark/analyze.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | data = pandas.read_csv('train_1000w.csv') 3 | 4 | cache = dict() 5 | 6 | feature = list() 7 | batch_size = 4096 8 | all_whole_unique = 0 9 | all_related_unique = 0 10 | for name in data.columns: 11 | if name[0] != 'C': 12 | continue 13 | cache[name] = set() 14 | column = data[name] 15 | whole = 0 16 | whole_unique = 0 17 | related = 0 18 | related_unique = 0 19 | for i in range(1, 100): 20 | prev = set() 21 | for j in range(batch_size): 22 | cache[name].add(column[(i - 1) * batch_size + j]) 23 | 
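# Two bookkeeping sets drive this measurement: cache[name] accumulates every
# id ever seen for the column, while prev holds only the previous batch's ids.
# Each new batch is compared against both to estimate how many embedding
# lookups a cross-batch cache could have served.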
prev.add(column[(i - 1) * batch_size + j]) 24 | rlt = list() 25 | whl = column[i * batch_size: (i + 1) * batch_size] 26 | cache_hit = list() 27 | for key in whl: 28 | if key in prev: 29 | rlt.append(key) 30 | if key in cache[name]: 31 | cache_hit.append(key) 32 | whole += len(whl) 33 | whole_unique += len(set(whl)) 34 | related += len(rlt) 35 | related_unique += len(set(rlt)) 36 | if i == 64: 37 | print(name, data[name].max() + 1, len(set(whl)), len(set(cache_hit))) 38 | feature.append([name, [whole, whole_unique, related, related_unique]]) 39 | all_whole_unique += whole_unique 40 | all_related_unique += related_unique 41 | print(name, whole, related) 42 | print(name, whole_unique, related_unique) 43 | 44 | print() 45 | print(all_whole_unique, all_related_unique) 46 | feature = sorted(feature, key=lambda x: x[1][1]) 47 | for name, values in feature: 48 | print(name, values[1] / values[0], values[2] / values[0], values[3] / values[1]) 49 | -------------------------------------------------------------------------------- /laboratory/benchmark/benchmark.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # paddle 2.1 3 | # git clone paddleRec -b 2.0 4 | -------------------------------------------------------------------------------- /laboratory/benchmark/benchmark.py: -------------------------------------------------------------------------------- 1 | # cuda = 10.1 2 | # torch = 1.7 3 | # tensorflow = 2.2 4 | 5 | import os 6 | import sys 7 | import time 8 | 9 | def run_remote_server(user, ip, port): 10 | os.system('echo "bash run_server.sh {}:{}\n sleep 1\n exit\n" | ssh {}@{}'.format(ip, port, user, ip)) 11 | 12 | def run(py, data, model, embedding_dim, options, np=1, bind_ip=None, master_endpoint=None): 13 | if data.endswith('csv'): 14 | extend = 'csv' 15 | else: 16 | extend = 'tf' 17 | name = 'result/{}_{}_{}_{}'.format(py, extend, model, embedding_dim) 18 | command = 'horovodrun -np {} python3.7 {}.py'.format(np, py) 19 | command += ' --data {} --model {} --embedding_dim {}'.format(data, model, embedding_dim) 20 | for option in options: 21 | name += '_{}'.format(option) 22 | command += ' --{}'.format(option) 23 | if master_endpoint: 24 | name += '_remote' 25 | command += ' --master_endpoint {}'.format(master_endpoint) 26 | if bind_ip: 27 | command += ' --bind_ip {}'.format(bind_ip) 28 | name += '_' + str(np) 29 | command += ' 1>{}.out 2>{}.err'.format(name, name) 30 | print(command) 31 | os.system(command) 32 | time.sleep(1) 33 | 34 | 35 | if len(sys.argv) > 3: 36 | # remote 37 | user = sys.argv[1] 38 | remote_ip = sys.argv[2] 39 | bind_ip = sys.argv[3] 40 | port = 61000 41 | for model in ['WDL', 'DeepFM']: 42 | for embedding_dim in [9, 64]: 43 | for options in [['server'], ['server', 'cache'], ['server', 'cache', 'prefetch']]: 44 | for np in [1, 2, 4, 8]: 45 | port += 1 46 | time.sleep(60) 47 | run_remote_server(user, remote_ip, port) 48 | time.sleep(60) 49 | run('deepctr_criteo', 'tfrecord', model, embedding_dim, options, np=np, 50 | bind_ip=bind_ip, master_endpoint='{}:{}'.format(remote_ip, port)) 51 | else: 52 | #local 53 | for data in ['tfrecord', 'train.csv']: 54 | for model in ['WDL', 'DeepFM', 'xDeepFM']: 55 | for embedding_dim in [9, 64]: 56 | for options in [[], ['server'], ['server', 'cache'], ['server', 'cache', 'prefetch']]: 57 | for np in [1, 2, 4, 8]: 58 | run('deepctr_criteo', data, model, embedding_dim, options, np=np) 59 | run('deepctr_criteo', data, model, embedding_dim, options + ['cpu'], np=1) 60 | 61 | for model in 
['WDL', 'DeepFM']: 62 | for embedding_dim in [9, 64]: 63 | for np in [1, 2, 4, 8]: 64 | run('deepctr_criteo_torch', data, model, embedding_dim, [], np=np) 65 | 66 | for model in ['WDL', 'DeepFM']: 67 | for embedding_dim in [9, 64]: 68 | run('deepctr_criteo_torch', data, model, embedding_dim, ['cpu'], np=1) 69 | 70 | -------------------------------------------------------------------------------- /laboratory/benchmark/parse_tensor_board.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | 4 | profile = json.loads(open(sys.argv[1]).read()) 5 | 6 | timeline = dict() 7 | for event in profile['traceEvents']: 8 | if 'name' in event and 'ts' in event: 9 | p = event['name'].rfind(':') 10 | name = event['name'][p + 1:] 11 | timeline.setdefault(name, list()) 12 | timeline[name].append(event) 13 | 14 | for name, events in timeline.items(): 15 | l = min(event['ts'] for event in events) 16 | r = max(event['ts'] + event['dur'] for event in events) 17 | s = sum(event['dur'] for event in events) 18 | c = len(events) 19 | print(name, int(l / 1000), int(r / 1000), int(s / 1000), c) 20 | 21 | -------------------------------------------------------------------------------- /laboratory/benchmark/summary.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | times = dict() 4 | 5 | for name in os.listdir(sys.argv[1]): 6 | time = 100000000 7 | for line in open(sys.argv[1] + '/' + name): 8 | r = line.find('s - loss') 9 | l = line.find('-') 10 | if l > r: 11 | l = line.find(':') 12 | if l != -1 and r != -1: 13 | time = min(time, int(line[l+1:r])) 14 | sp = len(name) - 6 15 | key, np = name[:sp], name[sp:] 16 | times.setdefault(key, [0, 0, 0, 0]) 17 | if np == '_1.out': 18 | times[key][0] = time 19 | if np == '_2.out': 20 | times[key][1] = time 21 | if np == '_4.out': 22 | times[key][2] = time 23 | if np == '_8.out': 24 | times[key][3] = time 25 | 26 | for key, value in sorted(times.items()): 27 | print(key, *value) -------------------------------------------------------------------------------- /laboratory/benchmark/tensornet.Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # tensorflow=2.2 3 | # python3.7 4 | # git clone tensornet -b 1.1 5 | # cp to /usr/local/lib/python3.7 6 | # openmpi4.0 7 | # LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH 8 | # tensornet/WORKSPACE 9 | # tensornet/examples 10 | 11 | -------------------------------------------------------------------------------- /laboratory/inject/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.2.0-gpu 2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake 3 | RUN pip install horovod 4 | ADD . 
/openembedding 5 | WORKDIR /openembedding/laboratory/inject 6 | 7 | RUN bash inject.sh 8 | WORKDIR /root 9 | -------------------------------------------------------------------------------- /laboratory/inject/inject.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | site=`python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"` 3 | cat "openembedding_inject_tensorflow.py" > "${site}/openembedding_inject_tensorflow.py" 4 | cat "sitecustomize.py" > "/usr/lib/python3.6/sitecustomize.py" 5 | 6 | python=python3.6 7 | which_python=`which ${python}` 8 | which_pythonm=`which ${python}m` 9 | 10 | cat python > "${which_python}" 11 | echo "${which_pythonm}" '"${args[@]}"' >> "$which_python" 12 | 13 | 14 | pico_compile criteo_deepctr_network.py -o pico_network_model.py 15 | pico_run -np 4 pico_network_model.py 16 | -------------------------------------------------------------------------------- /laboratory/inject/network_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas 3 | import tensorflow as tf 4 | import horovod.tensorflow.keras as hvd 5 | import deepctr.models 6 | import deepctr.feature_column 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | default_data = os.path.dirname(os.path.abspath(__file__)) + '/train100.csv' 11 | parser.add_argument('--data', default=default_data) # input data file 12 | parser.add_argument('--optimizer', default='Adam') 13 | parser.add_argument('--model', default='DeepFM') 14 | parser.add_argument('--checkpoint', default='', help='path to save checkpoints') # includes the optimizer 15 | parser.add_argument('--load', default='', help='checkpoint path to restore') # includes the optimizer 16 | parser.add_argument('--save', default='', help='path to save the distributed serving model') # does not include the optimizer 17 | args = parser.parse_args() 18 | if not args.optimizer.endswith(')'): 19 | args.optimizer += '()' # auto call args.optimizer 20 | 21 | 22 | # process data 23 | hvd.init() 24 | data = pandas.read_csv(args.data) 25 | n = data.shape[0] // hvd.size() 26 | data = data.iloc[hvd.rank() * n: hvd.rank() * n + n] 27 | inputs = dict() 28 | feature_columns = list() 29 | for name in data.columns: 30 | if name[0] == 'C': 31 | inputs[name] = data[name] % 65536 32 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 33 | vocabulary_size=65536, embedding_dim=9, dtype='int64')) 34 | elif name[0] == 'I': 35 | inputs[name] = data[name] 36 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 37 | 38 | 39 | # compile distributed model 40 | optimizer = eval("tf.keras.optimizers." + args.optimizer) 41 | optimizer = hvd.DistributedOptimizer(optimizer) 42 | model = eval("deepctr.models." 
+ args.model)(feature_columns, feature_columns, task='binary') 43 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC'], experimental_run_tf_function=False) 44 | 45 | 46 | # load --> fit --> save 47 | callbacks = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), 48 | hvd.callbacks.MetricAverageCallback() ] 49 | if args.checkpoint and hvd.rank() == 0: 50 | callbacks.append(tf.keras.callbacks.ModelCheckpoint(args.checkpoint + '{epoch}')) 51 | if args.load: 52 | model.load_weights(args.load) 53 | 54 | model.fit(inputs, data['label'], batch_size=8, epochs=5, callbacks=callbacks, verbose=1) 55 | 56 | if args.save and hvd.rank() == 0: 57 | model.save(args.save, include_optimizer=False) 58 | -------------------------------------------------------------------------------- /laboratory/inject/openembedding_inject_tensorflow.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | try: 5 | from tensorflow.python import keras 6 | import tensorflow as tf 7 | import openembedding.tensorflow as embed 8 | except ImportError: 9 | pass 10 | else: 11 | class Embedding(embed.Embedding): 12 | def __init__(self, *args, **kwargs): 13 | explicit = kwargs.pop('explicit', False) 14 | super().__init__(*args, explicit=explicit, **kwargs) 15 | 16 | keras.layers.Embedding = Embedding 17 | tf.keras.layers.Embedding = Embedding 18 | keras.models.Model = embed.Model 19 | tf.keras.models.Model = embed.Model 20 | 21 | _NotExplicitClass = dict() 22 | def _NotExplicit(T): 23 | class _Optimizer(T): 24 | def __init__(self, *args, **kwargs): 25 | self.__Class = _NotExplicitClass[T] 26 | explicit = kwargs.pop('explicit', False) 27 | super(self.__Class, self).__init__(*args, explicit=explicit, **kwargs) 28 | 29 | if T not in _NotExplicitClass: 30 | _NotExplicitClass[T] = type(T.__name__, (T,), dict(_Optimizer.__dict__)) 31 | return _NotExplicitClass[T] 32 | 33 | tf.keras.optimizers.Adadelta = _NotExplicit(embed.Adadelta) 34 | tf.keras.optimizers.Adagrad = _NotExplicit(embed.Adagrad) 35 | tf.keras.optimizers.Adam = _NotExplicit(embed.Adam) 36 | tf.keras.optimizers.Adamax = _NotExplicit(embed.Adamax) 37 | tf.keras.optimizers.Ftrl = _NotExplicit(embed.Ftrl) 38 | tf.keras.optimizers.Nadam = _NotExplicit(embed.Nadam) 39 | tf.keras.optimizers.RMSprop = _NotExplicit(embed.RMSprop) 40 | tf.keras.optimizers.SGD = _NotExplicit(embed.SGD) 41 | 42 | -------------------------------------------------------------------------------- /laboratory/inject/python: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | args=() 3 | until [ $# == '0' ]; do 4 | arg="$1" 5 | args[${#args[@]}]="$arg" 6 | shift 7 | if [ "X$file" == "X" ]; then 8 | case "$arg" in 9 | --* ) 10 | if [ "X$arg" == "X--help" ]; then 11 | help="1" 12 | fi 13 | ;; 14 | -* ) 15 | for i in `seq ${#arg}`; do 16 | case "${arg:$i:1}" in 17 | c ) 18 | command="1" 19 | if [ "$i" == "${#arg}" ] && [ $# != '0' ]; then 20 | args[${#args[@]}]="$1" 21 | shift 22 | fi 23 | ;; 24 | m ) 25 | model="1" 26 | if [ "$i" == "${#arg}" ] && [ $# != '0' ]; then 27 | args[${#args[@]}]="$1" 28 | shift 29 | fi 30 | ;; 31 | h ) 32 | help="1" 33 | esac 34 | done 35 | ;; 36 | * ) 37 | if [ "X$model" == "X" ] && [ "X$command" == "X" ] && [ "X$help" == "X" ]; then 38 | file="1" 39 | if grep -q "import tensorflow" "$arg" 2>/dev/null; then 40 | export HYPEREMBEDDING_INJECT_TENSORFLOW="1" 41 | fi 42 | fi 43 | esac 44 | fi 45 | done 46 | -------------------------------------------------------------------------------- /laboratory/inject/sitecustomize.py: -------------------------------------------------------------------------------- 1 | # install the apport exception handler if available 2 | try: 3 | import apport_python_hook 4 | except ImportError: 5 | pass 6 | else: 7 | apport_python_hook.install() 8 | 9 | import os 10 | if os.environ.get('HYPEREMBEDDING_INJECT_TENSORFLOW', None) == '1': 11 | import sys 12 | sys.argv=[""] 13 | import openembedding_inject_tensorflow 14 | -------------------------------------------------------------------------------- /laboratory/onnx/criteo_deepctr_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas 3 | import torch 4 | # import horovod.torch as hvd 5 | import time 6 | import numpy as np 7 | import sklearn 8 | import deepctr_torch as deepctr 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data', required=True) 13 | parser.add_argument('--optimizer', default='Adagrad', choices=['Adagrad']) 14 | parser.add_argument('--model', default="DeepFM", choices=["WDL", 'DeepFM', 'XDeepFM']) 15 | parser.add_argument('--embedding_dim', default=9, type=int) 16 | parser.add_argument('--batch_size', default=4096, type=int) 17 | parser.add_argument('--epochs', default=2, type=int) 18 | parser.add_argument('--onnx', action='store_true') 19 | parser.add_argument('--cpu', action='store_true') 20 | args = parser.parse_args() 21 | # hvd.init() 22 | # if args.cpu: 23 | # device = 'cpu' 24 | # else: 25 | # # torch.cuda.set_device(hvd.local_rank()) 26 | # device = 'cuda:{}'.format(hvd.local_rank()) 27 | 28 | device = 'cuda' 29 | def train_model(model, x, y, batch_size, epochs=1, optimizer=torch.optim.Adagrad): 30 | x = [np.expand_dims(tensor, 1) for tensor in x] 31 | x = torch.from_numpy(np.concatenate(x, axis=-1)) 32 | y = torch.from_numpy(y) 33 | train_tensor_data = torch.utils.data.TensorDataset(x, y) 34 | train_loader = torch.utils.data.DataLoader(dataset=train_tensor_data, batch_size=batch_size) 35 | loss_func = torch.nn.functional.binary_cross_entropy 36 | for epoch in range(epochs): 37 | start_time = time.time() 38 | epoch_loss = 0.0 39 | epoch_auc = 0.0 40 | for x_train, y_train in train_loader: 41 | x_train = x_train.to(device).float() 42 | y_train = y_train.to(device).float() 43 | y_pred = model(x_train).to(device).squeeze() 44 | optimizer.zero_grad() 45 | loss = loss_func(y_pred, y_train.squeeze(), reduction='sum') 46 | epoch_loss += loss.item() 47 | loss.backward() 48 | optimizer.step() 49 | # 
train_result["AUC"].append(sklearn.metrics.roc_auc_score( 50 | # y.cpu().data.numpy(), y_pred.cpu().data.numpy().astype("float64"))) 51 | 52 | epoch_time = int(time.time() - start_time) 53 | print('Epoch {0}/{1}'.format(epoch + 1, epochs)) 54 | eval_str = "{0}s - loss: {1: .4f}".format(epoch_time, epoch_loss) 55 | # eval_str += " - " + name + ": {0: .4f}".format(epoch_logs[name]) 56 | print(eval_str) 57 | 58 | 59 | if __name__ == "__main__": 60 | data = pandas.read_csv(args.data) 61 | num_lines = data.shape[0] 62 | num_local_lines = num_lines // args.batch_size * args.batch_size 63 | local_start = 0 64 | # num_local_lines = int(num_lines / hvd.size()) // args.batch_size * args.batch_size 65 | # local_start = hvd.local_rank() * num_local_lines 66 | local_end = local_start + num_local_lines 67 | print("num_lines:%d, num_local_lines:%d" % (num_lines, num_local_lines)) 68 | print("local_start:%d, local_end:%d" % (local_start, local_end)) 69 | 70 | target = ['label'] 71 | dense_features = ['I' + str(i) for i in range(1, 14)] 72 | sparse_features = ['C' + str(i) for i in range(1, 27)] 73 | print(data.columns) 74 | 75 | feature_columns = [] 76 | for name in sparse_features: 77 | feature_columns.append(deepctr.inputs.SparseFeat(name, data[name].max() + 1, dtype='int64')) 78 | for name in dense_features: 79 | feature_columns.append(deepctr.inputs.DenseFeat(name, 1, dtype='float32')) 80 | train = data.iloc[local_start:local_end] 81 | train_model_input = {name:train[name] for name in sparse_features + dense_features} 82 | 83 | if args.model == 'WDL': 84 | fc_sizes = (512, 256, 128, 32) 85 | elif args.model in {'DeepFM', 'xDeepFM'}: 86 | fc_sizes = (400, 400, 400) 87 | else: 88 | print("unknown model ", args.model) 89 | model = eval("deepctr.models." + args.model)(feature_columns, feature_columns, device=device, 90 | task='binary', dnn_hidden_units=fc_sizes, l2_reg_linear=0, l2_reg_embedding=0) 91 | x = [train_model_input[name] for name in model.feature_index] 92 | if args.onnx: 93 | from onnxruntime.training.ortmodule import ORTModule 94 | model = ORTModule(model) 95 | optimizer=torch.optim.Adagrad(model.parameters()) 96 | train_model(model, x, train[target].values, 97 | batch_size=args.batch_size, epochs=args.epochs, optimizer=optimizer) 98 | -------------------------------------------------------------------------------- /laboratory/publish-serving.sh: -------------------------------------------------------------------------------- 1 | echo $1 2 | target=$1/tensorflow_serving/custom_ops 3 | if [ "X$1" != "X" ]; then 4 | mkdir -p "$target" 5 | mkdir -p "$target/openembedding" 6 | mkdir -p "$target/openembedding/core" 7 | mkdir -p "$target/openembedding/tensorflow" 8 | cp "./build/openembedding/core/libcexb_pack.so" "$target/openembedding/core/libcexb_pack.so" 9 | cp "./openembedding/core/c_api.h" "$target/openembedding/core/c_api.h" 10 | cp "./openembedding/tensorflow/exb_ops.cpp" "$target/openembedding/tensorflow/exb_ops.cpp" 11 | cp "./openembedding/tensorflow/exb_ops.cpp" "$target/openembedding/tensorflow/exb_ops.h" 12 | fi 13 | -------------------------------------------------------------------------------- /laboratory/strangedemo/Dockerfile.criteo: -------------------------------------------------------------------------------- 1 | FROM openembedding-demo:2.2 2 | ADD train.csv /root/train.csv 3 | ADD dac_sample.csv /root/dac_sample.csv 4 | ADD laboratory/strangedemo/criteo_predict.py /root/criteo_predict.py 5 | ADD laboratory/strangedemo/criteo_deepctr /root/criteo_deepctr 6 | ADD 
laboratory/strangedemo/criteo_deepctr_np /root/criteo_deepctr_np 7 | ADD laboratory/strangedemo/criteo_lr /root/criteo_lr 8 | 9 | RUN ln -s /root/train.csv /root/criteo_deepctr/train.csv && \ 10 | ln -s /root/dac_sample.csv /root/criteo_deepctr/dac_sample.csv && \ 11 | ln -s /root/criteo_predict.py /root/criteo_deepctr/criteo_predict.py 12 | 13 | RUN ln -s /root/train.csv /root/criteo_deepctr_np/train.csv && \ 14 | ln -s /root/dac_sample.csv /root/criteo_deepctr_np/dac_sample.csv && \ 15 | ln -s /root/criteo_predict.py /root/criteo_deepctr_np/criteo_predict.py 16 | 17 | RUN ln -s /root/train.csv /root/criteo_lr/train.csv && \ 18 | ln -s /root/dac_sample.csv /root/criteo_lr/dac_sample.csv && \ 19 | ln -s /root/criteo_predict.py /root/criteo_lr/criteo_predict.py 20 | WORKDIR /root 21 | -------------------------------------------------------------------------------- /laboratory/strangedemo/Dockerfile.push: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | RUN apt-get update && apt-get install -y vim apt-transport-https ca-certificates curl gnupg-agent software-properties-common wget 3 | RUN curl -sSL https://get.daocloud.io/docker | sh 4 | RUN wget https://mirror.azure.cn/kubernetes/helm/helm-v2.14.1-linux-amd64.tar.gz && \ 5 | tar -xzf helm-v2.14.1-linux-amd64.tar.gz && cp linux-amd64/helm linux-amd64/tiller /usr/local/bin && \ 6 | rm -rf linux-amd64 helm-v2.14.1-linux-amd64.tar.gz 7 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr/criteo_deepctr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3" 14 | }, 15 | "orig_nbformat": 4, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3.7.3 64-bit" 19 | }, 20 | "interpreter": { 21 | "hash": "51c43b68502c46154a57a0f411be94ca0e84f1091eab4730ae0fb62cf38c2f81" 22 | } 23 | }, 24 | "nbformat": 4, 25 | "nbformat_minor": 2, 26 | "cells": [ 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!python3 criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!mlcompile criteo_deepctr.py -o ml_criteo_deepctr.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!GLOG_minloglevel=1 mlrun -np 4 python3 ml_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!python3 criteo_predict.py --data train.csv --rows 1 --model criteo_model --host {serving address}" 61 | ] 62 | } 63 | ] 64 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr/criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import 
deepctr.models 4 | import deepctr.feature_column 5 | 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--data', required=True) # input data file 9 | parser.add_argument('--learning_rate', type=float, required=True) 10 | parser.add_argument('--batch_size', type=int, required=True) 11 | parser.add_argument('--save', required=True) 12 | args = parser.parse_args() 13 | 14 | data = pandas.read_csv(args.data) 15 | inputs = dict() 16 | feature_columns = list() 17 | for name in data.columns: 18 | if name[0] == 'C': 19 | inputs[name] = data[name] 20 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 21 | vocabulary_size=data[name].max() + 1, embedding_dim=4, dtype='int64')) 22 | elif name[0] == 'I': 23 | inputs[name] = data[name] 24 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 25 | 26 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 27 | model = deepctr.models.xDeepFM(feature_columns, feature_columns, task='binary', 28 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 29 | model.compile(optimizer, 'binary_crossentropy', metrics=['AUC']) 30 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=3, verbose=2) 31 | model.save(args.save, overwrite=True, include_optimizer=False) 32 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import deepctr.models 4 | import deepctr.feature_column 5 | 6 | import argparse 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--data', required=True) # input data file 9 | parser.add_argument('--learning_rate', type=float, required=True) 10 | parser.add_argument('--batch_size', type=int, required=True) 11 | parser.add_argument('--save', required=True) 12 | args = parser.parse_args() 13 | 14 | data = pandas.read_csv(args.data) 15 | inputs = dict() 16 | feature_columns = list() 17 | for name in data.columns: 18 | if name[0] == 'C': 19 | inputs[name] = data[name] 20 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 21 | vocabulary_size=data[name].max() + 1, embedding_dim=64, dtype='int64')) 22 | elif name[0] == 'I': 23 | inputs[name] = data[name] 24 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 25 | 26 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 27 | model = deepctr.models.DeepFM(feature_columns, feature_columns, task='binary', 28 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 29 | model.compile(optimizer, 'binary_crossentropy', metrics=['AUC']) 30 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=3, verbose=2) 31 | model.save(args.save, overwrite=True, include_optimizer=False) 32 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/criteo_deepctr_np.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | "version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": 3 14 | }, 15 | "orig_nbformat": 4 16 | }, 17 | "nbformat": 4, 18 | "nbformat_minor": 2, 19 | "cells": [ 20 | { 21 | "cell_type": "code", 
"execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "!horovodrun -np 4 python3 horovod_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "!mlcompile criteo_deepctr.py -o ml_criteo_deepctr.py" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "!GLOG_minloglevel=1 mlrun -np 4 python3 ml_criteo_deepctr.py --data train.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!python3 criteo_predict.py --data train.csv --rows 1 --model criteo_model --host {serving地址}" 54 | ] 55 | } 56 | ] 57 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_deepctr_np/horovod_criteo_deepctr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | import deepctr.models 4 | import deepctr.feature_column 5 | import horovod.tensorflow.keras as hvd 6 | 7 | import argparse 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--data', required=True) # 输入的数据文件 10 | parser.add_argument('--learning_rate', required=True, type=float) 11 | parser.add_argument('--batch_size', required=True, type=int) 12 | parser.add_argument('--save', required=True) 13 | args = parser.parse_args() 14 | 15 | hvd.init() 16 | gpus = tf.config.experimental.list_physical_devices('GPU') 17 | if gpus: 18 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU') 19 | 20 | data = pandas.read_csv(args.data) 21 | inputs = dict() 22 | feature_columns = list() 23 | for name in data.columns: 24 | if name[0] == 'C': 25 | inputs[name] = data[name] 26 | feature_columns.append(deepctr.feature_column.SparseFeat(name, 27 | vocabulary_size=data[name].max() + 1, embedding_dim=64, dtype='int64')) 28 | elif name[0] == 'I': 29 | inputs[name] = data[name] 30 | feature_columns.append(deepctr.feature_column.DenseFeat(name, 1, dtype='float32')) 31 | 32 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 33 | model = deepctr.models.DeepFM(feature_columns, feature_columns, task='binary', 34 | l2_reg_linear=0, l2_reg_embedding=0, l2_reg_dnn=0) 35 | 36 | # 使用 horovod 实现数据并行 37 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 38 | n = data.shape[0] // hvd.size() * hvd.size() 39 | for key in inputs.keys(): 40 | inputs[key] = inputs[key][hvd.rank():n:hvd.size()] 41 | labels = data['label'][hvd.rank():n:hvd.size()] 42 | callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0), 43 | hvd.callbacks.MetricAverageCallback()] 44 | model.compile(optimizer, "binary_crossentropy", metrics=['AUC']) 45 | model.fit(inputs, labels, callbacks=callbacks, 46 | batch_size=args.batch_size, epochs=3, verbose=2) 47 | 48 | if hvd.rank() == 0: 49 | model.save(args.save, overwrite=True, include_optimizer=False) 50 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_lr/criteo_lr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "language_info": { 4 | "codemirror_mode": { 5 | "name": "ipython", 6 | 
"version": 3 7 | }, 8 | "file_extension": ".py", 9 | "mimetype": "text/x-python", 10 | "name": "python", 11 | "nbconvert_exporter": "python", 12 | "pygments_lexer": "ipython3", 13 | "version": "3" 14 | }, 15 | "orig_nbformat": 4, 16 | "kernelspec": { 17 | "name": "python3", 18 | "display_name": "Python 3.7.3 64-bit" 19 | }, 20 | "interpreter": { 21 | "hash": "51c43b68502c46154a57a0f411be94ca0e84f1091eab4730ae0fb62cf38c2f81" 22 | } 23 | }, 24 | "nbformat": 4, 25 | "nbformat_minor": 2, 26 | "cells": [ 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "!python3 criteo_lr.py --data dac_sample.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/1" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "!mlcompile criteo_lr.py -o ml_criteo_lr.py" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "!GLOG_minloglevel=1 python3 ml_criteo_lr.py --data dac_sample.csv --learning_rate 0.001 --batch_size 4096 --save criteo_model/2" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "!python3 criteo_predict.py --data dac_sample.csv --rows 1 --model criteo_model --host {serving地址}" 61 | ] 62 | } 63 | ] 64 | } -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_lr/criteo_lr.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import tensorflow as tf 3 | 4 | def CriteoLR(features, input_dim): 5 | embeddings = tf.keras.layers.Embedding(input_dim, 1, 6 | embeddings_initializer=tf.keras.initializers.Zeros()) 7 | fields = list() 8 | for name, tensor in features.items(): 9 | if name[0] == 'C': 10 | fields.append(embeddings(tensor)) 11 | else: 12 | fields.append(tf.reshape(tensor, [-1, 1, 1])) 13 | concat = tf.keras.layers.concatenate(fields) 14 | output = tf.keras.layers.Dense(1, activation='sigmoid')(concat) 15 | return tf.keras.models.Model(inputs=features, outputs=[output]) 16 | 17 | import argparse 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--data', required=True) 20 | parser.add_argument('--learning_rate', type=float, required=True) 21 | parser.add_argument('--batch_size', type=int, required=True) 22 | parser.add_argument('--save', required=True) 23 | args = parser.parse_args() 24 | data = pandas.read_csv(args.data) 25 | inputs = dict() 26 | features = dict() 27 | vocabulary_size = 0 28 | for name in data.columns: 29 | if name[0] == 'C': 30 | inputs[name] = data[name] + vocabulary_size 31 | features[name] = tf.keras.Input(shape=[1], name=name, dtype=tf.int64) 32 | vocabulary_size += data[name].max() + 1 33 | elif name[0] == 'I': 34 | inputs[name] = data[name] 35 | features[name] = tf.keras.Input(shape=[1], name=name, dtype=tf.float32) 36 | 37 | # compile distributed model 38 | optimizer = tf.keras.optimizers.Adagrad(args.learning_rate) 39 | model = CriteoLR(features, vocabulary_size) 40 | model.compile(optimizer, 'binary_crossentropy') 41 | model.fit(inputs, data['label'], batch_size=args.batch_size, epochs=5, verbose=2) 42 | model.save(args.save, overwrite=True, include_optimizer=False) 43 | -------------------------------------------------------------------------------- /laboratory/strangedemo/criteo_predict.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pandas 4 | import argparse 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('--data', required=True) 7 | parser.add_argument('--rows', type=int, required=True) 8 | parser.add_argument('--model', required=True) 9 | parser.add_argument('--host', required=True) 10 | args = parser.parse_args() 11 | data = pandas.read_csv(args.data, nrows=args.rows) 12 | 13 | inputs = dict() 14 | for name in data.columns: 15 | if name[0] == 'C': 16 | inputs[name] = [[int(value)] for value in data[name]] 17 | elif name[0] == 'I': 18 | inputs[name] = [[float(value)] for value in data[name]] 19 | post = json.dumps({'inputs':inputs}) 20 | command = f"curl -d '{post}' {args.host}/v1/models/{args.model}:predict" 21 | print(command) 22 | os.system(command) 23 | 24 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:2.2.0-gpu 2 | RUN apt-get update && apt-get install -y gcc-7 g++-7 cmake 3 | RUN apt-get install -y vim wget 4 | RUN pip3 install horovod pandas scikit-learn deepctr 5 | RUN pip3 install jupyter jupyterlab 6 | 7 | ADD openembedding-0.1.0.tar.gz /openembedding/openembedding-0.1.0.tar.gz 8 | RUN pip3 install /openembedding/openembedding-0.1.0.tar.gz 9 | ADD laboratory/strangedemo/hook /openembedding/hook 10 | WORKDIR /openembedding/hook 11 | RUN bash install.sh 12 | WORKDIR /root 13 | RUN rm -rf /openembedding -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/install.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | site=`python3 -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"` 3 | cp openembedding_hook_tensorflow.py ${site}/ 4 | cp mlcompile /usr/local/bin/ 5 | cp mlrun /usr/local/bin/ 6 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/mlcompile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ "$#" != "3" ]; then 3 | echo -e "Usage: mlcompile file.py -o out.py" 4 | exit 1 5 | fi 6 | 7 | echo import openembedding_hook_tensorflow >"$3" 8 | cat "$1" >> "$3" 9 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/mlrun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | horovodrun "$@" 3 | -------------------------------------------------------------------------------- /laboratory/strangedemo/hook/openembedding_hook_tensorflow.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python import keras 3 | from tensorflow.keras.models import Model as KerasModel 4 | from tensorflow.keras.layers import Embedding as KerasEmbedding 5 | import horovod.tensorflow.keras as hvd 6 | import openembedding.tensorflow as embed 7 | 8 | 9 | class Embedding(embed.Embedding): 10 | def __init__(self, *args, **kwargs): 11 | explicit = kwargs.pop('explicit', False) 12 | super().__init__(*args, explicit=explicit, **kwargs) 13 | 14 | 15 | class Model(embed.Model): 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | 19 | def compile(self, optimizer, *args, 
**kwargs): 20 | kwargs.pop('experimental_run_tf_function', None) 21 | optimizer = embed.distributed_optimizer(optimizer, explicit=False) 22 | optimizer = hvd.DistributedOptimizer(optimizer, op=hvd.Sum) 23 | return super().compile(optimizer, *args, experimental_run_tf_function=False, **kwargs) 24 | 25 | def save(self, *args, **kwargs): 26 | if hvd.rank() == 0: 27 | keras.layers.Embedding = KerasEmbedding 28 | tf.keras.layers.Embedding = KerasEmbedding 29 | keras.Model = KerasModel 30 | tf.keras.Model = KerasModel 31 | keras.models.Model = KerasModel 32 | tf.keras.models.Model = KerasModel 33 | super().save_as_original_model(*args, **kwargs) 34 | keras.layers.Embedding = Embedding 35 | tf.keras.layers.Embedding = Embedding 36 | keras.Model = Model 37 | tf.keras.Model = Model 38 | keras.models.Model = Model 39 | tf.keras.models.Model = Model 40 | 41 | 42 | def save_weights(self, *args, **kwargs): 43 | if hvd.rank() == 0: 44 | super().save_weights(*args, **kwargs) 45 | 46 | def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, *args, **kwargs): 47 | if isinstance(x, dict): 48 | x1 = dict() 49 | n = len(y) // hvd.size() * hvd.size() 50 | for key, value in x.items(): 51 | x1[key] = value[hvd.rank():n:hvd.size()] 52 | y1 = y[hvd.rank():n:hvd.size()] 53 | else: 54 | raise ValueError('only support dict input') 55 | if not callbacks: 56 | callbacks = [] 57 | callbacks = callbacks + [ 58 | hvd.callbacks.BroadcastGlobalVariablesCallback(0), 59 | hvd.callbacks.MetricAverageCallback() ] 60 | return super().fit(x1, y1, batch_size, epochs, verbose, callbacks=callbacks, *args, **kwargs) 61 | 62 | 63 | keras.layers.Embedding = Embedding 64 | tf.keras.layers.Embedding = Embedding 65 | keras.Model = Model 66 | tf.keras.Model = Model 67 | keras.models.Model = Model 68 | tf.keras.models.Model = Model 69 | 70 | hvd.init() 71 | gpus = tf.config.experimental.list_physical_devices('GPU') 72 | if gpus: 73 | tf.config.experimental.set_visible_devices(gpus[hvd.local_rank() % len(gpus)], 'GPU') 74 | -------------------------------------------------------------------------------- /openembedding/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include_directories(client server variable entry) 2 | 3 | file(GLOB exb_src client/*.cpp server/*.cpp variable/EmbeddingVariable.cpp entry/c_api.cc) 4 | add_library(cexb_obj OBJECT ${exb_src}) 5 | 6 | # cexb_static for tensorflow ops 7 | add_library(cexb_static STATIC $<TARGET_OBJECTS:cexb_obj>) 8 | target_link_libraries(cexb_static pico_ps_static pico_core_static 9 | ${PicoCoreDep_STATIC_LIBRARIES} ${Jemalloc_pic_STATIC_LIBRARIES} 10 | ${RDMA_LIBRARIES} ${PMEM_STATIC_LIBRARIES} ${NDCTL_LIBRARIES} dl) 11 | 12 | # cexb without static libraries 13 | # add_library(cexb SHARED $<TARGET_OBJECTS:cexb_obj>) 14 | # target_link_libraries(cexb pico_ps pico_core ${PicoCoreDep_LIBRARIES} ${Jemalloc_pic_LIBRARIES} ${RDMA_LIBRARIES} dl) 15 | 16 | # cexb_pack with static libraries for tensorflow-serving 17 | add_library(cexb_pack SHARED $<TARGET_OBJECTS:cexb_obj>) 18 | target_link_libraries(cexb_pack pico_ps_static pico_core_static 19 | ${PicoCoreDep_STATIC_LIBRARIES} ${Jemalloc_pic_STATIC_LIBRARIES} 20 | ${RDMA_LIBRARIES} ${PMEM_STATIC_LIBRARIES} ${NDCTL_LIBRARIES} dl) 21 | 22 | add_executable(masterd entry/masterd.cc) 23 | target_link_libraries(masterd cexb_static) 24 | 25 | add_executable(server entry/server.cc) 26 | target_link_libraries(server cexb_static) 27 | 28 | find_package(Protobuf REQUIRED) 29 | find_package(OpenSSL REQUIRED) 30 | 
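# The generated protobuf sources for entry/controller.proto land in the build
# tree (see protobuf_generate_cpp below), so the binary dir goes on the include path.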
include_directories(${CMAKE_CURRENT_BINARY_DIR}) 31 | find_lib(BRPC_STATIC_LIBRARIES STATIC LIBS brpc protobuf) 32 | find_lib(BRPC_DYNAMIC_LIBRARIES SHARED LIBS leveldb) 33 | protobuf_generate_cpp(PROTO_SRC PROTO_HEADER entry/controller.proto) 34 | add_executable(controller entry/controller.cc ${PROTO_SRC}) 35 | target_compile_options(controller PRIVATE -Wno-unused-parameter) 36 | target_link_libraries(controller cexb_static ${BRPC_STATIC_LIBRARIES} ${BRPC_DYNAMIC_LIBRARIES} ${OPENSSL_CRYPTO_LIBRARY} ${OPENSSL_SSL_LIBRARY}) 37 | 38 | option(SKIP_CHECK_WHEEL_SETUP "try build tensorflow operator" OFF) 39 | if (NOT SKIP_CHECK_WHEEL_SETUP) 40 | # py_api should be compiled during pip install, here is just for simple verification. 41 | execute_process(COMMAND ${PYTHON} -c "import pybind11; print(pybind11.get_include(), end=\"\")" 42 | OUTPUT_VARIABLE PYBIND11_INCLUDE) 43 | add_library(exb SHARED entry/py_api.cc) 44 | target_include_directories(exb PRIVATE ${PYBIND11_INCLUDE}) 45 | target_link_libraries(exb PRIVATE cexb_pack) 46 | 47 | add_subdirectory(tensorflow) 48 | endif() 49 | 50 | # tests 51 | find_package(PicoTestDep) 52 | link_libraries(cexb_static ${PicoTestDep_STATIC_LIBRARIES}) 53 | add_executable(c_api_test entry/c_api_test.cpp) 54 | add_executable(c_api_ha_test entry/c_api_ha_test.cpp) 55 | if (USE_DCPMM) 56 | add_executable(pmem_c_api_test entry/pmem_c_api_test.cpp) 57 | add_executable(pmem_embedding_table_test variable/pmem_embedding_table_test.cpp) 58 | endif() 59 | 60 | include(GoogleTest) 61 | gtest_discover_tests(c_api_test) 62 | # At present, ha_test has a probability of failing, 63 | # because the current ps restore dead node has a small probability of failing. 64 | # This situation is currently considered by unittest to be caused by an abnormal restore crash. 65 | # Actually, you only need to restart again at this time. 66 | # When restarting the PS, the startup failure should be considered. 67 | # gtest_discover_tests(c_api_ha_test) -------------------------------------------------------------------------------- /openembedding/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ctypes 3 | libcexb_pack = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + '/libcexb_pack.so') 4 | import openembedding.libexb as libexb 5 | 6 | __version__ = libexb.version() 7 | 8 | 9 | 10 | ''' 11 | Master 12 | 13 | config: Configure in yaml format 14 | 15 | master_endpoint: Required when the Server or worker is initialized 16 | '': Start a Master in this process 17 | 18 | '{ip}:{port}': The endpoint of master 19 | 20 | bind_ip: Used by worker, Server and Master 21 | '': automatically bind to the ip address of a network card 22 | 23 | '{ip}': specify the ip address, bind on random port 24 | 25 | '{ip}:{port}': bind on the specified ip and port, only supported by Master 26 | 27 | num_workers: should be consistent in different workers 28 | 29 | wait_num_servers: should be consistent in different workers 30 | -1: start a Server in each worker process. 31 | n: need to wait the number of Servers start. 32 | ''' 33 | class Flags: 34 | def __init__(self, config='', master_endpoint='', bind_ip='', num_workers=1, wait_num_servers=-1): 35 | self.config = config 36 | self.master_endpoint = master_endpoint 37 | self.bind_ip = bind_ip 38 | self.num_workers = num_workers 39 | self.wait_num_servers = wait_num_servers 40 | flags = Flags() 41 | 42 | ''' 43 | Run a master in this process. 
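A minimal single-process sketch (assuming this package is imported as exb;
the variable names are illustrative):

    import openembedding as exb
    exb.flags.bind_ip = ''                       # bind a local network card automatically
    master = exb.Master()                        # starts serving on construction
    exb.flags.master_endpoint = master.endpoint  # '{ip}:{port}' handed to workers
    server = exb.Server()                        # a parameter server registers via the master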
44 | ''' 45 | class Master: 46 | def __init__(self): 47 | self.__master = libexb.Master(flags.bind_ip) 48 | 49 | def __del__(self): 50 | self.__master.finalize() 51 | 52 | @property 53 | def endpoint(self): 54 | ''' 55 | The format is '{ip}:{port}'. 56 | ''' 57 | return self.__master.endpoint 58 | 59 | ''' 60 | Run a parameter server in this process. 61 | ''' 62 | class Server: 63 | def __init__(self): 64 | self.__server = libexb.Server(flags.config, flags.master_endpoint, flags.bind_ip) 65 | 66 | def exit(self): 67 | ''' 68 | Send exit request to this server. 69 | ''' 70 | return self.__server.exit() 71 | 72 | def join(self): 73 | ''' 74 | Waiting for the server to exit. 75 | ''' 76 | return self.__server.join() -------------------------------------------------------------------------------- /openembedding/client/Communication.cpp: -------------------------------------------------------------------------------- 1 | #include "Communication.h" 2 | 3 | namespace paradigm4 { 4 | namespace pico { 5 | namespace embedding { 6 | 7 | 8 | Communication::Communication(): _comm_size(1) {} // for native connection 9 | 10 | Communication::Communication(core::RpcService* rpc, int32_t comm_size, std::string rpc_name) { 11 | _rpc = rpc; 12 | _comm_size = comm_size; 13 | _rpc_name = rpc_name; 14 | _rpc_server = _rpc->create_server(_rpc_name); 15 | _serving_th = std::thread(&Communication::serving, this); 16 | _rpc_client = _rpc->create_client(_rpc_name, comm_size); 17 | core::RpcServiceInfo info; 18 | _rpc_client->get_rpc_service_info(info); 19 | SCHECK(info.servers.size() == static_cast(comm_size)) << "error sync num"; 20 | for (core::ServerInfo server: info.servers) { 21 | SCHECK(server.server_id < comm_size) << "error server id"; 22 | } 23 | _comm_rank = _rpc_server->id(); 24 | SCHECK(_comm_rank < comm_size) << "error comm rank"; 25 | _dealer = [this]() { return _rpc_client->create_dealer(); }; 26 | } 27 | 28 | Communication::~Communication() { 29 | _dealer.clear(); 30 | _rpc_server->terminate(); 31 | _serving_th.join(); 32 | _rpc_client.reset(); 33 | _rpc_server.reset(); 34 | _rpc->deregister_rpc_service(_rpc_name); 35 | } 36 | 37 | 38 | comm_rank_t Communication::barrier(std::string name) { 39 | int32_t num = _comm_size; 40 | if (num == 1) { 41 | return _comm_rank; 42 | } 43 | 44 | core::RpcRequest req; 45 | req.head().sid = std::hash()(name) % _comm_size; 46 | req << BARRIER << name << num << _comm_rank; 47 | 48 | std::shared_ptr dealer = _dealer.acquire(); 49 | core::RpcResponse resp = dealer->sync_rpc_call(std::move(req)); 50 | _dealer.release(std::move(dealer)); 51 | 52 | comm_rank_t selected; 53 | resp >> selected; 54 | return selected; 55 | } 56 | 57 | bool Communication::load_model_sign(const std::string& model_sign) { 58 | core::RpcRequest req; 59 | req.head().sid = 0; 60 | req << LOAD_MODEL_SIGN << model_sign; 61 | 62 | std::shared_ptr dealer = _dealer.acquire(); 63 | core::RpcResponse resp = dealer->sync_rpc_call(std::move(req)); 64 | _dealer.release(std::move(dealer)); 65 | 66 | bool result; 67 | resp >> result; 68 | return result; 69 | } 70 | 71 | void Communication::inner_boardcast(std::string name, core::BinaryArchive& ar, comm_rank_t from) { 72 | int32_t num = _comm_size; 73 | if (num == 1) { 74 | return; 75 | } 76 | 77 | core::RpcRequest req; 78 | req.head().sid = from; 79 | bool is_main = from == _comm_rank; 80 | req << BOARD_CAST << name << num << is_main; 81 | if (is_main) { 82 | req << ar; 83 | } 84 | 85 | std::shared_ptr dealer = _dealer.acquire(); 86 | core::RpcResponse resp = 
dealer->sync_rpc_call(std::move(req)); 87 | _dealer.release(std::move(dealer)); 88 | if (!is_main) { 89 | resp >> ar; 90 | } 91 | } 92 | 93 | void Communication::serving() { 94 | core::RpcRequest req; 95 | std::shared_ptr dealer = _rpc_server->create_dealer(); 96 | while (dealer->recv_request(req)) { 97 | uint32_t req_type; 98 | req >> req_type; 99 | if (req_type == BOARD_CAST) { 100 | std::string name; 101 | req >> name; 102 | uint32_t num; 103 | req >> num; 104 | auto& reqs = _reqs[name]; 105 | reqs.push_back(std::move(req)); 106 | if (reqs.size() >= num) { 107 | SCHECK(reqs.size() == num) << "error barrier node num!"; 108 | core::BinaryArchive ar; 109 | for (core::RpcRequest& req1: reqs) { 110 | bool is_main; 111 | req1 >> is_main; 112 | if (is_main) { 113 | req1 >> ar; 114 | } 115 | } 116 | for (core::RpcRequest& req1: reqs) { 117 | core::RpcResponse resp(req1); 118 | resp << ar; 119 | dealer->send_response(std::move(resp)); 120 | } 121 | _reqs.erase(name); 122 | } 123 | } else if (req_type == LOAD_MODEL_SIGN) { 124 | std::string model_sign; 125 | req >> model_sign; 126 | core::RpcResponse resp(req); 127 | resp << (_model_sign != model_sign); 128 | _model_sign = model_sign; 129 | dealer->send_response(std::move(resp)); 130 | } else if (req_type == BARRIER) { 131 | std::string name; 132 | req >> name; 133 | uint32_t num; 134 | req >> num; 135 | auto& reqs = _barriers[name]; 136 | reqs.push_back(std::move(req)); 137 | if (reqs.size() >= num) { 138 | SCHECK(reqs.size() == num) << "error barrier node num: " << reqs.size() << ' ' << num; 139 | int32_t fast_comm_rank = -1; 140 | for (core::RpcRequest& req1: reqs) { 141 | if (fast_comm_rank == -1) { 142 | req1 >> fast_comm_rank; 143 | } 144 | core::RpcResponse resp(req1); 145 | resp << fast_comm_rank; 146 | dealer->send_response(std::move(resp)); 147 | } 148 | _barriers.erase(name); 149 | } 150 | } 151 | } 152 | } 153 | 154 | 155 | } 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /openembedding/client/Communication.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_COMMUNICATION_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_COMMUNICATION_H 3 | 4 | #include 5 | #include "ObjectPool.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | // Here comm_rank is in [0, comm_size], and the corresponding rpc global_rank can be any value. 
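// Communication (below) implements three small collectives on top of the pico
// RPC layer. barrier(name) blocks until all comm_size ranks arrive and returns
// the rank that reached the barrier first; boardcast(name, value, from) ships a
// BinaryArchive from rank `from` to every other rank; load_model_sign() returns
// true only for the first rank to register a new model sign, so exactly one
// worker ends up loading each model version. Every rank runs a serving() thread,
// and barrier requests for a given name all rendezvous at rank hash(name) % comm_size.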
12 | class Communication { 13 | enum reqs { 14 | BOARD_CAST = 0, 15 | LOAD_MODEL_SIGN = 1, 16 | BARRIER = 2, 17 | }; 18 | public: 19 | Communication(); 20 | Communication(core::RpcService* rpc, int32_t comm_size, std::string rpc_name = "sync_runner_rpc_api"); 21 | 22 | ~Communication(); 23 | 24 | int32_t comm_rank() { 25 | return _comm_rank; 26 | } 27 | 28 | int32_t comm_size() { 29 | return _comm_size; 30 | } 31 | 32 | comm_rank_t barrier(std::string name); 33 | 34 | template 35 | auto sync_bcast(const std::string& name, Fn fn) { 36 | comm_rank_t from = barrier(name); 37 | decltype(fn()) result; 38 | if (_comm_rank == from) { 39 | result = fn(); 40 | } 41 | boardcast(name, result, from); 42 | return result; 43 | } 44 | 45 | template 46 | void boardcast(std::string name, T& value, comm_rank_t from) { 47 | core::BinaryArchive ar; 48 | ar << value; 49 | inner_boardcast(name, ar, from); 50 | ar >> value; 51 | } 52 | 53 | bool load_model_sign(const std::string& model_sign); 54 | 55 | private: 56 | void serving(); 57 | 58 | void inner_boardcast(std::string name, core::BinaryArchive& ar, comm_rank_t from); 59 | 60 | core::RpcService* _rpc = nullptr; 61 | int32_t _comm_size = 0; 62 | std::string _rpc_name; 63 | 64 | int32_t _comm_rank = 0; 65 | 66 | std::string _model_sign; 67 | std::thread _serving_th; 68 | std::unique_ptr _rpc_server; 69 | std::unique_ptr _rpc_client; 70 | ObjectPool> _dealer; 71 | std::unordered_map> _reqs; 72 | std::unordered_map> _barriers; 73 | }; 74 | 75 | } 76 | } 77 | } 78 | 79 | #endif -------------------------------------------------------------------------------- /openembedding/client/Connection.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_CONNECTION_H 2 | #define PARADIGM4_HYPEREMBEDDING_CONNECTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "Meta.h" 13 | #include "EmbeddingVariableHandle.h" 14 | #include "EnvConfig.h" 15 | 16 | namespace paradigm4 { 17 | namespace pico { 18 | namespace embedding { 19 | 20 | class Connection { 21 | public: 22 | virtual ~Connection() {}; 23 | virtual comm_rank_t global_rank() = 0; 24 | virtual ps::Context* server_context() = 0; 25 | virtual std::vector running_servers() = 0; 26 | virtual ps::Status close_server(int32_t server_id) = 0; 27 | virtual void close_servers() = 0; 28 | virtual ps::Status create_storage(const std::map>& node_shards, int32_t& storage_id) = 0; 29 | virtual ps::Status delete_storage(int32_t storage_id) = 0; 30 | virtual ps::Status set_storage_restore_uri(int32_t storage_id, const core::URIConfig& uri) = 0; 31 | 32 | virtual ps::Status create_storage_handler(int32_t storage_id, std::unique_ptr&) = 0; 33 | virtual uint32_t generate_id(const std::string&) = 0; 34 | virtual ps::Status pull_model_meta(const std::string& model_sign, ModelMeta& model_meta) = 0; 35 | 36 | 37 | virtual const EnvConfig& env_config()const = 0; 38 | 39 | void set_default_hadoop_bin(core::URIConfig& uri); 40 | 41 | protected: 42 | ps::Status create_operator(int32_t storage_id, const std::string& key, 43 | int32_t& handler_id, std::shared_ptr& op); 44 | }; 45 | 46 | 47 | class RpcConnection: public Connection { 48 | public: 49 | RpcConnection(const EnvConfig& env); 50 | 51 | ~RpcConnection() override; 52 | 53 | comm_rank_t global_rank() override { 54 | return _rpc->global_rank(); 55 | } 56 | 57 | std::unique_ptr create_server(); 58 | 59 | std::unique_ptr create_controller(); 60 | 61 | ps::Context* 
server_context() override { 62 | return _client->context().get(); 63 | } 64 | 65 | std::vector running_servers()override; 66 | 67 | ps::Status close_server(int32_t server_id)override; 68 | 69 | void close_servers() override; 70 | 71 | ps::Status create_storage(const std::map>& node_shards, int32_t& storage_id)override; 72 | 73 | ps::Status delete_storage(int32_t storage_id)override; 74 | 75 | ps::Status create_storage_handler(int32_t storage_id, std::unique_ptr& storage)override; 76 | 77 | ps::Status set_storage_restore_uri(int32_t storage_id, const core::URIConfig& uri); 78 | 79 | uint32_t generate_id(const std::string& name); 80 | 81 | ps::Status pull_model_meta(const std::string& model_sign, ModelMeta& model_meta)override; 82 | 83 | ps::Status push_model_meta(const ModelMeta& model_meta); 84 | 85 | ps::Status update_model_meta(const ModelMeta& model_meta); 86 | 87 | ps::Status delete_model_meta(const std::string& model_sign); 88 | 89 | std::vector list_model(); 90 | 91 | bool try_lock_model(const std::string& model_sign); 92 | 93 | void unlock_model(const std::string& model_sign); 94 | 95 | const EnvConfig& env_config()const override { 96 | return _env; 97 | } 98 | 99 | core::RpcService* rpc()const { 100 | return _rpc.get(); 101 | } 102 | 103 | core::MasterClient* master_client()const { 104 | return _master_client.get(); 105 | } 106 | 107 | private: 108 | template 109 | ps::Status create_handler(int32_t storage_id, const std::string& key, std::unique_ptr& handler) { 110 | int32_t handler_id = -1; 111 | std::shared_ptr op; 112 | CHECK_STATUS_RETURN(create_operator(storage_id, key, handler_id, op)); 113 | handler = std::make_unique(storage_id, handler_id, op, _client.get()); 114 | return ps::Status(); 115 | } 116 | 117 | template 118 | void create_handler_pool(int32_t storage_id, const std::string& key, 119 | ObjectPool>& handler_pool) { 120 | handler_pool = [this, storage_id, key]() { 121 | std::unique_ptr handler; 122 | ps::Status status = create_handler(storage_id, key, handler); 123 | if (!status.ok()) { 124 | SLOG(WARNING) << key << " " << status.ToString(); 125 | } 126 | return handler; 127 | }; 128 | } 129 | 130 | std::string _model_path = "_hyper-embedding-model_"; 131 | std::string _model_lock_path = "_hyper-embedding-model-lock_"; 132 | std::unique_ptr _rpc; 133 | std::unique_ptr _master_client; 134 | std::unique_ptr _rpc_client; 135 | std::unique_ptr _client; 136 | EnvConfig _env; 137 | }; 138 | 139 | } 140 | } 141 | } 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /openembedding/client/EmbeddingVariableHandle.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_HANDLE_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_HANDLE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Meta.h" 10 | #include "ObjectPool.h" 11 | 12 | #include "EmbeddingPullOperator.h" 13 | #include "EmbeddingPushOperator.h" 14 | #include "EmbeddingLoadOperator.h" 15 | #include "EmbeddingDumpOperator.h" 16 | #include "EmbeddingStoreOperator.h" 17 | 18 | namespace paradigm4 { 19 | namespace pico { 20 | namespace embedding { 21 | 22 | struct HandlerWaiter { 23 | public: 24 | template 25 | HandlerWaiter(F&& waiter): _waiter(std::forward(waiter)) { 26 | SCHECK(_waiter); 27 | } 28 | 29 | ~HandlerWaiter() { 30 | SCHECK(_wait_called); 31 | } 32 | 33 | HandlerWaiter(const HandlerWaiter&) = delete; 34 | HandlerWaiter& operator=(const 
HandlerWaiter&) = delete; 35 | 36 | HandlerWaiter(HandlerWaiter&& other) { 37 | _wait_called = other._wait_called; 38 | _waiter = std::move(other._waiter); 39 | other._wait_called = true; 40 | } 41 | 42 | ps::Status wait(void* result = nullptr) { 43 | _wait_called = true; 44 | return _waiter(result); 45 | } 46 | 47 | private: 48 | bool _wait_called = false; 49 | std::function<ps::Status(void*)> _waiter; 50 | }; 51 | 52 | // Not a handler itself, just a lightweight handle to a storage handler. 53 | class EmbeddingVariableHandle { 54 | public: 55 | // weights are laid out as n * embedding_dim 56 | const EmbeddingVariableMeta& meta()const { 57 | return _meta; 58 | } 59 | 60 | uint32_t variable_id()const { 61 | return _variable_id; 62 | } 63 | 64 | HandlerWaiter init_config(const core::Configure& config)const; 65 | 66 | // predictor controller 67 | HandlerWaiter clear_weights(); 68 | 69 | // predictor controller 70 | HandlerWaiter pull_weights(const uint64_t* indices, size_t n, int64_t batch_id)const; 71 | 72 | HandlerWaiter push_gradients(const uint64_t* indices, size_t n, const char* gradients)const; 73 | 74 | int _timeout = -1; 75 | bool _read_only = false; 76 | uint32_t _variable_id = 0; 77 | EmbeddingVariableMeta _meta; 78 | 79 | ObjectPool>* _read_only_pull_handler = nullptr; 80 | ObjectPool>* _pull_handler = nullptr; 81 | ObjectPool>* _push_handler = nullptr; 82 | ObjectPool>* _init_handler = nullptr; 83 | 84 | std::atomic<bool>* _should_persist = nullptr; 85 | }; 86 | 87 | class EmbeddingStorageHandler { 88 | public: 89 | EmbeddingStorageHandler() {} 90 | EmbeddingStorageHandler(const EmbeddingStorageHandler&) = delete; 91 | EmbeddingStorageHandler& operator=(const EmbeddingStorageHandler&) = delete; 92 | 93 | EmbeddingStorageHandler(EmbeddingStorageHandler&&) = default; 94 | EmbeddingStorageHandler& operator=(EmbeddingStorageHandler&&) = default; 95 | 96 | EmbeddingVariableHandle variable(uint32_t variable_id, EmbeddingVariableMeta meta); 97 | 98 | HandlerWaiter update_weights(); 99 | 100 | // predictor controller 101 | HandlerWaiter load_storage(const URIConfig& uri, size_t server_concurrency = 4); 102 | 103 | // predictor controller 104 | HandlerWaiter dump_storage(const URIConfig& uri, size_t file_number); 105 | 106 | int _timeout = -1; 107 | ObjectPool> _read_only_pull_handler; 108 | ObjectPool> _pull_handler; 109 | ObjectPool> _push_handler; 110 | ObjectPool> _store_handler; 111 | ObjectPool> _init_handler; 112 | 113 | ObjectPool> _load_handler; 114 | ObjectPool> _dump_handler; 115 | }; 116 | 117 | 118 | } 119 | } 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /openembedding/client/EnvConfig.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EXB_ENV_CONFIG_H 2 | #define PARADIGM4_HYPEREMBEDDING_EXB_ENV_CONFIG_H 3 | 4 | #include 5 | #include 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | using core::ConfigNode; 12 | using core::ConfigUnit; 13 | 14 | #ifdef USE_RDMA 15 | DECLARE_CONFIG(RdmaConfig, ConfigNode) { 16 | PICO_CONFIGURE_DECLARE(std::string, ib_devname); 17 | PICO_CONFIGURE_DECLARE(int, gid_index); 18 | PICO_CONFIGURE_DECLARE(int, ib_port); 19 | PICO_CONFIGURE_DECLARE(int, traffic_class); 20 | PICO_CONFIGURE_DECLARE(int, sl); 21 | PICO_CONFIGURE_DECLARE(int, mtu); 22 | PICO_CONFIGURE_DECLARE(int, pkey_index); 23 | PICO_CONFIGURE_DECLARE(int, min_rnr_timer); 24 | PICO_CONFIGURE_DECLARE(int, retry_cnt); 25 | PICO_CONFIGURE_DECLARE(int, timeout); 26 | }; 27 | #endif 28 | 29 |
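// A hedged sketch of the YAML shape implied by the config nodes declared in this
// file (the key names are the PICO_CONFIGURE_DECLARE fields declared below; the
// values shown are illustrative assumptions, not documented defaults):
//
//   rpc:
//     bind_ip: "0.0.0.0"
//     io_thread_num: 4
//     protocol: "tcp"
//     tcp: { keepalive_time: 60, keepalive_intvl: 10, keepalive_probes: 5, connect_timeout: 10 }
//   master:
//     endpoint: "127.0.0.1:9394"
//     recv_timeout: 30
//   server:
//     server_concurrency: 4
//     message_compress: "snappy"
//     update_early_return: true
//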
DECLARE_CONFIG(TcpConfig, ConfigNode) { 30 | PICO_CONFIGURE_DECLARE(int, keepalive_time); 31 | PICO_CONFIGURE_DECLARE(int, keepalive_intvl); 32 | PICO_CONFIGURE_DECLARE(int, keepalive_probes); 33 | PICO_CONFIGURE_DECLARE(int, connect_timeout); 34 | }; 35 | 36 | DECLARE_CONFIG(RpcConfig, ConfigNode) { 37 | PICO_CONFIGURE_DECLARE(std::string, bind_ip); 38 | PICO_CONFIGURE_DECLARE(size_t, io_thread_num); 39 | PICO_CONFIGURE_DECLARE(std::string, protocol); 40 | #ifdef USE_RDMA 41 | PICO_CONFIGURE_DECLARE(RdmaConfig, rdma); 42 | #endif 43 | PICO_CONFIGURE_DECLARE(TcpConfig, tcp); 44 | }; 45 | 46 | DECLARE_CONFIG(MasterConfig, ConfigNode) { 47 | PICO_CONFIGURE_DECLARE(std::string, endpoint); 48 | PICO_CONFIGURE_DECLARE(std::string, type); 49 | PICO_CONFIGURE_DECLARE(std::string, root_path); 50 | PICO_CONFIGURE_DECLARE(size_t, recv_timeout); 51 | PICO_CONFIGURE_DECLARE(size_t, cache_timeout); 52 | }; 53 | 54 | DECLARE_CONFIG(ServerConfig, ConfigNode) { 55 | PICO_CONFIGURE_DECLARE(std::string, pmem_pool_root_path); 56 | PICO_CONFIGURE_DECLARE(size_t, cache_size); 57 | PICO_CONFIGURE_DECLARE(std::string, message_compress); 58 | PICO_CONFIGURE_DECLARE(size_t, server_dump_files); 59 | PICO_CONFIGURE_DECLARE(int, server_concurrency); 60 | PICO_CONFIGURE_DECLARE(int, recv_timeout); 61 | PICO_CONFIGURE_DECLARE(int, report_interval); 62 | PICO_CONFIGURE_DECLARE(bool, update_early_return); 63 | }; 64 | 65 | class EnvConfig: public ConfigNode { 66 | // client server shared 67 | // default shard_num = server_concurrency * server_num 68 | // PICO_CONFIGURE_DECLARE(size_t, max_request_merge_num); // pull push 69 | public: 70 | PICO_CONFIGURE_DECLARE(RpcConfig, rpc); 71 | PICO_CONFIGURE_DECLARE(MasterConfig, master); 72 | PICO_CONFIGURE_DECLARE(ServerConfig, server); 73 | public: 74 | 75 | void load_yaml(const core::Configure& configure, const std::string& master_endpoint = "", const std::string& rpc_bind_ip = "") { 76 | SCHECK(load_config(configure)); 77 | if (!master_endpoint.empty()) { 78 | master.endpoint = master_endpoint; 79 | } 80 | if (!rpc_bind_ip.empty()) { 81 | rpc.bind_ip = rpc_bind_ip; 82 | } 83 | } 84 | }; 85 | 86 | 87 | } 88 | } 89 | } 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /openembedding/client/Model.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_H 3 | 4 | #include "Meta.h" 5 | #include "Connection.h" 6 | #include "EmbeddingVariableHandle.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class Model { 13 | public: 14 | Model(Connection* connection): _conn(connection) {} 15 | 16 | const ModelMeta& model_meta() { 17 | return _model_meta; 18 | } 19 | 20 | void set_model_status(ps::ModelStatus model_status); 21 | 22 | ps::Status test_status(const ps::Status& status); 23 | 24 | ps::Status update_model_meta(const ModelMeta& model_meta); 25 | 26 | ps::Status add_storage(int32_t storage_id, std::string storage_name); 27 | 28 | ps::Status add_variable(const ModelVariableMeta& variable); 29 | 30 | ps::Status access_storage(int32_t storage_id, EmbeddingStorageHandler*& storage)const; 31 | 32 | ps::Status access_variable(uint32_t variable_id, EmbeddingVariableHandle& handle)const; 33 | 34 | ps::Status dump_model(core::URIConfig uri, std::string model_sign, size_t num_files)const; 35 | 36 | ps::Status load_model(core::URIConfig uri); 37 | 38 | ps::Status 
load_model(); 39 | 40 | ps::Status create_model(core::URIConfig uri); 41 | 42 | ps::Status create_model_storages(int32_t replica_num, int32_t shard_num = -1); 43 | 44 | void delete_model_storages(); 45 | 46 | static ps::Status read_meta_file(const core::URIConfig& uri, ModelOfflineMeta& model_meta); 47 | 48 | private: 49 | Connection* _conn = nullptr; 50 | ModelMeta _model_meta; 51 | // The storage file name is the rank of its storage_id among all storages of this model, in sorted order. 52 | std::unordered_map<int32_t, std::unique_ptr<EmbeddingStorageHandler>> _storages; 53 | }; 54 | 55 | } 56 | } 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /openembedding/client/ModelController.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_CONTROLLER_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_MODEL_CONTROLLER_H 3 | 4 | #include "Model.h" 5 | 6 | namespace paradigm4 { 7 | namespace pico { 8 | namespace embedding { 9 | 10 | // for predictor 11 | class ModelManager { 12 | public: 13 | ModelManager(Connection* connection): _conn(connection) {} 14 | 15 | /// TODO: Cache pull_model_meta with timeout. 16 | // Predictor pull handlers may have different timeout requirements. 17 | ps::Status find_model_variable(const std::string& model_sign, uint32_t variable_id, 18 | std::shared_ptr<Model>& out, EmbeddingVariableHandle& handle, int timeout = -1); 19 | 20 | private: 21 | core::RWSpinLock _lock; 22 | Connection* _conn = nullptr; 23 | std::unordered_map<std::string, std::shared_ptr<Model>> _models; 24 | }; 25 | 26 | 27 | // for controller, all heavy methods are async 28 | class ModelController { 29 | public: 30 | ModelController(RpcConnection* connection): _conn(connection), 31 | _threads(_conn->env_config().server.server_concurrency) {} 32 | 33 | ps::Status create_model(const core::URIConfig& model_uri, 34 | std::string& model_sign, core::PicoJsonNode& result, int32_t replica_num, int32_t shard_num); 35 | 36 | ps::Status delete_model(const std::string& model_sign); 37 | 38 | ps::Status show_model(const std::string& model_sign, core::PicoJsonNode& result); 39 | 40 | ps::Status show_models(core::PicoJsonNode& result); 41 | 42 | ps::Status show_node(int32_t node_id, core::PicoJsonNode& result); 43 | 44 | ps::Status show_nodes(core::PicoJsonNode& result); 45 | 46 | ps::Status shutdown_node(int32_t node_id); 47 | private: 48 | RpcConnection* _conn = nullptr; 49 | ThreadGroup _threads; 50 | }; 51 | 52 | } 53 | } 54 | } 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /openembedding/client/ObjectPool.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_OBJECT_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_OBJECT_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "EmbeddingVariable.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | 15 | template <class T> 16 | class ObjectPool { 17 | public: 18 | ObjectPool() {} 19 | ObjectPool(ObjectPool&&) = default; 20 | ObjectPool& operator=(ObjectPool&&) = default; 21 | ObjectPool& operator=(std::function<T()> initializer) { 22 | SCHECK(_initializer == nullptr); 23 | _initializer = initializer; 24 | return *this; 25 | } 26 | 27 | T acquire() { 28 | core::lock_guard lk(*_lock); 29 | if (_pool.empty()) { 30 | if (_initializer) { 31 | return _initializer(); 32 | } else { 33 | return nullptr; 34 | } 35 | } else { 36 | T p = std::move(_pool.back()); 37 | _pool.pop_back(); 38 |
return p; 39 | } 40 | } 41 | 42 | void release(T&& p) { 43 | core::lock_guard lk(*_lock); 44 | _pool.push_back(std::move(p)); 45 | } 46 | 47 | void clear() { 48 | core::lock_guard lk(*_lock); 49 | _pool.clear(); 50 | } 51 | 52 | std::unique_ptr _lock = std::make_unique(); 53 | std::function<T()> _initializer; 54 | std::deque<T> _pool; 55 | }; 56 | 57 | 58 | 59 | } 60 | } 61 | } 62 | 63 | 64 | #endif -------------------------------------------------------------------------------- /openembedding/client/WorkerContext.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_WORKER_CONTEXT_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_WORKER_CONTEXT_H 3 | 4 | #include 5 | #include "Connection.h" 6 | #include "Communication.h" 7 | #include "EmbeddingVariableHandle.h" 8 | #include "Model.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | class WorkerContext { 15 | public: 16 | WorkerContext(RpcConnection* connection, 17 | int32_t worker_num, int32_t wait_server_num = -1); 18 | 19 | ~WorkerContext(); 20 | 21 | int32_t create_storage(int32_t shard_num = -1); 22 | 23 | void delete_storage(int32_t storage_id); 24 | 25 | EmbeddingVariableHandle create_variable(int32_t storage_id, const EmbeddingVariableMeta& meta); 26 | 27 | HandlerWaiter update_weights(int32_t storage_id); 28 | 29 | int32_t worker_rank()const { 30 | return _comm->comm_rank(); 31 | } 32 | 33 | int32_t worker_num()const { 34 | return _comm->comm_size(); 35 | } 36 | 37 | Connection* connection()const { 38 | return _conn; 39 | } 40 | 41 | void load_model(const core::URIConfig& uri)const; 42 | 43 | void dump_model(const core::URIConfig& uri, const std::string& model_sign); 44 | 45 | void barrier(const std::string& key) { 46 | _comm->barrier(key); 47 | } 48 | 49 | template <class T> 50 | void boardcast(const std::string& key, T& value) { 51 | _comm->boardcast(key, value, 0); 52 | } 53 | 54 | void report_accumulator(); 55 | 56 | std::atomic<bool> should_persist = {false}; 57 | 58 | private: 59 | core::RWSpinLock _lock; 60 | Connection* _conn; 61 | std::unique_ptr<Communication> _comm; 62 | std::unique_ptr _server; 63 | 64 | std::unique_ptr<Model> _model; 65 | 66 | ServerConfig _server_config; 67 | 68 | bool _reporter = false; 69 | size_t _report_monitor = 0; 70 | }; 71 | 72 | } 73 | } 74 | } 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /openembedding/entry/c_api.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_EXB_CAPI_H 3 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_EXB_CAPI_H 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #include 10 | #include 11 | 12 | struct exb_connection; 13 | 14 | struct exb_master; 15 | struct exb_server; 16 | struct exb_configure; 17 | struct exb_context; 18 | struct exb_storage; 19 | struct exb_variable; 20 | struct exb_optimizer; 21 | struct exb_initializer; 22 | struct exb_pull_waiter; 23 | struct exb_waiter; 24 | struct exb_channel; 25 | struct exb_mutex { 26 | int64_t data[16]; 27 | }; 28 | struct exb_string { 29 | char data[128]; 30 | }; 31 | 32 | struct exb_connection* exb_serving(); 33 | // TCP configuration should be consistent across all connections. 34 | // Opening multiple connections at the same time may cause unknown problems. 35 | // wait_server_num = -1 means each worker process starts its own parameter server.
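//
// Hedged usage sketch of a minimal training flow (every function named below is
// declared in this header; the call order, and placeholder variables such as
// master_endpoint, indices, n, batch_id, weights and gradients, are our
// assumptions, not a documented recipe):
//
//   struct exb_connection* conn = exb_connect("", master_endpoint, "");
//   struct exb_context* ctx = exb_context_initialize(conn, /*worker_num=*/1);
//   struct exb_storage* storage = exb_create_storage(ctx);
//   struct exb_variable* var = exb_create_variable(storage, /*vocabulary_size=*/1000000, /*embedding_dim=*/64);
//   struct exb_pull_waiter* pull = exb_pull_weights(var, indices, n, batch_id);
//   exb_pull_wait(pull, indices, n, weights);
//   /* ... forward/backward pass computes gradients ... */
//   exb_wait(exb_push_gradients(var, indices, n, gradients));
//   exb_wait(exb_update_weights(storage));
//   exb_context_finalize(ctx);
//   exb_disconnect(conn);
//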
36 | struct exb_connection* exb_connect(const char* yaml_config, 37 | const char* master_endpoint, const char* rpc_bind_ip = ""); 38 | 39 | // thread local 40 | const char* exb_last_error(); 41 | 42 | int exb_last_wait_time_ms(); 43 | 44 | int exb_running_server_count(struct exb_connection*); 45 | 46 | void exb_disconnect(struct exb_connection*); 47 | 48 | struct exb_master* exb_master_start(const char* bind_ip = ""); 49 | 50 | void exb_master_endpoint(struct exb_master*, exb_string* value); 51 | 52 | void exb_master_join(struct exb_master*); // destroy 53 | 54 | struct exb_server* exb_server_start(struct exb_connection*); 55 | 56 | void exb_server_exit(struct exb_server*); 57 | 58 | void exb_server_join(struct exb_server*); // destroy 59 | 60 | struct exb_context* exb_context_initialize(struct exb_connection*, 61 | int32_t worker_num, int32_t wait_server_num = -1); 62 | 63 | void exb_context_finalize(struct exb_context*); 64 | 65 | int exb_worker_rank(struct exb_context*); 66 | 67 | struct exb_storage* exb_create_storage(struct exb_context*, int32_t shard_num = -1); 68 | 69 | void exb_delete_storage(struct exb_storage*); 70 | 71 | struct exb_variable* exb_create_variable(struct exb_storage*, 72 | uint64_t vocabulary_size, size_t embedding_dim, const char* dtype = "float32"); 73 | 74 | int32_t exb_storage_id(struct exb_storage*); 75 | 76 | uint32_t exb_variable_id(struct exb_variable*); 77 | 78 | void exb_set_initializer(struct exb_variable*, struct exb_initializer*); 79 | 80 | void exb_set_optimizer(struct exb_variable*, struct exb_optimizer*); 81 | 82 | size_t exb_unique_indices(const uint64_t* indices, size_t n, size_t* unique); 83 | 84 | struct exb_pull_waiter* exb_pull_weights(const struct exb_variable*, 85 | const uint64_t* indices, size_t n, int64_t batch_id); 86 | 87 | struct exb_waiter* exb_push_gradients(struct exb_variable*, 88 | const uint64_t* indices, size_t n, const void* gradients); 89 | 90 | struct exb_waiter* exb_update_weights(struct exb_storage*); 91 | 92 | bool exb_pull_wait(struct exb_pull_waiter*, const uint64_t* indices, size_t n, void* weights); 93 | 94 | bool exb_wait(struct exb_waiter*); 95 | 96 | struct exb_optimizer* exb_create_optimizer(const char* category); 97 | 98 | void exb_set_optimizer_property(struct exb_optimizer*, const char* key, const char* value); 99 | 100 | struct exb_initializer* exb_create_initializer(const char* category); 101 | 102 | void exb_set_initializer_property(struct exb_initializer*, const char* key, const char* value); 103 | 104 | const char* exb_version(); 105 | 106 | void exb_dump_model_include_optimizer(struct exb_context*, const char* path, const char* model_sign); 107 | 108 | void exb_dump_model(struct exb_context*, const char* path, const char* model_sign); 109 | 110 | void exb_load_model(struct exb_context*, const char* path); 111 | 112 | void exb_create_model(struct exb_connection*, const char* path, int32_t replica_num, int32_t shard_num = -1); 113 | 114 | struct exb_variable* exb_get_model_variable(struct exb_connection*, const char* model_sign, int32_t variable_id, int pull_timeout = -1); 115 | 116 | void exb_release_model_variable(struct exb_variable*); 117 | 118 | void exb_barrier(struct exb_context*, const char* name, exb_string* value = NULL); 119 | 120 | void exb_start_monitor(struct exb_context*); 121 | 122 | struct exb_channel* exb_channel_create(); 123 | void exb_channel_delete(struct exb_channel*); 124 | void exb_channel_close(struct exb_channel*); 125 | void exb_channel_write(struct exb_channel*, void*); 126 | 
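// Hedged producer/consumer sketch for the channel API (blocking semantics are
// inferred from its use in openembedding/tensorflow/ThreadPool.h: exb_channel_read
// is assumed to block for the next item and to return false once the channel has
// been closed):
//
//   struct exb_channel* ch = exb_channel_create();
//   exb_channel_write(ch, job);                           /* producer thread */
//   void* p;
//   while (exb_channel_read(ch, &p)) { /* consume p */ }  /* consumer thread */
//   exb_channel_close(ch);                                /* unblocks readers */
//   exb_channel_delete(ch);
//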
bool exb_channel_read(struct exb_channel*, void**); 127 | 128 | void exb_mutex_lock(struct exb_mutex*); 129 | void exb_mutex_unlock(struct exb_mutex*); 130 | void exb_mutex_lock_shared(struct exb_mutex*); 131 | void exb_mutex_unlock_shared(struct exb_mutex*); 132 | void exb_mutex_upgrade(struct exb_mutex*); 133 | void exb_mutex_downgrade(struct exb_mutex*); 134 | 135 | void* exb_malloc(size_t size); 136 | void exb_free(void* p); 137 | 138 | void exb_info(const char* message); 139 | void exb_warning(const char* message); 140 | void exb_fatal(const char* message); 141 | 142 | bool exb_should_persist_model(struct exb_context*); 143 | 144 | void exb_persist_model(struct exb_context*, const char* path, const char* model_sign, size_t persist_pending_window); 145 | void exb_restore_model(struct exb_context*, const char* path); 146 | 147 | #ifdef __cplusplus 148 | } 149 | #endif 150 | 151 | #endif 152 | -------------------------------------------------------------------------------- /openembedding/entry/c_api_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "c_api_test.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | TEST(c_api, model_mix) { 12 | c_api_threads(1, 1, 1, 10, true); 13 | c_api_threads(1, 15, 5, 10, true); 14 | c_api_threads(2, 10, 5, 10, true); 15 | c_api_threads(3, 8, 5, 10, true); 16 | c_api_threads(4, 6, 5, 10, true); 17 | 18 | c_api_threads(1, 1, 1, 100, true); 19 | c_api_threads(2, 10, 5, 100, true); 20 | } 21 | 22 | TEST(c_api, model_shard_num) { 23 | c_api_threads(1, 3, 1, 10, true, 1); 24 | c_api_threads(1, 3, 5, 10, true, 3); 25 | c_api_threads(3, 3, 1, 10, true, 7); 26 | c_api_threads(5, 2, 1, 10, true, 111); 27 | c_api_threads(8, 2, 2, 10, true, 256); 28 | } 29 | 30 | TEST(c_api, pull_push) { 31 | for (size_t i = 1; i < 10; ++i) { 32 | c_api_pull_push(i, 100, 128, false); 33 | c_api_pull_push(i, 100000, 1, false); 34 | c_api_pull_push(i, 100000, 8, false); 35 | c_api_pull_push(i, 100000, 1, true); 36 | c_api_pull_push(i, 100000, 16, true); 37 | } 38 | } 39 | 40 | TEST(c_api, one) { 41 | c_api_threads(1, 1, 1, 1000); 42 | c_api_threads(3, 1, 1, 1000); 43 | c_api_threads(5, 1, 1, 1000); 44 | c_api_threads(8, 1, 1, 1000); 45 | } 46 | 47 | TEST(c_api, trd) { 48 | c_api_threads(1, 3, 1, 300); 49 | c_api_threads(2, 3, 1, 300); 50 | c_api_threads(3, 3, 1, 300); 51 | c_api_threads(4, 3, 1, 300); 52 | } 53 | 54 | TEST(c_api, mix) { 55 | for (int node_num = 1; node_num < 9; ++node_num) { 56 | c_api_threads(node_num, 20, 5, 100, false, node_num * node_num); 57 | } 58 | } 59 | 60 | TEST(c_api, rep) { 61 | for (int i = 0; i < 3; ++i) { 62 | c_api_threads(2, 7, 2, 300); 63 | c_api_threads(3, 5, 2, 300); 64 | c_api_threads(4, 3, 3, 300); 65 | } 66 | } 67 | 68 | } 69 | } 70 | } 71 | 72 | 73 | int main(int argc, char* argv[]) { 74 | testing::InitGoogleTest(&argc, argv); 75 | int ret = RUN_ALL_TESTS(); 76 | return ret; 77 | } 78 | -------------------------------------------------------------------------------- /openembedding/entry/controller.proto: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. 
The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, 12 | // software distributed under the License is distributed on an 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | // KIND, either express or implied. See the License for the 15 | // specific language governing permissions and limitations 16 | // under the License. 17 | 18 | syntax="proto2"; 19 | package exb; 20 | 21 | option cc_generic_services = true; 22 | 23 | message HttpRequest {}; 24 | message HttpResponse {}; 25 | 26 | service models { 27 | rpc default_method(HttpRequest) returns (HttpResponse); 28 | }; 29 | 30 | service nodes { 31 | rpc default_method(HttpRequest) returns (HttpResponse); 32 | }; 33 | -------------------------------------------------------------------------------- /openembedding/entry/masterd.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | int main(int argc, char* argv[]) { 7 | google::InstallFailureSignalHandler(); 8 | google::InitGoogleLogging(argv[0]); 9 | FLAGS_logtostderr = 1; 10 | google::AllowCommandLineReparsing(); 11 | google::ParseCommandLineFlags(&argc, &argv, false); 12 | 13 | paradigm4::pico::core::LogReporter::set_id("MASTER", 0); 14 | paradigm4::pico::core::Master master(""); 15 | 16 | master.initialize(); 17 | master.finalize(); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /openembedding/entry/server.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "Connection.h" 4 | #include "c_api.h" 5 | #include 6 | 7 | DEFINE_bool(enable_metrics, false, "enable/disable metrics"); 8 | DEFINE_string(service_name, "service_name", "service name of this binary"); 9 | DEFINE_string(instance_name, "instance_name", "instance name of this binary"); 10 | DEFINE_string(metrics_ip, "0.0.0.0", "Binding IP of the metrics exposer"); 11 | DEFINE_int32(metrics_port, 8001, "TCP port of the metrics exposer"); 12 | DEFINE_string(metrics_url, "/metrics", "URL of the metrics exposer"); 13 | 14 | DEFINE_bool(restore, true, "is replace one dead node"); // try replace one dead node 15 | 16 | DEFINE_string(config, "", ""); 17 | DEFINE_string(config_file, "", ""); 18 | DEFINE_string(rpc_bind_ip, "", ""); 19 | DEFINE_string(master_endpoint, "", ""); 20 | 21 | 22 | using namespace paradigm4::pico; 23 | using namespace paradigm4::pico::ps; 24 | 25 | int main(int argc, char* argv[]) { 26 | // exb_serving(); // Import registered optimizer. 
27 | google::InstallFailureSignalHandler(); 28 | google::InitGoogleLogging(argv[0]); 29 | FLAGS_logtostderr = 1; 30 | google::AllowCommandLineReparsing(); 31 | google::ParseCommandLineFlags(&argc, &argv, false); 32 | 33 | paradigm4::pico::core::Memory::singleton().initialize(); 34 | 35 | paradigm4::pico::metrics_initialize(FLAGS_metrics_ip, FLAGS_metrics_port, FLAGS_metrics_url, 36 | FLAGS_service_name, FLAGS_instance_name, FLAGS_enable_metrics); 37 | 38 | paradigm4::pico::embedding::EnvConfig env; 39 | paradigm4::pico::core::Configure configure; 40 | if (!FLAGS_config.empty()) { // --config holds inline YAML; otherwise fall back to --config_file 41 | configure.load(FLAGS_config); 42 | } else { 43 | configure.load_file(FLAGS_config_file); 44 | } 45 | env.load_yaml(configure, FLAGS_master_endpoint, FLAGS_rpc_bind_ip); 46 | paradigm4::pico::embedding::RpcConnection conn(env); 47 | paradigm4::pico::core::LogReporter::set_id("SERVER", conn.rpc()->global_rank()); 48 | std::unique_ptr server; 49 | server = conn.create_server(); 50 | server->initialize(); 51 | 52 | if (FLAGS_restore) { 53 | server->restore_storages(false); 54 | } 55 | 56 | server->finalize(); 57 | paradigm4::pico::core::Memory::singleton().finalize(); 58 | paradigm4::pico::metrics_finalize(); 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingDumpOperator.cpp: -------------------------------------------------------------------------------- 1 | #include "EmbeddingDumpOperator.h" 2 | 3 | #include 4 | #include "EmbeddingVariable.h" 5 | #include "EmbeddingShardFile.h" 6 | #include "EmbeddingStorage.h" 7 | #include "Factory.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | void EmbeddingDumpOperator::apply_request(ps::RuntimeInfo& rt, 14 | ps::PSRequest& req, 15 | ps::Storage* storage, 16 | ps::PSResponse& resp_ret) { 17 | ps::DumpArgs dump_args; 18 | req >> dump_args; 19 | int32_t file_id; 20 | req >> file_id; 21 | std::vector<int32_t> shard_ids; 22 | req >> shard_ids; 23 | SCHECK(req.archive().is_exhausted()); 24 | ps::PSResponse resp(req); 25 | //core::FileSystem::mkdir_p(dump_args.uri()); 26 | 27 | core::URIConfig uri(dump_args.uri()); 28 | std::string file = format_string("/model_%d_%d", rt.node_id(), file_id); 29 | FileWriter writer; 30 | if (!writer.open(uri + file)) { 31 | if (uri.storage_type() != core::FileSystemType::HDFS) { 32 | core::FileSystem::mkdir_p(uri); 33 | } 34 | SCHECK(writer.open(uri + file)); 35 | } 36 | bool include_optimizer = true; 37 | uri.config().get_val("include_optimizer", include_optimizer); 38 | 39 | bool persist_model = false; 40 | uri.config().get_val("persist_model", persist_model); 41 | if (persist_model && !include_optimizer) { 42 | SLOG(WARNING) << "persist model is not supported without optimizer."; 43 | include_optimizer = true; 44 | } 45 | size_t persist_pending_window = 2; 46 | uri.config().get_val("persist_pending_window", persist_pending_window); 47 | 48 | auto& st = *(static_cast<EmbeddingStorage*>(storage)); 49 | core::shared_lock_guard l(st); 50 | for (int32_t shard_id: shard_ids) { 51 | SCHECK(rt.local_shards().count(shard_id) != 0) 52 | << "Bad Request: invalid shard_id = " << shard_id; 53 | auto& shard = *(st.get(shard_id)); 54 | // should not lock shared 55 | core::lock_guard sl(shard); 56 | EmbeddingShard& ht = *boost::any_cast<EmbeddingShard>(&shard.data); 57 | for (uint32_t variable_id: ht.variable_ids()) { 58 | EmbeddingVariableBase& variable = ht[variable_id]; 59 | 60 | EmbeddingShardDataMeta shard_meta; 61 | shard_meta.variable_id = variable_id; 62 | 
shard_meta.meta = ht.meta(variable_id); 63 | 64 | core::Configure config; 65 | if (persist_model) { 66 | SCHECK(variable.persist_config(persist_pending_window, config)); 67 | if (!include_optimizer) { 68 | config.node().remove("optimizer"); 69 | } 70 | } else { 71 | variable.dump_config(config); 72 | } 73 | shard_meta.config = config.dump(); 74 | shard_meta.shard_id = shard_id; 75 | shard_meta.shard_num = rt.global_shard_num(); 76 | shard_meta.state_line_size = include_optimizer ? variable.state_line_size() : 0; 77 | shard_meta.num_items = persist_model ? 0 : variable.num_indices(); 78 | writer.write(shard_meta); 79 | 80 | if (shard_meta.num_items) { 81 | int reader_id = variable.create_reader(); 82 | size_t n = 0; 83 | core::vector indices(variable.server_block_num_items()); 84 | while ( (n = variable.read_indices(reader_id, indices.data(), indices.size())) ) { 85 | writer.write(n); 86 | indices.resize(n); 87 | core::vector weights(indices.size() * shard_meta.meta.line_size()); 88 | core::vector states(indices.size() * shard_meta.state_line_size); 89 | variable.get_weights(indices.data(), n, weights.data(), states.data()); 90 | writer.write(indices.data(), indices.size()); 91 | writer.write(weights.data(), weights.size()); 92 | writer.write(states.data(), states.size()); 93 | } 94 | variable.delete_reader(reader_id); 95 | } 96 | } 97 | } 98 | resp << ps::Status(); 99 | resp_ret = std::move(resp); 100 | } 101 | 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingDumpOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_DUMP_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_DUMP_OPERATOR_H 3 | 4 | #include 5 | 6 | namespace paradigm4 { 7 | namespace pico { 8 | namespace embedding { 9 | 10 | class EmbeddingDumpOperator : public ps::ShardStorageDumpOperator { 11 | public: 12 | 13 | EmbeddingDumpOperator(const core::Configure& conf) : ps::ShardStorageDumpOperator(conf) {} 14 | 15 | virtual ~EmbeddingDumpOperator() {} 16 | 17 | EmbeddingDumpOperator(EmbeddingDumpOperator&&) = default; 18 | EmbeddingDumpOperator& operator=(EmbeddingDumpOperator&&) = default; 19 | 20 | void apply_request(ps::RuntimeInfo& rt, 21 | ps::PSRequest& req, 22 | ps::Storage* storage, 23 | ps::PSResponse& resp_ret)override; 24 | 25 | std::unique_ptr init_result_impl() { 26 | return nullptr; 27 | } 28 | 29 | void merge_result_impl(const ps::ForEachResult&, ps::ForEachResult&, 30 | const ps::CarriedItem&)override {} 31 | }; 32 | 33 | } 34 | } 35 | } 36 | 37 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingInitOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_INIT_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_INIT_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "Meta.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class EmbeddingInitItems: public ps::PushItems { 13 | public: 14 | EmbeddingVariableMeta meta; 15 | uint32_t variable_id = -1; 16 | uint64_t n = 0; // indices for push 17 | // vocabulary_size for resize or create 18 | bool clear_weights = false; 19 | const uint64_t* indices = nullptr; // for push 20 | const char* weights = nullptr; 21 | const char* states = nullptr; 22 | uint64_t state_line_size = 0; // != 0 means 
pushing optimizer state 23 | std::string variable_config; // for create 24 | }; 25 | 26 | // for init, load, update context 27 | class EmbeddingInitOperator : public ps::PushOperator { 28 | public: 29 | EmbeddingInitOperator(const Configure& config) : ps::PushOperator(config) { 30 | initialize_compress_info(config, "EmbeddingInitOperator", _compress_info); 31 | } 32 | 33 | ~EmbeddingInitOperator()override {} 34 | 35 | EmbeddingInitOperator(EmbeddingInitOperator&&) = default; 36 | EmbeddingInitOperator& operator=(EmbeddingInitOperator&&) = default; 37 | 38 | void generate_request_data(core::vector>& push_items, 39 | ps::RuntimeInfo& rt, 40 | std::unique_ptr& push_request_data) override; 41 | 42 | void generate_push_request( 43 | std::vector& push_request_data, 44 | ps::RuntimeInfo& rt, 45 | std::vector& reqs) override; 46 | 47 | void generate_store_request(ps::RuntimeInfo& rt, 48 | std::vector& reqs) override; 49 | 50 | void apply_async_push_request(ps::RuntimeInfo& rt, 51 | ps::PSRequest& req, 52 | ps::Storage* storage, 53 | ps::Storage*, 54 | ps::PSResponse& resp) override; 55 | 56 | void apply_sync_push_request(ps::RuntimeInfo&, 57 | ps::PSRequest&, 58 | ps::Storage*, 59 | ps::PSResponse&) override { 60 | return; 61 | } 62 | 63 | void apply_store_request(ps::RuntimeInfo&, 64 | ps::PSRequest&, 65 | ps::Storage*, 66 | ps::Storage*, 67 | ps::Storage*, 68 | std::function) override { 69 | return; 70 | } 71 | 72 | void apply_response(ps::PSResponse& resp) override; 73 | 74 | std::unique_ptr create_delta_storage(ps::RuntimeInfo&) override { 75 | return nullptr; 76 | } 77 | 78 | std::unique_ptr create_incr_storage(ps::RuntimeInfo&) override { 79 | return nullptr; 80 | } 81 | 82 | protected: 83 | ps::CompressInfo _compress_info; 84 | }; 85 | 86 | 87 | } 88 | } 89 | } 90 | 91 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingLoadOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_LOAD_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_LOAD_OPERATOR_H 3 | 4 | #include 5 | #include "EmbeddingInitOperator.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | class EmbeddingLoadOperator: public ps::LoadOperator { 12 | typedef uint64_t key_type; 13 | public: 14 | EmbeddingLoadOperator(const Configure& config): ps::LoadOperator(config), _push_op(config) {} 15 | 16 | virtual ~EmbeddingLoadOperator() {} 17 | 18 | EmbeddingLoadOperator(EmbeddingLoadOperator&&) = default; 19 | EmbeddingLoadOperator& operator=(EmbeddingLoadOperator&&) = default; 20 | 21 | 22 | void apply_load_response(ps::PSResponse& resp) override; 23 | 24 | void restore(const URIConfig&, ps::RuntimeInfo&, ps::Storage*) override; 25 | 26 | void create_stream(const URIConfig& uri, std::shared_ptr& stream) override; 27 | 28 | size_t generate_push_items(std::shared_ptr& stream_in, 29 | core::vector>& push_items) override; 30 | 31 | ps::PushOperator* push_operator() override { 32 | return &_push_op; 33 | } 34 | 35 | 36 | protected: 37 | EmbeddingInitOperator _push_op; 38 | }; 39 | 40 | 41 | } 42 | } 43 | } 44 | 45 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingPullOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_PULL_OPERATOR_H 2 | #define 
PARADIGM4_HYPEREMBEDDING_EMBEDDING_PULL_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "EmbeddingStorage.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | struct EmbeddingPullItems { 14 | uint32_t variable_id = 0; 15 | EmbeddingVariableMeta meta; 16 | 17 | const uint64_t* indices = nullptr; 18 | uint64_t n = 0; 19 | 20 | int64_t batch_id = 0; 21 | 22 | }; 23 | 24 | struct EmbeddingPullResults { 25 | const uint64_t* indices = nullptr; 26 | uint64_t n = 0; 27 | 28 | char* weights = nullptr; 29 | bool should_persist = false; 30 | }; 31 | 32 | struct EmbeddingPullRequestData { 33 | struct ShardData { 34 | size_t cursor = 0; 35 | core::vector num_indices; // prefix count 36 | ps::RpcVector indices; 37 | BinaryArchive weights; 38 | }; 39 | 40 | EmbeddingPullRequestData() {} 41 | 42 | void init(size_t shard_num, size_t block_num); 43 | 44 | size_t waiting_reqs = 0; 45 | core::vector> block_offsets; 46 | core::vector block_items; 47 | std::unordered_map> node_shards; 48 | core::vector shards; 49 | }; 50 | 51 | class EmbeddingPullOperator: public ps::UDFOperator, EmbeddingPullRequestData> { 52 | public: 53 | EmbeddingPullOperator(const Configure& config): 54 | ps::UDFOperator, EmbeddingPullRequestData>(config) { 55 | initialize_compress_info(config, "EmbeddingPullOperator", _compress_info); 56 | _algo = ps::initialize_shard_pick_algo(config); 57 | if (config.has("read_only")) { 58 | _read_only = config["read_only"].as(); 59 | } 60 | } 61 | 62 | ~EmbeddingPullOperator() override {} 63 | 64 | EmbeddingPullOperator(EmbeddingPullOperator&&) = default; 65 | EmbeddingPullOperator& operator=(EmbeddingPullOperator&&) = default; 66 | 67 | bool read_only() override { return _read_only; } 68 | 69 | ps::Status generate_request(core::vector& block_items, 70 | ps::RuntimeInfo& rt, EmbeddingPullRequestData& data, std::vector& reqs)override; 71 | 72 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 73 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 74 | 75 | /// TODO: check context version 76 | void apply_request_pull(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 77 | const ps::TableDescriptor& table, core::Dealer* dealer); 78 | 79 | ps::Status apply_response(ps::PSResponse& resp, EmbeddingPullRequestData& data, void* result) override; 80 | 81 | protected: 82 | bool _read_only = false; 83 | ps::CompressInfo _compress_info; 84 | ps::PickAlgo _algo; 85 | }; 86 | 87 | 88 | } 89 | } 90 | } 91 | 92 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingPushOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_PUSH_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_PUSH_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "EmbeddingStorage.h" 7 | #include "EmbeddingPullOperator.h" 8 | #include "RpcView.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | // key <--> index 15 | // value <--> gradients 16 | class EmbeddingPushItems { 17 | public: 18 | uint32_t variable_id = -1; 19 | EmbeddingVariableMeta meta; 20 | 21 | const uint64_t* indices = nullptr; 22 | uint64_t n = 0; 23 | const char* gradients = nullptr; 24 | }; 25 | 26 | struct EmbeddingPushRequestData { 27 | struct ShardData { 28 | size_t indices_base = 0; 29 | size_t gradients_base = 0; 30 | core::vector num_indices; // prefix count 31 | ps::RpcVector 
indices; 32 | ps::RpcVector gradients; 33 | ps::RpcVector counts; 34 | }; 35 | 36 | EmbeddingPushRequestData(): offsets(-1) {} 37 | 38 | void init(size_t shard_num); 39 | 40 | template 41 | void operator()(TypeCase, EmbeddingPushItems& items); 42 | 43 | EasyHashMap offsets; 44 | core::vector shards; 45 | }; 46 | 47 | 48 | class EmbeddingPushOperator : public ps::UDFOperator, EmbeddingPushRequestData> { 49 | public: 50 | EmbeddingPushOperator(const Configure& config): 51 | ps::UDFOperator, EmbeddingPushRequestData>(config) { 52 | initialize_compress_info(config, "EmbeddingPushOperator", _compress_info); 53 | } 54 | 55 | virtual ~EmbeddingPushOperator() {} 56 | 57 | EmbeddingPushOperator(EmbeddingPushOperator&&) = default; 58 | EmbeddingPushOperator& operator=(EmbeddingPushOperator&&) = default; 59 | 60 | bool read_only() override { return false; } 61 | 62 | ps::Status generate_request(core::vector& block_items, 63 | ps::RuntimeInfo& rt, EmbeddingPushRequestData& data, std::vector& reqs) override; 64 | 65 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 66 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 67 | 68 | 69 | ps::Status apply_response(ps::PSResponse& resp, EmbeddingPushRequestData&, void* result) override; 70 | 71 | protected: 72 | 73 | ps::CompressInfo _compress_info; 74 | }; 75 | 76 | 77 | } 78 | } 79 | } 80 | 81 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingRestoreOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_RESTORE_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_RESTORE_OPERATOR_H 3 | 4 | #include 5 | #include "EmbeddingStorage.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | // need restore initializer for default value 12 | class EmbeddingRestoreOperator: public ps::RestoreOperator { 13 | typedef uint64_t key_type; 14 | public: 15 | EmbeddingRestoreOperator(const core::Configure& config) : ps::RestoreOperator(config) { 16 | initialize_compress_info(config, "EmbeddingRestoreOperator", _compress_info); 17 | } 18 | 19 | ~EmbeddingRestoreOperator() override {} 20 | EmbeddingRestoreOperator(EmbeddingRestoreOperator&&) = default; 21 | EmbeddingRestoreOperator& operator=(EmbeddingRestoreOperator&&) = default; 22 | 23 | void generate_coordinated_restore_request( 24 | ps::CoordinatedRestoreRequestItem* req_item, std::vector& req)override; 25 | 26 | virtual void apply_coordinated_restore_request( 27 | ps::PSRequest& req, ps::Storage* storage, ps::PSResponse& resp)override; 28 | 29 | virtual void apply_coordinated_restore_response(ps::PSResponse& resp, ps::Storage* storage, ps::CoordinatedRestoreResponseItem* resp_item); 30 | 31 | virtual void restore(const core::URIConfig& uri, ps::RuntimeInfo& rt, ps::Storage* storage); 32 | 33 | protected: 34 | ps::CompressInfo _compress_info; 35 | }; 36 | 37 | typedef ps::ShardStorageOperator EmbeddingStorageOperator; 38 | 39 | 40 | } 41 | } 42 | } 43 | 44 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingShardFile.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_SHRAD_FILE_H 3 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_SHRAD_FILE_H 4 | 5 | #include 6 | #include 7 | #include "Meta.h" 8 | 9 | namespace paradigm4 { 10 | namespace pico 
{ 11 | namespace embedding { 12 | 13 | struct EmbeddingShardDataMeta { 14 | uint32_t variable_id = 0; 15 | EmbeddingVariableMeta meta; 16 | std::string config; 17 | int32_t shard_id = 0; 18 | int32_t shard_num = 0; 19 | uint64_t state_line_size = 0; 20 | uint64_t num_items = 0; 21 | PICO_SERIALIZATION(variable_id, meta, config, shard_id, shard_num, state_line_size, num_items); 22 | 23 | uint64_t get_index(uint64_t index)const { 24 | return index * shard_num + shard_id; 25 | } 26 | }; 27 | 28 | class FileReader { 29 | public: 30 | bool open(const core::URIConfig& uri) { 31 | std::string hadoop_bin; 32 | uri.config().get_val(core::URI_HADOOP_BIN, hadoop_bin); 33 | _file = core::ShellUtility::open_read(uri.name(), "", hadoop_bin); 34 | _archive.reset(_file); 35 | return _file; 36 | } 37 | 38 | template 39 | bool read(T& value) { 40 | return core::pico_deserialize(_archive, value); 41 | } 42 | 43 | template 44 | typename std::enable_if::value, bool>::type 45 | read(T* buffer, size_t n) { 46 | return _archive.read_raw_uncheck(buffer, n * sizeof(T)); 47 | } 48 | 49 | private: 50 | core::shared_ptr _file; 51 | core::BinaryFileArchive _archive; 52 | }; 53 | 54 | class FileWriter { 55 | public: 56 | bool open(const core::URIConfig& uri) { 57 | std::string null_uri = "mem://null/"; 58 | if (uri.uri().substr(0, null_uri.size()) == null_uri) { 59 | _null = true; 60 | return true; 61 | } 62 | std::string hadoop_bin; 63 | uri.config().get_val(core::URI_HADOOP_BIN, hadoop_bin); 64 | _file = core::ShellUtility::open_write(uri.name(), "", hadoop_bin); 65 | _archive.reset(_file); 66 | return _file; 67 | } 68 | 69 | template 70 | void write(const T& value) { 71 | if (_null) return; 72 | SCHECK(core::pico_serialize(_archive, value)); 73 | } 74 | 75 | template 76 | typename std::enable_if::value>::type 77 | write(const T* buffer, size_t n) { 78 | if (_null) return; 79 | SCHECK(_archive.write_raw_uncheck(buffer, n * sizeof(T))); 80 | } 81 | 82 | private: 83 | bool _null = false; 84 | core::shared_ptr _file; 85 | core::BinaryFileArchive _archive; 86 | }; 87 | 88 | 89 | } 90 | } 91 | } 92 | 93 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStorage.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_PICO_PS_EMBEDDING_EMBEDDING_STORAGE_H 2 | #define PARADIGM4_PICO_PS_EMBEDDING_EMBEDDING_STORAGE_H 3 | 4 | #include "Meta.h" 5 | #include "EmbeddingVariable.h" 6 | #include 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | /*! 
\brief namespace of parameter server */ 11 | namespace embedding { 12 | 13 | class EmbeddingShard { 14 | public: 15 | bool insert_variable(uint32_t variable_id, 16 | std::unique_ptr variable, 17 | const EmbeddingVariableMeta& meta) { 18 | if (variable_id >= _variables.size()) { 19 | _variables.resize(variable_id + 1); 20 | _metas.resize(variable_id + 1); 21 | } 22 | if (_variables[variable_id]) { 23 | return false; 24 | } 25 | if (variable) { 26 | _metas[variable_id] = meta; 27 | _variables[variable_id] = std::move(variable); 28 | _variable_ids.push_back(variable_id); 29 | return true; 30 | } 31 | return false; 32 | } 33 | 34 | bool contains(uint32_t variable_id)const { 35 | return variable_id < _variables.size() && _variables[variable_id]; 36 | } 37 | 38 | EmbeddingVariableBase& operator[](uint32_t variable_id) { 39 | SCHECK(contains(variable_id)) << variable_id; 40 | return *_variables[variable_id]; 41 | } 42 | 43 | const std::vector& variable_ids()const { 44 | return _variable_ids; 45 | } 46 | 47 | const EmbeddingVariableMeta& meta(uint32_t variable_id) { 48 | SCHECK(contains(variable_id)) << variable_id; 49 | return _metas[variable_id]; 50 | } 51 | 52 | EmbeddingVariableBase& get(uint32_t variable_id, const EmbeddingVariableMeta& meta) { 53 | if (!contains(variable_id)) { 54 | auto pvar = EmbeddingVariableBase::create(meta.datatype, meta.embedding_dim); 55 | SCHECK(insert_variable(variable_id, std::move(pvar), meta)); 56 | } 57 | SCHECK(this->meta(variable_id) == meta) 58 | << this->meta(variable_id).to_json_node().dump() << " " << meta.to_json_node().dump(); 59 | return (*this)[variable_id]; 60 | } 61 | private: 62 | std::vector _variable_ids; 63 | std::vector _metas; 64 | std::vector> _variables; 65 | }; 66 | 67 | struct PendingRequest { 68 | ps::PSMessageMeta psmeta; 69 | ps::PSRequest request; 70 | }; 71 | 72 | class EmbeddingStorage : public ps::ShardStorage { 73 | public: 74 | using ps::ShardStorage::_shards; 75 | typedef uint64_t key_type; 76 | typedef EmbeddingShard shard_type; 77 | EmbeddingStorage(const std::unordered_set& shard_id, const Configure&) { 78 | for (const auto& id : shard_id) { 79 | create_shard(id); 80 | } 81 | } 82 | 83 | void clear() override { 84 | for (auto& shard : _shards) { 85 | shard.second->data = shard_type(); 86 | } 87 | } 88 | 89 | virtual bool create_shard(int32_t shard_id) override { 90 | core::lock_guard lk(this->_mtx); 91 | if (_shards.count(shard_id) != 0) { 92 | return false; 93 | } 94 | _shards.emplace(shard_id, std::make_unique()); 95 | _shards[shard_id]->data = EmbeddingShard(); 96 | _shards_meta.emplace(shard_id, std::make_unique()); 97 | _shards_meta[shard_id]->on_dcpmm = false; 98 | return true; 99 | } 100 | 101 | //no use 102 | virtual size_t shard_size(int32_t) override { 103 | return 0; 104 | } 105 | 106 | //no use 107 | virtual size_t shard_memory_usage(int32_t) override { 108 | return 0; 109 | } 110 | 111 | virtual ps::ShardIterator* get_shard_iterator(int32_t, int32_t) override { 112 | SLOG(FATAL) << "No implementation"; 113 | return nullptr; 114 | } 115 | 116 | core::RWSpinLock& shared_mutex() { 117 | return this->_mtx; 118 | } 119 | 120 | core::RWSpinLock pending_mutex; 121 | int64_t batch_id = 0; 122 | std::atomic async_tasks = {0}; 123 | core::deque> pending; 124 | core::vector holders; 125 | }; 126 | 127 | 128 | } 129 | } 130 | } 131 | 132 | 133 | #endif -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStoreOperator.cpp: 
-------------------------------------------------------------------------------- 1 | #include "EmbeddingStoreOperator.h" 2 | 3 | #include 4 | #include 5 | #include "EmbeddingStorage.h" 6 | #include "EmbeddingPullOperator.h" 7 | #include "RpcView.h" 8 | #include "PersistManager.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | ps::Status EmbeddingStoreOperator::generate_request(int&, 15 | ps::RuntimeInfo& rt, int&, std::vector<ps::PSRequest>& reqs) { 16 | VTIMER(1, embedding_push, generate_push_request, ms); 17 | for (auto& node: rt.nodes()) { 18 | reqs.emplace_back(node.first); 19 | } 20 | return ps::Status(); 21 | } 22 | 23 | void EmbeddingStoreOperator::apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 24 | const ps::TableDescriptor& table, core::Dealer* dealer) { 25 | VTIMER(1, embedding_update, apply_request, ms); 26 | ps::PSResponse resp(req); 27 | resp << psmeta; 28 | auto& rt = *table.runtime_info; 29 | auto& st = *(static_cast<EmbeddingStorage*>(table.storage.get())); 30 | core::shared_lock_guard l(st); 31 | VariableAsyncTask::wait(st.async_tasks); 32 | 33 | #ifdef USE_DCPMM 34 | VariableAsyncTaskThreadPool::singleton().initialize_batch_task(); 35 | #endif 36 | 37 | for (int32_t shard_id: rt.local_shards()) { 38 | auto& shard = *(st.get(shard_id)); 39 | shard.lock(); // TODO: use guard 40 | } 41 | 42 | if (_early_return) { 43 | dealer->send_response(std::move(resp.rpc_response())); 44 | } 45 | 46 | for (int32_t shard_id: rt.local_shards()) { 47 | auto& shard = *(st.get(shard_id)); 48 | EmbeddingShard& ht = *boost::any_cast<EmbeddingShard>(&shard.data); 49 | for (uint32_t variable_id: ht.variable_ids()) { 50 | ht[variable_id].update_weights(); 51 | } 52 | shard.unlock(); 53 | } 54 | 55 | if (!_early_return) { 56 | dealer->send_response(std::move(resp.rpc_response())); 57 | } 58 | core::vector<PendingRequest> reqs; 59 | { 60 | core::lock_guard pl(st.pending_mutex); 61 | // Store and push must not run at the same time; otherwise holders.clear() would cause errors. 62 | st.holders.clear(); 63 | 64 | if (!st.pending.empty()) { 65 | reqs = std::move(st.pending.front()); 66 | st.pending.pop_front(); 67 | } 68 | st.batch_id += 1; 69 | } 70 | // Start processing the pull requests of batch_id + 1.
71 | for (PendingRequest& pend: reqs) { 72 | ps::Status status; 73 | if (status.ok()) { 74 | _pull.apply_request_pull(pend.psmeta, pend.request, table, dealer); 75 | } else { 76 | ps::PSResponse resp(pend.request); 77 | resp.rpc_response().set_error_code(RpcErrorCodeType::ELOGICERROR); 78 | resp << status << pend.psmeta; 79 | dealer->send_response(std::move(resp.rpc_response())); 80 | } 81 | } 82 | } 83 | 84 | ps::Status EmbeddingStoreOperator::apply_response(ps::PSResponse& resp, int&, void* result) { 85 | SCHECK(result == nullptr) << "return no result!"; 86 | SCHECK(resp.archive().is_exhausted()); 87 | return ps::Status(); 88 | } 89 | 90 | 91 | 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /openembedding/server/EmbeddingStoreOperator.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_STORE_OPERATOR_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_STORE_OPERATOR_H 3 | 4 | #include 5 | #include 6 | #include "EmbeddingStorage.h" 7 | #include "EmbeddingPullOperator.h" 8 | #include "RpcView.h" 9 | 10 | namespace paradigm4 { 11 | namespace pico { 12 | namespace embedding { 13 | 14 | 15 | class EmbeddingStoreOperator : public ps::UDFOperator<int, int> { 16 | public: 17 | EmbeddingStoreOperator(const Configure& config): 18 | ps::UDFOperator<int, int>(config), _pull(config) { 19 | if (config.has("update_early_return")) { 20 | _early_return = config["update_early_return"].as<bool>(); 21 | } 22 | } 23 | 24 | virtual ~EmbeddingStoreOperator() {} 25 | 26 | EmbeddingStoreOperator(EmbeddingStoreOperator&&) = default; 27 | EmbeddingStoreOperator& operator=(EmbeddingStoreOperator&&) = default; 28 | 29 | bool read_only() override { return false; } 30 | 31 | ps::Status generate_request(int&, 32 | ps::RuntimeInfo& rt, int&, std::vector<ps::PSRequest>& reqs) override; 33 | 34 | void apply_request(const ps::PSMessageMeta& psmeta, ps::PSRequest& req, 35 | const ps::TableDescriptor& table, core::Dealer* dealer) override; 36 | 37 | ps::Status apply_response(ps::PSResponse& resp, int&, void* result) override; 38 | 39 | protected: 40 | EmbeddingPullOperator _pull; 41 | bool _early_return = true; 42 | }; 43 | 44 | 45 | } 46 | } 47 | } 48 | 49 | #endif -------------------------------------------------------------------------------- /openembedding/server/RpcView.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_RPC_VIEW_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_RPC_VIEW_H 3 | 4 | #include 5 | #include 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template <class T> 12 | struct RpcView { 13 | static_assert(std::is_trivially_copyable<T>::value, ""); 14 | 15 | RpcView() {} 16 | 17 | // not owner 18 | RpcView(ps::RpcVector<T>& vector) { 19 | data = vector.data(); 20 | size = vector.size(); 21 | } 22 | 23 | RpcView(RpcView&& other) { 24 | *this = std::move(other); 25 | } 26 | 27 | RpcView& operator=(RpcView&& other) { 28 | data = other.data; 29 | size = other.size; 30 | holder = std::move(other.holder); 31 | other.data = nullptr; 32 | other.size = 0; 33 | return *this; 34 | } 35 | 36 | // for src_rank == dest_rank 37 | // be owner after receive() 38 | void receive() { 39 | if (!holder.deleter.owner) { 40 | holder = data_block_t(size * sizeof(T)); 41 | memcpy(holder.data, data, holder.length); 42 | data = reinterpret_cast<T*>(holder.data); 43 | } 44 | } 45 | 46 | void receive(BinaryArchive&& ar) {
SCHECK(ar.length() % sizeof(T) == 0); 48 | holder = data_block_t(ar.length()); 49 | memcpy(holder.data, ar.buffer(), ar.length()); 50 | data = reinterpret_cast<T*>(holder.data); 51 | size = ar.length() / sizeof(T); 52 | ar = BinaryArchive(); 53 | } 54 | 55 | T* data = nullptr; 56 | size_t size = 0; 57 | data_block_t holder; 58 | }; 59 | 60 | 61 | template <class T> 62 | bool pico_serialize(core::ArchiveWriter&, core::SharedArchiveWriter& sar, RpcView<T>& view) { 63 | sar.put_shared_uncheck(view.data, view.size); 64 | return true; 65 | } 66 | 67 | template <class T> 68 | bool pico_deserialize(core::ArchiveReader&, core::SharedArchiveReader& sar, RpcView<T>& view) { 69 | // be owner after receive() 70 | if (sar.is_exhausted()) { 71 | return false; 72 | } 73 | sar.get_shared_uncheck(view.data, view.size, view.holder); 74 | return true; 75 | } 76 | 77 | 78 | template <class T> 79 | void serialize(core::LazyArchive& lazy, ps::CompressInfo& compress_info, RpcView<T>&& view) { 80 | if (compress_info._enabled) { 81 | BinaryArchive msg_ar, compressed_ar(true); 82 | msg_ar.set_read_buffer(reinterpret_cast<char*>(view.data), view.size * sizeof(T)); 83 | compress_info._compresser.raw_compress(msg_ar, compressed_ar); 84 | lazy << std::move(compressed_ar); 85 | } else { 86 | lazy << std::move(view); 87 | } 88 | view = RpcView<T>(); 89 | } 90 | 91 | template <class T> 92 | void deserialize(core::LazyArchive& lazy, ps::CompressInfo& compress_info, RpcView<T>& view) { 93 | if (compress_info._enabled) { 94 | BinaryArchive msg_ar, compressed_ar; 95 | lazy >> compressed_ar; 96 | compress_info._compresser.raw_uncompress(compressed_ar, msg_ar); 97 | view.receive(std::move(msg_ar)); 98 | } else { 99 | lazy >> view; 100 | view.receive(); 101 | } 102 | } 103 | 104 | 105 | 106 | } 107 | 108 | 109 | } 110 | } 111 | 112 | #endif -------------------------------------------------------------------------------- /openembedding/tensorflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # exb_ops should be compiled at "pip install" time; this is only a simple version for the CMake build. 2 | execute_process(COMMAND ${PYTHON} -c "import tensorflow as tf; print(\" \".join(tf.sysconfig.get_compile_flags()), end=\"\")" 3 | OUTPUT_VARIABLE TF_COMPILE_FLAGS) 4 | execute_process(COMMAND ${PYTHON} -c "import tensorflow as tf; print(\" \".join(tf.sysconfig.get_link_flags()), end=\"\")" 5 | OUTPUT_VARIABLE TF_LINK_FLAGS) 6 | 7 | message(TF_COMPILE_FLAGS: ${TF_COMPILE_FLAGS}) 8 | message(TF_LINK_FLAGS: ${TF_LINK_FLAGS}) 9 | 10 | add_library(exb_ops SHARED exb_ops.cpp) 11 | target_link_libraries(exb_ops cexb_pack) 12 | target_compile_options(exb_ops PRIVATE -Wno-unused-parameter -Wno-unused-but-set-parameter -Wno-ignored-qualifiers) 13 | set_target_properties(exb_ops PROPERTIES 14 | COMPILE_FLAGS ${TF_COMPILE_FLAGS} 15 | LINK_FLAGS ${TF_LINK_FLAGS}) 16 | 17 | -------------------------------------------------------------------------------- /openembedding/tensorflow/Prefetch.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_COMMON_PREFETCH_H 2 | #define PARADIGM4_HYPEREMBEDDING_COMMON_PREFETCH_H 3 | 4 | #include 5 | 6 | #include "ThreadPool.h" 7 | 8 | namespace paradigm4 { 9 | namespace exb { 10 | 11 | class BatchIDTable { 12 | public: 13 | int64_t pull_batch_id(int64_t key) { 14 | exb_lock_guard guard(_mutex); 15 | return _table[key]; 16 | } 17 | 18 | void next_work(int64_t key) { 19 | exb_lock_guard guard(_mutex); 20 | ++_table[key]; 21 | } 22 | 23 | private: 24 | exb_mutex _mutex; 25 | std::unordered_map<int64_t, int64_t>
/openembedding/tensorflow/ThreadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_COMMON_THREAD_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_COMMON_THREAD_POOL_H 3 | 4 | #include <atomic> 5 | #include <functional> 6 | #include <thread> 7 | #include <vector> 8 | #include "../entry/c_api.h" 9 | 10 | namespace paradigm4 { 11 | namespace exb { 12 | 13 | class exb_lock_guard { 14 | public: 15 | exb_lock_guard(exb_mutex& mutex): _mutex(&mutex) { 16 | exb_mutex_lock(_mutex); 17 | } 18 | ~exb_lock_guard() { 19 | exb_mutex_unlock(_mutex); 20 | } 21 | exb_lock_guard(exb_lock_guard&&) = default; 22 | exb_lock_guard& operator=(exb_lock_guard&&) = default; 23 | private: 24 | exb_mutex* _mutex; 25 | }; 26 | 27 | class ThreadPool { 28 | public: 29 | static ThreadPool& singleton() { 30 | static ThreadPool pool; 31 | return pool; 32 | } 33 | 34 | template<class F> 35 | void submit(F job) { 36 | std::function<void()>* p = new std::function<void()>(std::move(job)); 37 | exb_channel_write(_channels[_jid.fetch_add(1, std::memory_order_acq_rel) % _channels.size()], p); 38 | } 39 | 40 | private: 41 | ThreadPool(size_t thread_num = std::thread::hardware_concurrency()): _threads(thread_num), _channels(thread_num) { 42 | for (size_t i = 0; i < _threads.size(); ++i) { 43 | _channels[i] = exb_channel_create(); 44 | _threads[i] = std::thread(&ThreadPool::running, this, i); 45 | } 46 | } 47 | 48 | ~ThreadPool() { 49 | for (size_t i = 0; i < _threads.size(); ++i) { 50 | exb_channel_close(_channels[i]); 51 | _threads[i].join(); 52 | exb_channel_delete(_channels[i]); 53 | } 54 | } 55 | 56 | void running(size_t i) { 57 | void* job; 58 | while (exb_channel_read(_channels[i], &job)) { 59 | std::function<void()>* p = static_cast<std::function<void()>*>(job); 60 | (*p)(); 61 | delete p; 62 | } 63 | } 64 | 65 | std::atomic<size_t> _jid = {0}; 66 | std::vector<std::thread> _threads; 67 | std::vector<exb_channel*> _channels; 68 | }; 69 | 70 | 71 | 72 | } 73 | } 74 | 75 | #endif 76 | --------------------------------------------------------------------------------
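ThreadPool.h avoids a single contended job queue: each worker owns one channel, and `submit` picks a channel round-robin with an atomic counter, so producers only contend on a single fetch_add. A self-contained analog built from standard primitives (the toy mutex-and-condvar Queue below stands in for the repo's `exb_channel_*` C API; it illustrates the dispatch scheme, not the real channel implementation):

```cpp
#include <atomic>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class MiniPool {
public:
    explicit MiniPool(size_t n): _queues(n), _threads(n) {
        for (size_t i = 0; i < n; ++i) {
            _threads[i] = std::thread([this, i] { run(i); });
        }
    }
    ~MiniPool() {
        for (auto& q : _queues) q.close();
        for (auto& t : _threads) t.join();
    }
    void submit(std::function<void()> job) {
        // Round-robin dispatch, mirroring _jid.fetch_add(...) % _channels.size().
        _queues[_jid.fetch_add(1, std::memory_order_relaxed) % _queues.size()].push(std::move(job));
    }
private:
    struct Queue {
        std::mutex m;
        std::condition_variable cv;
        std::deque<std::function<void()>> q;
        bool closed = false;
        void push(std::function<void()> f) {
            { std::lock_guard<std::mutex> g(m); q.push_back(std::move(f)); }
            cv.notify_one();
        }
        bool pop(std::function<void()>& f) {
            std::unique_lock<std::mutex> lk(m);
            cv.wait(lk, [&] { return closed || !q.empty(); });
            if (q.empty()) return false;   // closed and drained
            f = std::move(q.front());
            q.pop_front();
            return true;
        }
        void close() {
            { std::lock_guard<std::mutex> g(m); closed = true; }
            cv.notify_all();
        }
    };
    void run(size_t i) {
        std::function<void()> job;
        while (_queues[i].pop(job)) job();   // each worker drains only its own queue
    }
    std::atomic<size_t> _jid{0};
    std::vector<Queue> _queues;
    std::vector<std::thread> _threads;
};

int main() {
    MiniPool pool(4);
    for (int i = 0; i < 8; ++i) {
        pool.submit([i] { std::printf("job %d\n", i); });
    }
}
```

Sharding by channel also gives a cheap affinity property: work keyed to the same slot always lands on the same worker thread, which `VariableAsyncTaskThreadPool` later in this section exploits via `thread_id() % _threads.size()`.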
/openembedding/tensorflow/__init__.py: -------------------------------------------------------------------------------- 1 | from openembedding.tensorflow.exb import * 2 | from openembedding import __version__ -------------------------------------------------------------------------------- /openembedding/variable/DataType.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_DATATYPE_H 2 | #define PARADIGM4_HYPEREMBEDDING_DATATYPE_H 3 | 4 | #include <cstddef> 5 | #include <cstdint> 6 | #include <cstring> 7 | #include <string> 8 | #include <type_traits> 9 | #include <utility> 10 | 11 | namespace paradigm4 { 12 | namespace pico { 13 | namespace embedding { 14 | 15 | typedef float float32_t; 16 | typedef double float64_t; 17 | 18 | template<class T> struct TypeCase {}; 19 | 20 | class DataType { 21 | public: 22 | enum DType { 23 | UNKNOWN = 0x0, 24 | INT8 = 0x1, 25 | INT16 = 0x2, 26 | INT32 = 0x4, 27 | INT64 = 0x8, 28 | 29 | FLOAT32 = 0x104, 30 | FLOAT64 = 0x108, 31 | }; 32 | 33 | explicit DataType(int dtype = FLOAT32): dtype(dtype) {} 34 | 35 | DataType(const std::string& str) { 36 | if (str == "int8") { 37 | dtype = INT8; 38 | } else if (str == "int16") { 39 | dtype = INT16; 40 | } else if (str == "int32") { 41 | dtype = INT32; 42 | } else if (str == "int64") { 43 | dtype = INT64; 44 | } else if (str == "float32") { 45 | dtype = FLOAT32; 46 | } else if (str == "float64") { 47 | dtype = FLOAT64; 48 | } else { 49 | dtype = UNKNOWN; 50 | } 51 | } 52 | 53 | class ToString { 54 | public: 55 | void operator()(TypeCase<int8_t>, std::string& str) { str = "int8"; } 56 | void operator()(TypeCase<int16_t>, std::string& str) { str = "int16"; } 57 | void operator()(TypeCase<int32_t>, std::string& str) { str = "int32"; } 58 | void operator()(TypeCase<int64_t>, std::string& str) { str = "int64"; } 59 | void operator()(TypeCase<float32_t>, std::string& str) { str = "float32"; } 60 | void operator()(TypeCase<float64_t>, std::string& str) { str = "float64"; } 61 | }; 62 | 63 | operator std::string()const { 64 | std::string str = "unknown"; 65 | invoke(ToString(), str); 66 | return str; 67 | } 68 | 69 | std::string to_string()const { 70 | return *this; 71 | } 72 | 73 | template<class Function, class... Params> 74 | void invoke(Function&& f, Params&&... params)const { 75 | switch (dtype) { 76 | case INT8: 77 | std::forward<Function>(f)(TypeCase<int8_t>(), 78 | std::forward<Params>(params)...); 79 | break; 80 | case INT16: 81 | std::forward<Function>(f)(TypeCase<int16_t>(), 82 | std::forward<Params>(params)...); 83 | break; 84 | case INT32: 85 | std::forward<Function>(f)(TypeCase<int32_t>(), 86 | std::forward<Params>(params)...); 87 | break; 88 | case INT64: 89 | std::forward<Function>(f)(TypeCase<int64_t>(), 90 | std::forward<Params>(params)...); 91 | break; 92 | case FLOAT32: 93 | std::forward<Function>(f)(TypeCase<float32_t>(), 94 | std::forward<Params>(params)...); 95 | break; 96 | case FLOAT64: 97 | std::forward<Function>(f)(TypeCase<float64_t>(), 98 | std::forward<Params>(params)...); 99 | break; 100 | case UNKNOWN: 101 | break; 102 | default: 103 | SLOG(FATAL) << "unexpected unknown datatype!"; 104 | } 105 | } 106 | 107 | size_t size()const { 108 | return dtype & 0xFF; 109 | } 110 | 111 | template<class T> 112 | static DataType from() { 113 | return DataType(inner_from(TypeCase<T>())); 114 | } 115 | 116 | friend bool operator==(DataType a, DataType b) { 117 | return a.dtype == b.dtype; 118 | } 119 | 120 | friend bool operator!=(DataType a, DataType b) { 121 | return a.dtype != b.dtype; 122 | } 123 | 124 | static DType inner_from(TypeCase<int8_t>) { return INT8; } 125 | static DType inner_from(TypeCase<int16_t>) { return INT16; } 126 | static DType inner_from(TypeCase<int32_t>) { return INT32; } 127 | static DType inner_from(TypeCase<int64_t>) { return INT64; } 128 | static DType inner_from(TypeCase<float32_t>) { return FLOAT32; } 129 | static DType inner_from(TypeCase<float64_t>) { return FLOAT64; } 130 | 131 | int dtype = FLOAT32; 132 | 133 | PICO_SERIALIZATION(dtype); 134 | }; 135 | 136 | 137 | } 138 | } 139 | } 140 | 141 | #endif --------------------------------------------------------------------------------
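DataType.h implements tag dispatch: `invoke` switches on the runtime `dtype` value once and hands the functor a statically typed `TypeCase<T>` tag, so the rest of the code is written as ordinary overloads on `T`. A trimmed, compilable sketch of the pattern (the two-type enum and the `SizeOf` visitor are illustrative, not from the repo):

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>

template<class T> struct TypeCase {};

// A visitor: one overload per supported tag.
struct SizeOf {
    void operator()(TypeCase<int8_t>, size_t& out) { out = sizeof(int8_t); }
    void operator()(TypeCase<float>, size_t& out)  { out = sizeof(float); }
};

enum DType { INT8, FLOAT32 };

// Map the runtime enum to a compile-time tag exactly once.
template<class Function, class... Params>
void invoke(DType dtype, Function&& f, Params&&... params) {
    switch (dtype) {
    case INT8:
        std::forward<Function>(f)(TypeCase<int8_t>(), std::forward<Params>(params)...);
        break;
    case FLOAT32:
        std::forward<Function>(f)(TypeCase<float>(), std::forward<Params>(params)...);
        break;
    }
}

int main() {
    size_t size = 0;
    invoke(FLOAT32, SizeOf(), size);
    std::printf("float32 size: %zu\n", size);   // prints 4
}
```

Note that the repo's enum additionally packs the element width into its low byte (FLOAT64 = 0x108), which is why `size()` can simply return `dtype & 0xFF`.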
/openembedding/variable/EmbeddingInitializer.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_INITIALIZER_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_INITIALIZER_H 3 | 4 | #include "DataType.h" 5 | #include "Factory.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template<class T> 12 | class EmbeddingInitializer: public Configurable { 13 | public: 14 | using weight_type = T; 15 | virtual std::string category() = 0; 16 | virtual void train_init(T* weights, size_t embedding_dim) = 0; 17 | }; 18 | 19 | template<class T> 20 | class EmbeddingConstantInitializer: public EmbeddingInitializer<T> { 21 | public: 22 | std::string category()override { return "constant"; } 23 | 24 | void train_init(T* weights, size_t embedding_dim) override { 25 | for (size_t i = 0; i < embedding_dim; ++i) { 26 | weights[i] = value; 27 | } 28 | } 29 | 30 | private: 31 | CONFIGURE_PROPERTY(T, value, 0.0); 32 | }; 33 | 34 | 35 | template<class T> 36 | class EmbeddingUniformInitializer: public EmbeddingInitializer<T> { 37 | public: 38 | std::string category()override { return "uniform"; } 39 | 40 | void load_config(const core::Configure& config) override { 41 | EmbeddingInitializer<T>::load_config(config); 42 | device = std::make_unique<std::random_device>(); 43 | engine = std::make_unique<std::default_random_engine>((*device)()); 44 | distribution = std::make_unique<std::uniform_real_distribution<T>>(minval, maxval); 45 | } 46 | 47 | void train_init(T* weights, size_t embedding_dim) override { 48 | for (size_t i = 0; i < embedding_dim; ++i) { 49 | weights[i] = (*distribution)(*engine); 50 | } 51 | } 52 | 53 | private: 54 | CONFIGURE_PROPERTY(T, minval, 0.0); 55 | CONFIGURE_PROPERTY(T, maxval, 1.0); 56 | std::unique_ptr<std::random_device> device; 57 | std::unique_ptr<std::default_random_engine> engine; 58 | std::unique_ptr<std::uniform_real_distribution<T>> distribution; 59 | }; 60 | 61 | template<class T> 62 | class EmbeddingNormalInitializer: public EmbeddingInitializer<T> { 63 | public: 64 | std::string category()override { return "normal"; } 65 | 66 | void load_config(const core::Configure& config) override { 67 | EmbeddingInitializer<T>::load_config(config); 68 | device = std::make_unique<std::random_device>(); 69 | engine = std::make_unique<std::default_random_engine>((*device)()); 70 | distribution = std::make_unique<std::normal_distribution<T>>(mean, stddev); 71 | } 72 | 73 | void train_init(T* weights, size_t embedding_dim) override { 74 | for (size_t i = 0; i < embedding_dim; ++i) { 75 | weights[i] = (*distribution)(*engine); 76 | if (truncated > 0.1) { 77 | while ((weights[i] - mean) / stddev > truncated) { 78 | weights[i] = (*distribution)(*engine); 79 | } 80 | } 81 | } 82 | } 83 | 84 | private: 85 | CONFIGURE_PROPERTY(T, mean, 0.0); 86 | CONFIGURE_PROPERTY(T, stddev, 1.0); 87 | CONFIGURE_PROPERTY(T, truncated, 0.0); 88 | std::unique_ptr<std::random_device> device; 89 | std::unique_ptr<std::default_random_engine> engine; 90 | std::unique_ptr<std::normal_distribution<T>> distribution; 91 | }; 92 | 93 | } 94 | } 95 | } 96 | 97 | #endif 98 | --------------------------------------------------------------------------------
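The normal initializer above re-draws a sample while it lands more than `truncated` standard deviations above the mean (only the upper tail is rejected, as written). A standalone sketch of that rejection loop using `<random>` directly, without the `Configure` plumbing (parameter values here are arbitrary):

```cpp
#include <cstdio>
#include <random>

int main() {
    double mean = 0.0, stddev = 1.0, truncated = 2.0;
    std::random_device device;
    std::default_random_engine engine(device());
    std::normal_distribution<double> distribution(mean, stddev);

    double weights[8];
    for (double& w : weights) {
        w = distribution(engine);
        // Reject and re-draw while the sample exceeds the truncation bound.
        while ((w - mean) / stddev > truncated) {
            w = distribution(engine);
        }
        std::printf("%f\n", w);
    }
}
```

A symmetric truncation, like tf.keras's TruncatedNormal, would test `std::fabs((w - mean) / stddev)` instead of the signed ratio.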
/openembedding/variable/EmbeddingVariable.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_H 2 | #define PARADIGM4_HYPEREMBEDDING_EMBEDDING_VARIABLE_H 3 | 4 | #include <memory> 5 | #include "Meta.h" 6 | #include "VariableAsyncTask.h" 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | struct EmbeddingVariableContext { 13 | int variable_id = 0; 14 | }; 15 | 16 | class EmbeddingVariableBase { 17 | using key_type = uint64_t; 18 | public: 19 | static std::unique_ptr<EmbeddingVariableBase> create(DataType datatype, size_t embedding_dim); 20 | virtual ~EmbeddingVariableBase() {} 21 | virtual void set_variable_context(const EmbeddingVariableContext&) = 0; 22 | virtual void load_config(const core::Configure& config) = 0; 23 | virtual void dump_config(core::Configure& config) = 0; 24 | virtual bool persist_config(size_t persist_pending_window, core::Configure& config) = 0; 25 | virtual bool should_persist() = 0; 26 | virtual void clear_weights() = 0; // clear the initializer and weights; the optimizer is unchanged; slots are reset. 27 | virtual size_t server_block_num_items() = 0; 28 | virtual void get_weights(const key_type* indices, size_t n, 29 | char* weights, char* states = nullptr) = 0; // thread safe 30 | virtual void set_weights(const key_type* indices, size_t n, 31 | const char* weights, const char* states = nullptr) = 0; 32 | 33 | virtual void pull_weights(const key_type* indices, size_t n, 34 | char* weights, VariableAsyncTask& async_task) = 0; // thread safe 35 | virtual void push_gradients(const key_type* indices, size_t n, 36 | const char* gradients, const key_type* counts, VariableAsyncTask& async_task) = 0; // thread safe 37 | virtual void update_weights() = 0; 38 | virtual size_t state_line_size() = 0; 39 | 40 | virtual size_t num_indices() = 0; 41 | virtual int create_reader() = 0; // thread safe 42 | virtual size_t read_indices(int reader_id, key_type* indices, size_t n) = 0; // thread safe for unique reader_id 43 | virtual uint64_t get_reader_cursor(int reader_id) = 0; // thread safe for unique reader_id 44 | virtual void delete_reader(int reader_id) = 0; // thread safe 45 | }; 46 | 47 | } 48 | } 49 | } 50 | 51 | #endif 52 | --------------------------------------------------------------------------------
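The reader trio on `EmbeddingVariableBase` (`create_reader` / `read_indices` / `delete_reader`) is a chunked-cursor scan: each reader id owns an independent cursor, which is why `read_indices` is documented as thread safe only per unique reader id. A toy table showing the same pattern in isolation (names and storage here are illustrative; the real variable streams keys out of its shard storage):

```cpp
#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

class ToyTable {
public:
    void insert(uint64_t key) { _keys.push_back(key); }

    int create_reader() {
        _cursors[_next_reader_id] = 0;     // each reader starts at the beginning
        return _next_reader_id++;
    }
    // Copy at most n keys starting at this reader's cursor; advance the cursor.
    size_t read_indices(int reader_id, uint64_t* out, size_t n) {
        size_t& cur = _cursors.at(reader_id);
        size_t count = 0;
        while (count < n && cur < _keys.size()) out[count++] = _keys[cur++];
        return count;                      // 0 signals the scan is finished
    }
    void delete_reader(int reader_id) { _cursors.erase(reader_id); }

private:
    std::vector<uint64_t> _keys;
    std::map<int, size_t> _cursors;        // reader_id -> position
    int _next_reader_id = 0;
};

int main() {
    ToyTable table;
    for (uint64_t k = 0; k < 10; ++k) table.insert(k * k);
    int reader = table.create_reader();
    uint64_t chunk[4];
    size_t n;
    while ((n = table.read_indices(reader, chunk, 4)) > 0) {
        for (size_t i = 0; i < n; ++i) std::printf("%llu ", (unsigned long long)chunk[i]);
        std::printf("\n");
    }
    table.delete_reader(reader);
}
```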
/openembedding/variable/Factory.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_FACTORY_H 2 | #define PARADIGM4_HYPEREMBEDDING_FACTORY_H 3 | 4 | #include <functional> 5 | #include <map> 6 | #include <memory> 7 | #include <vector> 8 | 9 | namespace paradigm4 { 10 | namespace pico { 11 | namespace embedding { 12 | 13 | template<class T> 14 | void LOAD_CONFIG_load_config(const core::Configure& config, const std::string& key, T& value) { 15 | if (config.has(key)) { 16 | value = config.get(key, value); 17 | } 18 | } 19 | 20 | template<class T> 21 | void SAVE_CONFIG_save_config(core::Configure& config, const std::string& key, const T& value) { 22 | config.node()[key] = value; 23 | } 24 | 25 | 26 | #define LOAD_CONFIG(config, x) do { \ 27 | LOAD_CONFIG_load_config((config), #x, (x)); \ 28 | } while(0) 29 | 30 | #define SAVE_CONFIG(config, x) do { \ 31 | SAVE_CONFIG_save_config((config), #x, (x)); \ 32 | } while(0) 33 | 34 | 35 | class Configurable: core::VirtualObject { 36 | public: 37 | 38 | virtual void dump_config(core::Configure& config)const { 39 | for (auto& dumper: _inner_dumpers) { 40 | dumper(config); 41 | } 42 | } 43 | 44 | virtual void load_config(const core::Configure& config) { 45 | for (auto& loader: _inner_loaders) { 46 | loader(config); 47 | } 48 | core::Configure self; 49 | dump_config(self); 50 | 51 | // bool has_default = false; 52 | // core::Configure defaults; 53 | // for (auto pair: self.node()) { 54 | // std::string key = pair.first.as<std::string>(); 55 | // if (!config.has(key)) { 56 | // has_default = true; 57 | // defaults.node()[key] = self.node()[key]; 58 | // } 59 | // } 60 | // if (has_default) { 61 | // SLOG(INFO) << "using default configure: \n" << defaults.dump(); 62 | // } 63 | 64 | bool has_unknown = false; 65 | core::Configure unknowns; 66 | for (auto pair: config.node()) { 67 | std::string key = pair.first.as<std::string>(); 68 | if (!self.has(key)) { 69 | has_unknown = true; 70 | unknowns.node()[key] = config.node()[key]; 71 | } 72 | } 73 | if (has_unknown) { 74 | SLOG(WARNING) << "unknown configure: \n" << unknowns.dump(); 75 | } 76 | } 77 | 78 | protected: 79 | std::vector<std::function<void(core::Configure&)>> _inner_dumpers; 80 | std::vector<std::function<void(const core::Configure&)>> _inner_loaders; 81 | }; 82 | 83 | template<class T> 84 | struct CONFIGURE_PROPERTY_LOADER { 85 | CONFIGURE_PROPERTY_LOADER(const char* key, T* p): key(key), p(p) {}
86 | void operator()(const core::Configure& config) { 87 | LOAD_CONFIG_load_config(config, key, *p); 88 | } 89 | const char* key; 90 | T* p; 91 | }; 92 | 93 | template<class T> 94 | struct CONFIGURE_PROPERTY_DUMPER { 95 | CONFIGURE_PROPERTY_DUMPER(const char* key, const T* p): key(key), p(p) {} 96 | void operator()(core::Configure& config) { 97 | SAVE_CONFIG_save_config(config, key, *p); 98 | } 99 | const char* key; 100 | const T* p; 101 | }; 102 | 103 | 104 | #define CONFIGURE_PROPERTY(type, name, default_value)\ 105 | public:\ 106 | type name = (default_value);\ 107 | private:\ 108 | bool name##_loader_dummy = (this->_inner_loaders.push_back(\ 109 | CONFIGURE_PROPERTY_LOADER<type>(#name, &this->name)), true);\ 110 | bool name##_dumper_dummy = (this->_inner_dumpers.push_back(\ 111 | CONFIGURE_PROPERTY_DUMPER<type>(#name, &this->name)), true);\ 112 | 113 | 114 | template<class T, class... Args> 115 | class Factory: core::VirtualObject { 116 | public: 117 | typedef std::function<std::unique_ptr<T>(Args...)> creator_type; 118 | virtual ~Factory() {} 119 | 120 | template<class DERIVED> 121 | bool register_creator(const std::string& category) { 122 | return _creators.emplace(category, creator<DERIVED>).second; 123 | } 124 | 125 | std::unique_ptr<T> create(const std::string& category, Args... args)const { 126 | if (_creators.count(category)) { 127 | return _creators.at(category)(args...); 128 | } else { 129 | std::string all_registered; 130 | for (auto& pair: _creators) { 131 | all_registered += pair.first + " "; 132 | } 133 | SLOG(WARNING) << "Cannot find \"" << category 134 | << "\" in the factory of " << core::readable_typename<T>() 135 | << ". Registered: " << all_registered; 136 | return nullptr; 137 | } 138 | } 139 | 140 | static Factory& singleton() { 141 | static Factory factory; 142 | return factory; 143 | } 144 | private: 145 | Factory() = default; 146 | template<class DERIVED> 147 | static std::unique_ptr<T> creator(Args... args) { 148 | return std::make_unique<DERIVED>(std::forward<Args>(args)...); 149 | } 150 | std::map<std::string, creator_type> _creators; 151 | }; 152 | 153 | 154 | } 155 | } 156 | } 157 | 158 | #endif --------------------------------------------------------------------------------
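`Factory` stores type-erased creators keyed by a category string, so concrete initializers and optimizers can register themselves without the factory knowing their types; `CONFIGURE_PROPERTY` then wires each field into the load/dump callback lists at construction time. A compilable miniature of the register/create flow (the `Shape`/`Square` types are illustrative, and the SLOG/VirtualObject machinery is stripped out):

```cpp
#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>

struct Shape {
    virtual ~Shape() {}
    virtual double area() const = 0;
};
struct Square: Shape {
    explicit Square(double s): side(s) {}
    double area() const override { return side * side; }
    double side;
};

template<class T, class... Args>
class MiniFactory {
public:
    using creator_type = std::function<std::unique_ptr<T>(Args...)>;

    // Register a derived type under a category string.
    template<class DERIVED>
    bool register_creator(const std::string& category) {
        return _creators.emplace(category, [](Args... args) {
            return std::unique_ptr<T>(new DERIVED(std::forward<Args>(args)...));
        }).second;
    }

    // Look up the creator; nullptr when the category is unknown.
    std::unique_ptr<T> create(const std::string& category, Args... args) const {
        auto it = _creators.find(category);
        return it == _creators.end() ? nullptr : it->second(args...);
    }

private:
    std::map<std::string, creator_type> _creators;
};

int main() {
    MiniFactory<Shape, double> factory;
    factory.register_creator<Square>("square");
    std::unique_ptr<Shape> s = factory.create("square", 3.0);
    std::printf("area: %f\n", s->area());      // 9.0
}
```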
/openembedding/variable/MpscGradientReducer.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_MPSC_GRADIENT_REDUCER_H 2 | #define PARADIGM4_HYPEREMBEDDING_MPSC_GRADIENT_REDUCER_H 3 | 4 | #include <cstdint> 5 | #include "EmbeddingInitializer.h" 6 | 7 | namespace paradigm4 { 8 | namespace pico { 9 | namespace embedding { 10 | 11 | template<class Key, class T> 12 | class MpscGradientReducer { 13 | public: 14 | using key_type = Key; 15 | struct block_type { 16 | const key_type* keys; 17 | size_t n; 18 | const T* gradients; 19 | const uint64_t* counts; 20 | }; 21 | 22 | MpscGradientReducer(size_t embedding_dim, key_type empty_key) 23 | : _embedding_dim(embedding_dim), _offsets(empty_key) {} 24 | 25 | // thread safe 26 | void push_gradients(block_type block) { 27 | _queue.push(std::move(block)); 28 | } 29 | 30 | block_type reduce_gradients() { 31 | block_type block; 32 | while (_queue.pop(block)) { 33 | const T* grad = block.gradients; 34 | for (size_t i = 0; i < block.n; ++i) { 35 | key_type key = block.keys[i]; 36 | if (_offsets.count(key)) { 37 | size_t offset = _offsets.at(key); 38 | T* sum = _gradients.data() + offset * _embedding_dim; 39 | for (size_t j = 0; j < _embedding_dim; ++j) { 40 | sum[j] += grad[j]; 41 | } 42 | _counts[offset] += block.counts[i]; 43 | } else { 44 | _offsets.force_emplace(key, _offsets.size()); 45 | _keys.push_back(key); 46 | _gradients.insert(_gradients.end(), grad, grad + _embedding_dim); 47 | _counts.push_back(block.counts[i]); 48 | } 49 | grad += _embedding_dim; 50 | } 51 | } 52 | return {_keys.data(), _keys.size(), _gradients.data(), _counts.data()}; 53 | } 54 | 55 | void clear() { 56 | _offsets.clear(); 57 | _keys.clear(); 58 | _gradients.clear(); 59 | _counts.clear(); 60 | } 61 | 62 | private: 63 | size_t _embedding_dim = 0; 64 | core::MpscQueue<block_type> _queue; 65 | EasyHashMap<key_type, size_t> _offsets; 66 | core::vector<key_type> _keys; 67 | core::vector<T> _gradients; 68 | core::vector<uint64_t> _counts; 69 | }; 70 | 71 | } 72 | } 73 | } 74 | 75 | #endif --------------------------------------------------------------------------------
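`MpscGradientReducer` lets many trainer threads enqueue raw gradient blocks while a single consumer drains the queue and folds duplicate keys into one dense row each. A single-threaded, standalone sketch of that reduce step, with `std::unordered_map`/`std::vector` standing in for `EasyHashMap`/`core::vector`:

```cpp
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

int main() {
    const size_t dim = 2;
    // Incoming rows; key 7 appears twice and should be summed.
    std::vector<uint64_t> keys = {7, 3, 7};
    std::vector<double> grads = {1, 1,  2, 2,  10, 10};   // 3 rows of dim 2

    std::unordered_map<uint64_t, size_t> offsets;  // key -> dense row index
    std::vector<uint64_t> out_keys;
    std::vector<double> out_grads;

    const double* g = grads.data();
    for (size_t i = 0; i < keys.size(); ++i, g += dim) {
        auto it = offsets.find(keys[i]);
        if (it != offsets.end()) {
            double* sum = out_grads.data() + it->second * dim;
            for (size_t j = 0; j < dim; ++j) sum[j] += g[j];   // accumulate duplicate key
        } else {
            offsets.emplace(keys[i], out_keys.size());
            out_keys.push_back(keys[i]);
            out_grads.insert(out_grads.end(), g, g + dim);     // first occurrence: copy the row
        }
    }
    for (size_t r = 0; r < out_keys.size(); ++r) {
        std::printf("key %llu: [%g, %g]\n", (unsigned long long)out_keys[r],
                out_grads[r * dim], out_grads[r * dim + 1]);
    }
    // prints: key 7: [11, 11] and key 3: [2, 2]
}
```

The result is exactly one dense update per key, which is what the server's optimizer step consumes.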
/openembedding/variable/PersistManager.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_PERSIST_MANAGER_H 2 | #define PARADIGM4_HYPEREMBEDDING_PERSIST_MANAGER_H 3 | 4 | #include <atomic> 5 | #include <string> 6 | #include <unistd.h> 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class PersistManager { 13 | PersistManager() = default; 14 | PersistManager(const PersistManager&) = default; 15 | public: 16 | class CacheManager { 17 | public: 18 | void initialize() { 19 | _cache_size.store(0); 20 | _acquired_size.store(0); 21 | } 22 | 23 | void set_cache_size(size_t cache_size) { 24 | _cache_size.store(cache_size); 25 | } 26 | 27 | bool acquire_cache(size_t size) { 28 | if (_acquired_size.fetch_add(size, std::memory_order_relaxed) + size > 29 | _cache_size.load(std::memory_order_relaxed)) { 30 | _acquired_size.fetch_sub(size, std::memory_order_relaxed); 31 | return false; 32 | } 33 | return true; 34 | } 35 | 36 | bool acquire_reserve_cache(size_t size) { 37 | if (3 * _acquired_size.load(std::memory_order_relaxed) < 38 | _cache_size.load(std::memory_order_relaxed)) { 39 | return acquire_cache(size); 40 | } 41 | return false; 42 | } 43 | 44 | void release_cache(size_t size) { 45 | _acquired_size.fetch_sub(size); 46 | } 47 | private: 48 | std::atomic<size_t> _cache_size = {0}; 49 | std::atomic<size_t> _acquired_size = {0}; 50 | }; 51 | 52 | static PersistManager& singleton() { 53 | static PersistManager manager; 54 | return manager; 55 | } 56 | 57 | bool use_pmem() { // server & client 58 | return !_pmem_pool_root_path.empty(); 59 | } 60 | 61 | void initialize(const std::string& path) { 62 | core::FileSystem::mkdir_p(path); 63 | _pmem_pool_root_path = path; 64 | _prefix = std::to_string(time(NULL)) + '-' + std::to_string(::getpid()); 65 | _next_pool_id.store(0); 66 | reserved_cache.initialize(); 67 | dynamic_cache.initialize(); 68 | } 69 | 70 | std::string new_pmem_pool_path() { 71 | SCHECK(use_pmem()); 72 | std::string name = std::to_string(_next_pool_id.fetch_add(1)); 73 | while (name.size() < 6) name = "0" + name; 74 | return _pmem_pool_root_path + "/" + _prefix + "-" + name; 75 | } 76 | 77 | CacheManager reserved_cache; 78 | CacheManager dynamic_cache; 79 | private: 80 | std::string _prefix; 81 | std::string _pmem_pool_root_path; 82 | std::atomic<size_t> _next_pool_id = {0}; 83 | }; 84 | 85 | } 86 | } 87 | } 88 | 89 | #endif --------------------------------------------------------------------------------
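`acquire_cache` above reserves optimistically: it adds `size` with a relaxed `fetch_add` and rolls the addition back if the budget was overshot, avoiding a compare-and-swap loop; `acquire_reserve_cache` additionally refuses once a third or more of the budget is in use. The same accounting in a standalone form (toy budget of 100):

```cpp
#include <atomic>
#include <cstdio>

std::atomic<size_t> cache_size{100};
std::atomic<size_t> acquired{0};

bool acquire_cache(size_t size) {
    // Optimistically reserve; undo the reservation if it overshoots the budget.
    if (acquired.fetch_add(size, std::memory_order_relaxed) + size >
            cache_size.load(std::memory_order_relaxed)) {
        acquired.fetch_sub(size, std::memory_order_relaxed);
        return false;
    }
    return true;
}

int main() {
    std::printf("%d\n", (int)acquire_cache(60));   // 1: 60 <= 100
    std::printf("%d\n", (int)acquire_cache(60));   // 0: would reach 120, rolled back
    std::printf("%d\n", (int)acquire_cache(40));   // 1: exactly fills the budget
}
```

The transient overshoot between the fetch_add and the fetch_sub is harmless here because `acquired` is only an admission counter, not a memory allocator.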
/openembedding/variable/VariableAsyncTask.h: -------------------------------------------------------------------------------- 1 | #ifndef PARADIGM4_HYPEREMBEDDING_ASYNC_OPERATOR_THREAD_POOL_H 2 | #define PARADIGM4_HYPEREMBEDDING_ASYNC_OPERATOR_THREAD_POOL_H 3 | 4 | #include <atomic> 5 | #include <functional> 6 | #include <thread> 7 | 8 | namespace paradigm4 { 9 | namespace pico { 10 | namespace embedding { 11 | 12 | class VariableAsyncTask { 13 | public: 14 | static void wait(std::atomic<size_t>& _counter) { 15 | for (int tests = 0; unlikely(_counter.load(std::memory_order_acquire)); ++tests) { 16 | if (tests < 128) { 17 | cpu_relax(); 18 | } else { 19 | static constexpr std::chrono::microseconds us0{0}; 20 | std::this_thread::sleep_for(us0); 21 | } 22 | } 23 | } 24 | 25 | VariableAsyncTask() {} 26 | VariableAsyncTask(int thread_id, std::atomic<size_t>& counter, core::RWSpinLock& shard_lock) 27 | : _thread_id(thread_id), _counter(&counter), _shard_lock(&shard_lock) {} 28 | VariableAsyncTask(const VariableAsyncTask&) = delete; 29 | VariableAsyncTask(VariableAsyncTask&& other) = default; 30 | 31 | VariableAsyncTask& operator=(VariableAsyncTask other) { 32 | SCHECK(_done == nullptr); 33 | new (this) VariableAsyncTask(std::move(other)); 34 | return *this; 35 | } 36 | 37 | ~VariableAsyncTask() {} 38 | 39 | explicit operator bool() { 40 | return _done.operator bool(); 41 | } 42 | 43 | int thread_id() { 44 | return _thread_id; 45 | } 46 | 47 | void done() { 48 | SCHECK(_done); 49 | if (_shard_lock) { 50 | core::lock_guard guard(*_shard_lock); 51 | _done(); 52 | } else { 53 | _done(); 54 | } 55 | _entity = nullptr; 56 | _done = nullptr; 57 | _counter->fetch_sub(1, std::memory_order_relaxed); 58 | } 59 | 60 | void set_done(std::function<void()>&& done) { 61 | SCHECK(_done == nullptr && _counter); 62 | if (done) { 63 | _counter->fetch_add(1, std::memory_order_relaxed); 64 | _done = std::move(done); 65 | } 66 | } 67 | 68 | void hold_entity(const std::shared_ptr<void>& entity) { 69 | _entity = entity; 70 | } 71 | 72 | private: 73 | size_t _thread_id = 0; 74 | std::atomic<size_t>* _counter = nullptr; 75 | core::RWSpinLock* _shard_lock = nullptr; 76 | std::shared_ptr<void> _entity = nullptr; 77 | std::function<void()> _done; 78 | }; 79 | 80 | class VariableAsyncTaskThreadPool { 81 | public: 82 | static VariableAsyncTaskThreadPool& singleton() { 83 | static VariableAsyncTaskThreadPool pool; 84 | return pool; 85 | } 86 | 87 | void submit(VariableAsyncTask&& async_task) { 88 | SCHECK(_initialized); 89 | core::lock_guard guard(_lock); 90 | size_t num_tasks = _num_tasks.load(std::memory_order_relaxed) + 1; 91 | _num_tasks.store(num_tasks, std::memory_order_relaxed); 92 | _tasks.push_back(std::move(async_task)); 93 | if (_tasks.size() >= _batch_num_tasks) { 94 | for (VariableAsyncTask& task: _tasks) { 95 | if (task) { 96 | _channels[task.thread_id() % _threads.size()]->send(std::move(task)); 97 | } 98 | } 99 | _tasks.clear(); 100 | } 101 | } 102 | 103 | // very ill-formed! TODO: remove 104 | void initialize_batch_task() { 105 | if (_batch_num_tasks.load(std::memory_order_relaxed) == 0 && 106 | _num_tasks.load(std::memory_order_relaxed) != 0) { 107 | core::lock_guard guard(_lock); 108 | if (_batch_num_tasks.load() == 0) { 109 | SLOG(INFO) << "set batch num tasks " << _num_tasks.load(); 110 | _batch_num_tasks.store(_num_tasks); 111 | } 112 | } 113 | } 114 | 115 | void initialize(size_t thread_num) { 116 | SCHECK(!_initialized); 117 | _initialized = true; 118 | _num_tasks.store(0); 119 | _batch_num_tasks.store(0); 120 | _threads.resize(thread_num); 121 | _channels.resize(thread_num); 122 | for (size_t i = 0; i < _threads.size(); ++i) { 123 | _channels[i] = std::make_unique<core::RpcChannel<VariableAsyncTask>>(); 124 | _threads[i] = std::thread(&VariableAsyncTaskThreadPool::running, this, i); 125 | } 126 | } 127 | 128 | void finalize() { 129 | SCHECK(_initialized); 130 | for (size_t i = 0; i < _threads.size(); ++i) { 131 | _channels[i]->terminate(); 132 | _threads[i].join(); 133 | } 134 | _initialized = false; 135 | } 136 | 137 | private: 138 | void running(size_t i) { 139 | VariableAsyncTask task; 140 | while (_channels[i]->recv(task, -1)) { 141 | // must finalize the task inside the loop 142 | VariableAsyncTask done = std::move(task); 143 | done.done(); 144 | } 145 | } 146 | 147 | bool _initialized = false; 148 | std::vector<std::thread> _threads; 149 | std::vector<std::unique_ptr<core::RpcChannel<VariableAsyncTask>>> _channels; 150 | 151 | core::RWSpinLock _lock; 152 | std::atomic<size_t> _num_tasks = {0}; 153 | std::atomic<size_t> _batch_num_tasks = {0}; 154 | std::vector<VariableAsyncTask> _tasks; 155 | }; 156 | 157 | 158 | } 159 | } 160 | } 161 | 162 | #endif 163 | --------------------------------------------------------------------------------
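`VariableAsyncTask::wait` spins for the first ~128 probes and then backs off with `sleep_for(0)`, so a counter that is about to hit zero is caught immediately without burning a core on long waits. A portable rendering of that backoff (`std::this_thread::yield()` stands in for `cpu_relax()`, which in pico-core is presumably a CPU pause hint; this sketch is not the repo's code):

```cpp
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

void wait(std::atomic<size_t>& counter) {
    for (int tests = 0; counter.load(std::memory_order_acquire); ++tests) {
        if (tests < 128) {
            std::this_thread::yield();                                     // cheap spin phase
        } else {
            std::this_thread::sleep_for(std::chrono::microseconds(0));    // back off to the scheduler
        }
    }
}

int main() {
    std::atomic<size_t> pending{1};
    std::thread worker([&] {
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
        pending.fetch_sub(1, std::memory_order_release);                   // async task done
    });
    wait(pending);
    std::printf("all async tasks finished\n");
    worker.join();
}
```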
/openembedding/variable/pmem_embedding_table_test.cpp: -------------------------------------------------------------------------------- 1 | #include <gtest/gtest.h> 2 | #include "PmemEmbeddingTable.h" 3 | #include <string> 4 | 5 | namespace paradigm4 { 6 | namespace pico { 7 | namespace embedding { 8 | 9 | std::string pmem_pool_root_path = "/mnt/pmem0/tmp/exb_pmem_test"; 10 | TEST(PmemEmbeddingTable, MultipleGetAndSet) { 11 | PersistManager::singleton().initialize(pmem_pool_root_path); 12 | PmemEmbeddingArrayTable<double> pt(64, -1); 13 | PersistManager::singleton().dynamic_cache.set_cache_size(pt.cache_item_memory_cost()); 14 | 15 | size_t total_items = 5; 16 | for (size_t j = 0; j < total_items; ++j){ 17 | ASSERT_EQ(j, pt.work_id()); 18 | ASSERT_EQ(nullptr, pt.get_value(j)); 19 | double* value = pt.set_value(j); 20 | for(size_t i = 0; i < 64; ++i){ 21 | value[i] = i + j; 22 | } 23 | const double* get = pt.get_value(j); 24 | for(size_t i = 0; i < 64; ++i){ 25 | ASSERT_EQ(double(i + j), get[i]); 26 | } 27 | pt.next_work(); 28 | } 29 | ASSERT_EQ(total_items, pt.work_id()); 30 | 31 | for (size_t k = 0; k < total_items; ++k){ 32 | const double* tmp = pt.get_value(k); 33 | for(size_t i = 0; i < 64; ++i) { 34 | ASSERT_EQ(double(i + k), tmp[i]); 35 | } 36 | } 37 | 38 | pt.start_commit_checkpoint(); 39 | ASSERT_EQ(pt.checkpoints().size(), 0); 40 | pt.flush_committing_checkpoint(); 41 | ASSERT_EQ(pt.checkpoints().size(), 1); 42 | 43 | for (size_t j = 0; j < total_items; ++j){ 44 | const double* get = pt.get_value(j); 45 | for(size_t i = 0; i < 64; ++i){ 46 | ASSERT_EQ(double(i + j), get[i]); 47 | } 48 | double* value = pt.set_value(j); 49 | for(size_t i = 0; i < 64; ++i){ 50 | value[i] = i + j; 51 | } 52 | pt.next_work(); 53 | } 54 | core::FileSystem::rmrf(pmem_pool_root_path); 55 | } 56 | 57 | TEST(PmemEmbeddingTable, SingleCheckpoint) { 58 | PersistManager::singleton().initialize(pmem_pool_root_path); 59 | PmemEmbeddingHashTable<double> pt(64, -1); 60 | PersistManager::singleton().dynamic_cache.set_cache_size(pt.cache_item_memory_cost() * 5); 61 | 62 | double* tmp; 63 | EXPECT_EQ(0, pt.work_id()); 64 | EXPECT_EQ(0, pt.checkpoints().size()); 65 | 66 | for(size_t j=0; j<5; ++j){ 67 | EXPECT_EQ(j, pt.work_id()); 68 | EXPECT_EQ(nullptr, pt.get_value(j)); 69 | tmp = pt.set_value(j); 70 | for(size_t i=0; i<64; ++i){ 71 | *tmp = double(i+j); 72 | ++tmp; 73 | } 74 | tmp = (double *)pt.get_value(j); 75 | for(size_t i=0; i<64; ++i){ 76 | EXPECT_EQ(double(i+j), *tmp); 77 | ++tmp; 78 | } 79 | pt.next_work(); 80 | } 81 | EXPECT_EQ(5, pt.work_id()); 82 | pt.start_commit_checkpoint(); //_committing=5 83 | 84 | EXPECT_EQ(0, pt.checkpoints().size()); 85 | 86 | tmp = pt.set_value(0); 87 | for(size_t i=0; i<64; ++i){ 88 | *tmp = (*tmp) + 10; 89 | ++tmp; 90 | } 91 | EXPECT_EQ(5, pt.work_id()); 92 | EXPECT_EQ(0, pt.checkpoints().size()); 93 | 94 | for(int k=1; k<5; ++k){ 95 | tmp = pt.set_value(k); 96 | for(size_t i=0; i<64; ++i){ 97 | *tmp = (*tmp) + 10; 98 | ++tmp; 99 | } 100 | } 101 | pt.next_work(); 102 | EXPECT_EQ(6, pt.work_id()); 103 | EXPECT_EQ(1, pt.checkpoints().size()); 104 | 105 | tmp = pt.set_value(0); 106 | for(size_t i=0; i<64; ++i){ 107 | *tmp = (*tmp) + 10; 108 | ++tmp; 109 | } 110 | pt.next_work(); 111 | EXPECT_EQ(7, pt.work_id()); 112 | EXPECT_EQ(1, pt.checkpoints().size()); 113 | 114 | for(size_t k=0; k<100; ++k){ 115 | tmp = pt.set_value(0); 116 | for(size_t i=0; i<64; ++i){ 117 | *tmp = (*tmp) + 10; 118 | ++tmp; 119 | } 120 | //pt.next_work(); 121 | } 122 | pt.next_work(); 123 | EXPECT_EQ(8, pt.work_id()); 124 | EXPECT_EQ(1, pt.checkpoints().size()); 125 | 126 | for(int k=5; k>=0; --k){ 127 | tmp = pt.set_value(k); 128 | for(size_t i=0; i<64; ++i){ 129 | *tmp = (*tmp) + 10; 130 | ++tmp; 131 | } 132 | pt.next_work(); 133 | } 134 | EXPECT_EQ(14, pt.work_id()); 135 | EXPECT_EQ(1, pt.checkpoints().size()); 136 | 137 | if(pt.checkpoints().size()>=2){ 138 | pt.pop_checkpoint(); 139 | } 140 | pt.next_work(); 141 | EXPECT_EQ(15, pt.work_id()); 142 | EXPECT_EQ(1, pt.checkpoints().size()); 143 | 144 | core::FileSystem::rmrf(pmem_pool_root_path); 145 | } 146 | 147 | 148 | 149 | } 150 | } 151 | } 152 | 153 | int main(int argc, char* argv[]) { 154 | testing::InitGoogleTest(&argc, argv); 155 | int ret = RUN_ALL_TESTS(); 156 | return ret; 157 | } 158 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import setuptools 6 | import setuptools.command.build_ext 7 | import distutils.errors 8 | import distutils.sysconfig 9 | import openembedding_setup 10 | 11 | 12 | work_path = os.path.dirname(os.path.realpath(__file__)) + '/' 13 | cpp_flags = ['--std=c++14', '-Wall', '-Wextra', '-frecord-gcc-switches', '-fPIC'] 14 | link_flags = ['-lcexb_pack', '-L' + work_path + 'openembedding'] 15 | libexb = setuptools.Extension('openembedding.libexb', []) 16 | tensorflow_exb_ops = setuptools.Extension('openembedding.tensorflow.exb_ops', []) 17 | 18 | 19 | class custom_build_ext(setuptools.command.build_ext.build_ext): 20 | def build_extensions(self): 21 | self.build_core_extension() 22 | self.build_tensorflow_extension() 23 | 24 | def build_core_extension(self): 25 | import pybind11 26 | libexb.sources = ['openembedding/entry/py_api.cc'] 27 | libexb.extra_compile_args = cpp_flags + ['-I' + pybind11.get_include()] 28 | libexb.extra_link_args = link_flags 29 | distutils.sysconfig.customize_compiler(self.compiler) 30 | self.build_extension(libexb) 31 | 32 | def build_tensorflow_extension(self): 33 | import tensorflow as tf 34 | tensorflow_exb_ops.sources = ['openembedding/tensorflow/exb_ops.cpp'] 35 | tensorflow_exb_ops.extra_compile_args = cpp_flags + tf.sysconfig.get_compile_flags() 36 | tensorflow_exb_ops.extra_link_args = link_flags + tf.sysconfig.get_link_flags() 37 | distutils.sysconfig.customize_compiler(self.compiler) 38 | self.build_extension(tensorflow_exb_ops) 39 | 40 | 41 | import textwrap 42 | setuptools.setup( 43 | name='openembedding', 44 | version=openembedding_setup.__version__, 45 | description='Distributed framework to accelerate training and support serving.', 46 | author='4paradigm', 47 | author_email='opensource@4paradigm.com', 48 | long_description=textwrap.dedent('''\ 49 | OpenEmbedding is a distributed framework to accelerate TensorFlow training and 50 | support TensorFlow Serving. It uses the parameter server architecture to store 51 | the Embedding Layer, so that the memory of a single machine does not limit the model size. 52 | OpenEmbedding can cooperate with all-reduce frameworks to support both data parallelism 53 | and model parallelism.'''), 54 | url='https://github.com/4paradigm/OpenEmbedding', 55 | keywords=['deep learning', 'tensorflow', 'keras', 'AI'], 56 | classifiers=[ 57 | 'Programming Language :: Python :: 3', 58 | 'Development Status :: 2 - Pre-Alpha', 59 | 'Operating System :: POSIX :: Linux', 60 | 'License :: OSI Approved :: Apache Software License'], 61 | python_requires='>=3.6', 62 | setup_requires=['pybind11'], 63 | extras_require={'tensorflow':['tensorflow']}, 64 | packages=setuptools.find_packages(), 65 | package_data={'': [work_path + 'openembedding/libcexb_pack.so']}, 66 | ext_modules=[libexb, tensorflow_exb_ops], 67 | cmdclass={'build_ext': custom_build_ext}) 68 | -------------------------------------------------------------------------------- /test/benchmark/criteo_deepctr_torch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas 3 | import torch 4 | import horovod.torch as hvd 5 | from deepctr_torch.inputs import SparseFeat, DenseFeat 6 | from deepctr_torch.models import WDL, DeepFM, xDeepFM 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--data', required=True) 11 | parser.add_argument('--optimizer', default='Adagrad', choices=['Adagrad']) 12 | parser.add_argument('--model', default="DeepFM", choices=["WDL", 'DeepFM', 'xDeepFM']) 13 | parser.add_argument('--embedding_dim', default=9, type=int) 14 | parser.add_argument('--batch_size', default=4096, type=int) 15 | parser.add_argument('--epochs', default=2, type=int) 16 | parser.add_argument('--cpu', action='store_true') 17 | args = parser.parse_args() 18 | hvd.init() 19 | if args.cpu: 20 | device = 'cpu' 21 | else: 22 | #torch.cuda.set_device(hvd.local_rank()) 23 | device = 'cuda:{}'.format(hvd.local_rank()) 24 | 25 | if __name__ == "__main__": 26 | data = pandas.read_csv(args.data) 27 | num_lines = data.shape[0] 28 | num_local_lines = int(num_lines / hvd.size()) // args.batch_size * args.batch_size 29 | local_start = hvd.local_rank() * num_local_lines 30 | local_end = local_start + num_local_lines 31 | print("num_lines:%d, num_local_lines:%d" % (num_lines, num_local_lines)) 32 | print("local_start:%d, local_end:%d" % (local_start, local_end)) 33 | 34 | target = ['label'] 35 | dense_features = ['I' + str(i) for i in range(1, 14)] 36 | sparse_features = ['C' + str(i) for i in range(1, 27)] 37 | print(data.columns) 38 | 39 | feature_columns = [] 40 | for name in sparse_features: 41 | feature_columns.append(SparseFeat(name, data[name].max() + 1, dtype='int64')) 42 | for name in dense_features: 43 | feature_columns.append(DenseFeat(name, 1, dtype='float32')) 44 | train = data.iloc[local_start:local_end] 45 | train_model_input = {name:train[name] for name in sparse_features + dense_features} 46 | 47 | if args.model == 'WDL': 48 | fc_sizes = (512, 256, 128, 32) 49 | elif args.model in {'DeepFM', 'xDeepFM'}: 50 | fc_sizes = (400, 400, 400) 51 | else: 52 | raise ValueError('unknown model ' + args.model) 53 | model = eval(args.model)(feature_columns, feature_columns, device=device, 54 | task='binary', dnn_hidden_units=fc_sizes, l2_reg_linear=0, l2_reg_embedding=0) 55 | 56 | optimizer = torch.optim.Adagrad(model.parameters()) 57 | if hvd.size() > 1: 58 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), op=hvd.Sum) 59 | hvd.broadcast_optimizer_state(optimizer, root_rank=0) 60 |
hvd.broadcast_parameters(model.state_dict(), root_rank=0) 61 | model.compile(optimizer, "binary_crossentropy", metrics=["binary_crossentropy", "auc"]) 62 | history = model.fit(train_model_input, train[target].values, 63 | batch_size=args.batch_size, epochs=args.epochs, verbose=2) 64 | -------------------------------------------------------------------------------- /test/benchmark/criteo_tfrecord.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas 3 | import tensorflow as tf 4 | 5 | if len(sys.argv) < 4: 6 | print('usage: criteo_tfrecord.py input pid np') 7 | sys.exit(1) 8 | def serialize_example(train, j): 9 | fea_desc = {} 10 | for name, column in train.items(): 11 | if name[0] == 'I': 12 | # dense feature 13 | fea_desc[name] = tf.train.Feature(float_list=tf.train.FloatList(value=[float(column[j])])) 14 | else: 15 | # label or sparse feature 16 | fea_desc[name] = tf.train.Feature(int64_list=tf.train.Int64List(value=[int(column[j])])) 17 | example_proto = tf.train.Example(features=tf.train.Features(feature=fea_desc)) 18 | return example_proto.SerializeToString() 19 | 20 | data = pandas.read_csv(sys.argv[1]) 21 | pid = int(sys.argv[2]) 22 | np = int(sys.argv[3]) 23 | 24 | target = ['label'] 25 | dense_features = ['I' + str(i) for i in range(1, 14)] 26 | sparse_features = ['C' + str(i) for i in range(1, 27)] 27 | columns = target + dense_features + sparse_features 28 | train = {name:data[name] for name in columns} 29 | 30 | count = 1000000 31 | for start in range(count * pid, data.shape[0], count * np): 32 | end = start + count 33 | if end > data.shape[0]: 34 | end = data.shape[0] 35 | name = str(start // count + 1) 36 | while len(name) < 5: 37 | name = '0' + name 38 | with tf.io.TFRecordWriter("./tfrecord/tf-part.{}".format(name)) as writer: 39 | for j in range(start, end): 40 | example = serialize_example(train, j) 41 | writer.write(example) 42 | 43 | if pid == 0: 44 | with open('./tfrecord/meta', 'w') as writer: 45 | for name in sparse_features: 46 | writer.write('{} {}\n'.format(name, data[name].max() + 1)) 47 | -------------------------------------------------------------------------------- /test/benchmark/server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import psutil 5 | from threading import Thread 6 | import openembedding as embed 7 | 8 | import argparse 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--bind_ip', default='') 11 | parser.add_argument('--server_concurrency', default=28, type=int) 12 | 13 | # For the paper experiments 14 | parser.add_argument('--pmem', default='') 15 | parser.add_argument('--cache_size', default=1000, type=int) 16 | 17 | args = parser.parse_args() 18 | if args.pmem: 19 | embed.flags.config = ('{"server":{"server_concurrency":%d' 20 | ',"pmem_pool_root_path":"%s", "cache_size":%d } }') % ( 21 | args.server_concurrency, args.pmem, args.cache_size) 22 | else: 23 | embed.flags.config = '{"server":{"server_concurrency":%d } }' % ( 24 | args.server_concurrency) 25 | 26 | 27 | def print_rss(): 28 | print(psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024, 'GB', flush=True) 29 | 30 | 31 | def start(): 32 | if len(sys.argv) > 1: 33 | embed.flags.bind_ip = args.bind_ip 34 | _master = embed.Master() 35 | print(_master.endpoint) 36 | embed.flags.bind_ip = embed.flags.bind_ip[:embed.flags.bind_ip.find(':')] 37 | embed.flags.master_endpoint = _master.endpoint 38 | _server = embed.Server() 39 | _server.join() 40 | 41 | 42 | i = 0 43 | print_rss() 44 | th = Thread(target=start, args=[]) 45 | th.start() 46 | while th.is_alive(): 47 | i += 1 48 | th.join(0.1) 49 | if i % 100 == 0: 50 | print_rss() 51 | print_rss() -------------------------------------------------------------------------------- /test/criteo_preprocess.cpp: -------------------------------------------------------------------------------- 1 | #include <algorithm> 2 | #include <gflags/gflags.h> 3 | #include <limits> 4 | 5 | namespace paradigm4 { 6 | namespace pico { 7 | 8 | class LabelEncoder { 9 | public: 10 | LabelEncoder(): _encoder(-1) {} 11 | size_t encode(int64_t key) { 12 | if (key == -1) { 13 | key = std::numeric_limits<int64_t>::max(); 14 | } 15 | return _encoder.try_emplace(key, _encoder.size()).first->second; 16 | } 17 | 18 | size_t unique_count() { 19 | return _encoder.size(); 20 | } 21 | private: 22 | EasyHashMap<int64_t, size_t> _encoder; 23 | }; 24 | 25 | class TSVProcesser { 26 | public: 27 | TSVProcesser(size_t dense_features, size_t sparse_features, size_t repeat) 28 | : _dense_features(dense_features), _sparse_features(sparse_features), _repeat(repeat), 29 | _encoders(sparse_features), _key_labels(sparse_features), 30 | _buffer(64 * (sparse_features + dense_features + 1)), _out_buffer(_buffer.size()) {} 31 | 32 | size_t process(FILE* in, FILE* out) { 33 | if (!fgets(_buffer.data(), _buffer.size(), in)) return 0; 34 | size_t n = strlen(_buffer.data()); 35 | if (n == 0) { 36 | return 0; 37 | } 38 | size_t i = 0; 39 | for (size_t k = 0; k < _dense_features + 1; ++k) { 40 | skip_dense(_buffer.data(), i); 41 | ++i; 42 | } 43 | size_t sparse_start = i; 44 | memcpy(_out_buffer.data(), _buffer.data(), sparse_start); 45 | for (size_t k = 0; k < _sparse_features; ++k) { 46 | uint64_t key = parse_sparse(_buffer.data(), i); 47 | _key_labels[k] = _encoders[k].encode(key); 48 | ++i; 49 | } 50 | for (size_t row = 0; row < _repeat; ++row) { 51 | size_t i = sparse_start; 52 | for (size_t k = 0; k < _sparse_features; ++k) { 53 | output_sparse(_out_buffer.data(), i, _key_labels[k] * _repeat + row); 54 | _out_buffer[i] = k == _sparse_features - 1 ? '\0' : '\t'; 55 | ++i; 56 | } 57 | fprintf(out, "%s\n", _out_buffer.data()); 58 | } 59 | return _repeat; 60 | } 61 | 62 | void skip_dense(char* buffer, size_t& i) { 63 | while (buffer[i] && buffer[i] != '\t') ++i; 64 | } 65 | 66 | int64_t parse_sparse(char* buffer, size_t& i) { 67 | if (buffer[i] == '\0' || buffer[i] == '\t') { 68 | return -1; 69 | } 70 | int64_t result = 0; 71 | while (buffer[i] && buffer[i] != '\t') { 72 | int val = buffer[i] <= '9' && buffer[i] >= '0' ? buffer[i] - '0' : buffer[i] - 'a' + 10; 73 | result = result * 16 + val; 74 | ++i; 75 | } 76 | return result; 77 | } 78 | 79 | void output_sparse(char* buffer, size_t& i, size_t key) { 80 | size_t p = i; 81 | do { 82 | buffer[i] = '0' + key % 10; 83 | key /= 10; 84 | ++i; 85 | } while (key != 0); 86 | std::reverse(buffer + p, buffer + i); 87 | } 88 | 89 | size_t unique_count(size_t sparse_feature) { 90 | return _encoders[sparse_feature].unique_count() * _repeat; 91 | } 92 | private: 93 | size_t _dense_features = 0; 94 | size_t _sparse_features = 0; 95 | size_t _repeat = 0; 96 | 97 | std::vector<LabelEncoder> _encoders; 98 | std::vector<size_t> _key_labels; 99 | 100 | std::vector<char> _buffer; 101 | std::vector<char> _out_buffer; 102 | 103 | }; 104 | 105 | 106 | void process(std::string input_dir, std::string output_dir, size_t file_lines, size_t repeat) { 107 | int day = 1; 108 | size_t lines = 0; 109 | auto fout = core::ShellUtility::open(output_dir + "/day_" + std::to_string(day), "w"); 110 | TSVProcesser processer(13, 26, repeat); 111 | for (std::string input_file: FileSystem::get_file_list(input_dir, "")) { 112 | SLOG(INFO) << input_file; 113 | auto fin = core::ShellUtility::open(input_file, "r"); 114 | while (!feof(fin.get())) { 115 | if (lines >= file_lines) { 116 | SLOG(INFO) << "day_" << day << " generated"; 117 | ++day; 118 | lines = 0; 119 | fout = core::ShellUtility::open(output_dir + "/day_" + std::to_string(day), "w"); 120 | } 121 | lines += processer.process(fin.get(), fout.get()); 122 | } 123 | } 124 | SLOG(INFO) << "day_" << day << " generated"; 125 | 126 | fout = core::ShellUtility::open(output_dir + "/meta", "w"); 127 | for (int i = 0; i < 26; ++i) { 128 | std::string str = pico_lexical_cast<std::string>(processer.unique_count(i)); 129 | fprintf(fout.get(), "C%d %s\n", i + 1, str.c_str()); 130 | } 131 | } 132 | 133 | } 134 | } // namespace paradigm4 135 | 136 | DEFINE_string(output, "", ""); 137 | DEFINE_string(input, "", ""); 138 | DEFINE_int32(file_lines, 10000000, ""); 139 | DEFINE_int32(repeat, 2, ""); 140 | 141 | int main(int argc, char* argv[]) { 142 | google::ParseCommandLineFlags(&argc, &argv, false); 143 | paradigm4::pico::process(FLAGS_input, FLAGS_output, FLAGS_file_lines, FLAGS_repeat); 144 | return 0; 145 | } --------------------------------------------------------------------------------
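The `LabelEncoder` above assigns each distinct (hash-valued) Criteo feature a dense id in first-seen order, remapping -1 away because it is the hash map's reserved empty key. The same idea standalone, with `std::unordered_map` in place of `EasyHashMap`:

```cpp
#include <cstdint>
#include <cstdio>
#include <limits>
#include <unordered_map>

class LabelEncoder {
public:
    size_t encode(int64_t key) {
        if (key == -1) {
            key = std::numeric_limits<int64_t>::max();  // keep -1 usable as a sentinel
        }
        // First occurrence gets the next dense id; repeats return the stored id.
        return _encoder.emplace(key, _encoder.size()).first->second;
    }
    size_t unique_count() const { return _encoder.size(); }
private:
    std::unordered_map<int64_t, size_t> _encoder;
};

int main() {
    LabelEncoder enc;
    std::printf("%zu %zu %zu %zu\n",
            enc.encode(0xabc), enc.encode(-1), enc.encode(0xabc), enc.encode(42));
    // prints: 0 1 0 2
}
```

Dense first-seen ids are exactly what the preprocessor needs, since downstream the embedding table is sized by the per-column `unique_count`.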
/test/optimizer_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import tensorflow as tf 3 | import openembedding.tensorflow as embed 4 | 5 | 6 | def run_tf_optimizer(optimizer, gradients): 7 | optimizer = optimizer.__class__.from_config(optimizer.get_config()) 8 | var = tf.Variable(tf.ones(gradients[0].shape, gradients[0].dtype)) 9 | for grad in gradients: 10 | optimizer.apply_gradients([(grad, var)]) 11 | return var.read_value() 12 | 13 | 14 | def run_my_optimizer(optimizer, gradients): 15 | var = embed.Embedding(gradients[0].shape[0], gradients[0].shape[1], 16 | tf.keras.initializers.Constant(1.0), dtype=gradients[0].dtype) 17 | indices = tf.range(var.input_dim) 18 | var.build(indices.shape) 19 | var.variable.set_server_optimizer(optimizer) 20 | for grad in gradients: 21 | fakegrad = var.variable.push_gradients(indices, grad) 22 | var.variable.update_weights(fakegrad) 23 | return var.variable.sparse_read(indices) 24 | 25 | from tensorflow.keras.optimizers import * 26 | 27 | gradients1d = [ tf.ones([1, 1], dtype=tf.float64) ] 28 | gradients10d = [ tf.random.uniform([111, 11], -1, 1, dtype=tf.float64) for i in range(10) ] 29 | gradients100d = [ tf.random.uniform([111, 11], -1, 1, dtype=tf.float64) for i in range(100) ] 30 | gradients1 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients1d ] 31 | gradients10 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients10d ] 32 | gradients100 = [ tf.cast(tensor, dtype=tf.float32) for tensor in gradients100d ] 33 | optimizers = [ 34 | Adadelta(), Adadelta(0.1), Adadelta(0.1, rho=0.8), 35 | Adagrad(), Adagrad(0.1), Adagrad(0.1, 1000), 36 | Adam(), Adam(0.1), Adam(0.1, beta_1=0.8, beta_2=0.97), 37 | Adamax(), Adamax(0.1), Adamax(0.1, beta_1=0.8, beta_2=0.97), 38 | Ftrl(), Ftrl(0.1), 39 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.05, 'Ftrl', 0), 40 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.05, 'Ftrl', 0, 0.05), 41 | Ftrl(0.1, -0.5, 0.1, 0.00, 0.05, 'Ftrl', 0), 42 | Ftrl(0.1, -0.5, 0.1, 0.00, 0.05, 'Ftrl', 0, 0.1), 43 | Ftrl(0.1, -0.5, 0.1, 0.01, 0.01, 'Ftrl', 0.05), 44 | Ftrl(0.1, -0.5, 0.1, 0.05, 0.00, 'Ftrl', 0), 45 | Ftrl(0.1, -0.5, 10, 0.00, 0.05, 'Ftrl', 0), 46 | Ftrl(0.1, -0.5, 10, 0.00, 0.05, 'Ftrl', 0, 0.5), 47 | Ftrl(0.1, -0.5, 10, 0.01, 0.01, 'Ftrl', 0.05), 48 | Ftrl(0.1, -0.5, 10, 0.05, 0.01, 'Ftrl', 0.05), 49 | RMSprop(), RMSprop(0.1, rho=0.8), RMSprop(0.1, momentum=0.5), RMSprop(rho=0.7, momentum=0.7), 50 | SGD(), SGD(0.1), SGD(momentum=0.5) 51 | ] 52 | 53 | 54 | all_results = [] 55 | for gradients in [gradients1, gradients1d, gradients10, gradients10d, gradients100, gradients100d]: 56 | results = [] 57 | for optimizer in optimizers: 58 | A = run_tf_optimizer(optimizer, gradients) 59 | B = run_my_optimizer(optimizer, gradients) 60 | row = A.shape[0] - 1 61 | col = A.shape[1] - 1 62 | error = tf.reduce_sum(tf.reduce_sum(tf.abs(A - B))) 63 | results.append((float(error), optimizer.get_config()['name'], 64 | float(A[0][0]), float(B[0][0]), float(A[row][col]), float(B[row][col]))) 65 | all_results.append(sorted(results, key=lambda x: x[0])) 66 | 67 | for results in all_results: 68 | for result in results: 69 | if result[0] > 10.0: 70 | print("error! ", result, file=sys.stderr) 71 | sys.exit(1) 72 | print(result) 73 | print() 74 | --------------------------------------------------------------------------------