├── .clang-format ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── build.sh ├── csrc ├── include │ ├── infinity │ │ ├── core │ │ │ ├── Configuration.h │ │ │ ├── Context.cpp │ │ │ └── Context.h │ │ ├── infinity.h │ │ ├── memory │ │ │ ├── Atomic.cpp │ │ │ ├── Atomic.h │ │ │ ├── Buffer.cpp │ │ │ ├── Buffer.h │ │ │ ├── Region.cpp │ │ │ ├── Region.h │ │ │ ├── RegionToken.cpp │ │ │ ├── RegionToken.h │ │ │ ├── RegionType.h │ │ │ ├── RegisteredMemory.cpp │ │ │ └── RegisteredMemory.h │ │ ├── queues │ │ │ ├── QueuePair.cpp │ │ │ ├── QueuePair.h │ │ │ ├── QueuePairFactory.cpp │ │ │ └── QueuePairFactory.h │ │ ├── requests │ │ │ ├── RequestToken.cpp │ │ │ └── RequestToken.h │ │ └── utils │ │ │ ├── Address.cpp │ │ │ ├── Address.h │ │ │ └── Debug.h │ ├── miniz │ │ └── miniz.h │ └── qvf │ │ ├── com_endpoint.h │ │ ├── common.h │ │ ├── dist_tensor_client.h │ │ ├── dist_tensor_server.h │ │ ├── pipe.h │ │ ├── qvf.h │ │ ├── range.h │ │ ├── shared_loader.h │ │ └── tensor_endpoint.h └── src │ ├── module.cpp │ ├── register.cpp │ └── shared_loader.cpp ├── docs ├── imgs │ ├── Network Bandwidth Under 100Gbps IB.png │ ├── consistent_memory_view.png │ ├── e2e_feature_collection.png │ ├── e2e_feature_collection_performance.png │ ├── gpu0_centered_access_performance.png │ ├── memory_usage.png │ ├── multi_qp.png │ ├── one_batch_feature_collection.png │ ├── peak_memory_footprint.png │ ├── pgas_tensor_access.png │ ├── pgas_tensor_view.png │ ├── range_partition.png │ ├── rdma_mtt.png │ ├── shared_load.png │ ├── subset_signaled_requests.png │ └── train_gnn_on_large_graphs.png ├── memory.md ├── partition_methods.md └── rdma_details.md ├── examples ├── mag240m │ ├── README.md │ ├── config.py │ ├── distribute_training.py │ ├── preprocess.py │ └── preprocess_quiver.py ├── ogb-products │ ├── config.py │ └── distribute_training.py └── reddit │ ├── config.py │ └── distribute_training.py ├── quiver_feature ├── __init__.py ├── common.py ├── dist_helper.py ├── dist_tensor_pgas.py ├── dist_tensor_rpc.py ├── local_tensor_pgas.py ├── multiprocessing │ ├── __init__.py │ └── reductions.py ├── tensor_loader.py └── utils.py ├── setup.py └── tests ├── cpp ├── test_DistTensorClient.cpp ├── test_DistTensorServer.cpp ├── test_Pipe.cpp └── test_main.cpp ├── infinity ├── feature_server.cpp ├── read-write-send.cpp ├── send-performance.cpp ├── test_multiread.cpp ├── test_multiread_multiqp.cpp └── test_read.cpp └── python ├── config.py ├── preprocess_Dataset.py ├── test_DGLUnifiedTensor.py ├── test_DistHelper.py ├── test_DistTensorClient.py ├── test_DistTensorPGAS.py ├── test_DistTensorRPC.py ├── test_DistTensorServer.py ├── test_LocalTensorPGAS.py ├── test_MultiMachineDistTensorClientServer.py ├── test_MultiMachineDistTensorPGAS.py ├── test_MultiMachineDistTensorRPC.py ├── test_PipeParam.py ├── test_RealDataset.py ├── test_RegisteredTensorTransfer.py ├── test_SharedLoader.py ├── test_TensorEndPoint.py └── tmp.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | 
sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode/ 132 | build/ 133 | infinity_realease/ 134 | 135 | examples/reddit/processed/ 136 | examples/reddit/raw/ 137 | tests/data/ 138 | 139 | .idea/* 140 | cmake-build-debug/ 141 | 142 | # OSX 143 | .DS_Store 144 | 145 | *.pt -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.2.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - id: check-yaml 8 | exclude: | 9 | (?x)^( 10 | conda/pytorch-geometric/meta.yaml| 11 | conda/pyg/meta.yaml 12 | )$ 13 | # - repo: https://github.com/adrienverge/yamllint.git 14 | # rev: v1.26.3 15 | # hooks: 16 | # - id: yamllint 17 | # args: [-c=.yamllint.yml] 18 | 19 | # - repo: https://github.com/regebro/pyroma 20 | # rev: "4.0" 21 | # hooks: 22 | # - id: pyroma 23 | # name: Check packaging 24 | # args: [--min=10, .] 
25 | 26 | # - repo: https://github.com/pre-commit/mirrors-yapf 27 | #   rev: v0.32.0 28 | #   hooks: 29 | #     - id: yapf 30 | #       name: Format code 31 | 32 | # - repo: https://github.com/pycqa/isort 33 | #   rev: 5.10.1 34 | #   hooks: 35 | #     - id: isort 36 | #       name: Sort imports 37 | 38 | # - repo: https://github.com/PyCQA/flake8 39 | #   rev: 4.0.1 40 | #   hooks: 41 | #     - id: flake8 42 | #       name: Check PEP8 43 | 44 |   - repo: https://github.com/pre-commit/mirrors-clang-format 45 |     rev: v14.0.1 46 |     hooks: 47 |       - id: clang-format 48 |         name: Format C++ code 49 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(quiver_feature) 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CUDA_STANDARD 14) 5 | 6 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 7 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 8 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 9 | 10 | file(GLOB HEADERS csrc/include/qvf/*.h csrc/include/infinity/*.h csrc/include/miniz/*.h csrc/include/infinity/core/*.h csrc/include/infinity/memory/*.h csrc/include/infinity/queues/*.h csrc/include/infinity/requests/*.h csrc/include/infinity/utils/*.h) 11 | file(GLOB SOURCES csrc/src/*.cpp csrc/include/miniz/*.c csrc/include/infinity/requests/*.cpp csrc/include/infinity/core/*.cpp csrc/include/infinity/memory/*.cpp csrc/include/infinity/queues/*.cpp csrc/include/infinity/utils/*.cpp) 12 | file(GLOB TEST_SOURCES tests/cpp/*.cpp) 13 | 14 | set_source_files_properties(${SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs") 15 | set_source_files_properties(${TEST_SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs") 16 | 17 | find_package(Python3 COMPONENTS Interpreter Development) 18 | find_package(Torch REQUIRED) 19 | add_library(${PROJECT_NAME} SHARED ${SOURCES}) 20 | find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib") 21 | 22 | target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) 23 | target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python) 24 | target_link_libraries(${PROJECT_NAME} PRIVATE ibverbs) 25 | 26 | if (PROF) 27 |     target_link_options(${PROJECT_NAME} PRIVATE "-pg") 28 | endif() 29 | 30 | target_include_directories(${PROJECT_NAME} PUBLIC csrc/include) 31 | 32 | include(GNUInstallDirs) 33 | include(CMakePackageConfigHelpers) 34 | 35 | install(TARGETS ${PROJECT_NAME} 36 |         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}) 37 | install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}) 38 | 39 | if(BUILD_TEST) 40 |     add_executable(cpp_test ${TEST_SOURCES}) 41 |     target_link_libraries(cpp_test PRIVATE ${TORCH_LIBRARIES}) 42 |     target_link_libraries(cpp_test PRIVATE Python3::Python) 43 |     target_link_libraries(cpp_test PRIVATE ${PROJECT_NAME}) 44 | endif() 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ################################################## 2 | # 3 | # (c) 2018 Claude Barthels, ETH Zurich 4 | # 5 | # Call 'make library' to build the library 6 | # Call 'make examples' to build the examples 7 | # Call 'make all' to build everything 8 | # 9 | ################################################## 10 | 11 | PROJECT_NAME = libinfinity 12 | 13 | ################################################## 14 | 15 | CC = g++ 16 | CC_FLAGS = -O3 -std=c++14 17 | LD_FLAGS = -linfinity
-libverbs 18 | 19 | ################################################## 20 | 21 | SOURCE_FOLDER = csrc/include/ 22 | BUILD_FOLDER = build/infinity 23 | RELEASE_FOLDER = build/infinity_release 24 | INCLUDE_FOLDER = include 25 | EXAMPLES_FOLDER = infinity/ 26 | 27 | ################################################## 28 | 29 | SOURCE_FILES = $(SOURCE_FOLDER)/infinity/core/Context.cpp \ 30 | $(SOURCE_FOLDER)/infinity/memory/Atomic.cpp \ 31 | $(SOURCE_FOLDER)/infinity/memory/Buffer.cpp \ 32 | $(SOURCE_FOLDER)/infinity/memory/Region.cpp \ 33 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.cpp \ 34 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.cpp \ 35 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.cpp \ 36 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.cpp \ 37 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.cpp \ 38 | $(SOURCE_FOLDER)/infinity/utils/Address.cpp 39 | 40 | HEADER_FILES = $(SOURCE_FOLDER)/infinity/infinity.h \ 41 | $(SOURCE_FOLDER)/infinity/core/Context.h \ 42 | $(SOURCE_FOLDER)/infinity/core/Configuration.h \ 43 | $(SOURCE_FOLDER)/infinity/memory/Atomic.h \ 44 | $(SOURCE_FOLDER)/infinity/memory/Buffer.h \ 45 | $(SOURCE_FOLDER)/infinity/memory/Region.h \ 46 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.h \ 47 | $(SOURCE_FOLDER)/infinity/memory/RegionType.h \ 48 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.h \ 49 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.h \ 50 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.h \ 51 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.h \ 52 | $(SOURCE_FOLDER)/infinity/utils/Debug.h \ 53 | $(SOURCE_FOLDER)/infinity/utils/Address.h 54 | 55 | ################################################## 56 | 57 | OBJECT_FILES = $(patsubst $(SOURCE_FOLDER)/%.cpp,$(BUILD_FOLDER)/%.o,$(SOURCE_FILES)) 58 | SOURCE_DIRECTORIES = $(dir $(HEADER_FILES)) 59 | BUILD_DIRECTORIES = $(patsubst $(SOURCE_FOLDER)/%,$(BUILD_FOLDER)/%,$(SOURCE_DIRECTORIES)) 60 | 61 | ################################################## 62 | 63 | all: library examples 64 | 65 | ################################################## 66 | 67 | $(BUILD_FOLDER)/%.o: $(SOURCE_FILES) $(HEADER_FILES) 68 | mkdir -p $(BUILD_FOLDER) 69 | mkdir -p $(BUILD_DIRECTORIES) 70 | $(CC) $(CC_FLAGS) -c $(SOURCE_FOLDER)/$*.cpp -I $(SOURCE_FOLDER) -o $(BUILD_FOLDER)/$*.o 71 | 72 | ################################################## 73 | 74 | library: $(OBJECT_FILES) 75 | mkdir -p $(RELEASE_FOLDER) 76 | ar rvs $(RELEASE_FOLDER)/$(PROJECT_NAME).a $(OBJECT_FILES) 77 | rm -rf $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) 78 | cp --parents $(HEADER_FILES) $(RELEASE_FOLDER) 79 | mv $(RELEASE_FOLDER)/$(SOURCE_FOLDER)/ $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) 80 | 81 | ################################################## 82 | 83 | clean: 84 | rm -rf $(BUILD_FOLDER) 85 | rm -rf $(RELEASE_FOLDER) 86 | 87 | ################################################## 88 | 89 | examples: 90 | mkdir -p $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER) 91 | # $(CC) tests/infinity/read-write-send.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/read-write-send 92 | # $(CC) tests/infinity/send-performance.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/send-performance 93 | # $(CC) tests/infinity/test_read.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_read 94 | # $(CC) tests/infinity/test_multiread.cpp $(CC_FLAGS) 
$(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread 95 | 	$(CC) tests/infinity/test_multiread_multiqp.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread_multiqp 96 | 	$(CC) tests/cpp/test_Pipe.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_pipe 97 | 98 | ################################################## 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [pypi-image]: https://badge.fury.io/py/quiver-feature.svg 2 | [pypi-url]: https://pypi.org/project/quiver-feature/ 3 | 4 |

5 | 6 |

7 | 8 | -------------------------------------------------------------------------------- 9 | 10 | Quiver-Feature is an RDMA-based, high-performance **distributed feature collection component** for **training GNN models on extremely large graphs**. It is built on [Quiver](https://github.com/quiver-team/torch-quiver) and has several novel features: 11 | 12 | 1. **High Performance**: Quiver-Feature achieves **5-10x higher feature collection throughput** than existing GNN systems such as [DGL](https://github.com/dmlc/dgl) and [PyG](https://github.com/pyg-team/pytorch_geometric). 13 | 14 | 2. **Maximum Hardware Resource Utilization Efficiency**: Quiver-Feature has minimal CPU usage and minimal memory bus traffic, leaving most of the CPU and memory resources to tasks like graph sampling and model training. 15 | 16 | 3. **Easy to Use**: To use Quiver-Feature, developers only need to add a few lines of code to existing PyG/DGL programs, so Quiver-Feature is easy for PyG/DGL users to adopt and easy to deploy in production clusters. 17 | 18 | ![train_gnn_models_on_large_graph](docs/imgs/train_gnn_on_large_graphs.png) 19 | 20 | -------------------------------------------------------------------------------- 21 | 22 | # GPU-centric Data Placement And Zero-Copy Data Access 23 | 24 | **`GPU-centric data placement`** and the **`Zero-Copy data access method`** are the two keys behind Quiver-Feature's high performance. 25 | 26 | **`GPU-Centric Data Placement`:** Quiver-Feature has a unified view of memory across heterogeneous devices and machines. It classifies this memory into 4 memory spaces under a GPU-centric view: **Local HBM** (the current GPU's memory), **Neighbor HBM**, **Local DRAM** (the current machine's CPU memory) and **Remote DRAM** (a remote machine's CPU memory). These 4 memory spaces are connected to each other by PCIe, NVLink, RDMA, etc. 27 | 28 | ![memory_view](docs/imgs/consistent_memory_view.png) 29 | 30 | Accessing the different memory spaces from a GPU yields unbalanced performance. Since feature data access frequency during GNN training is also unbalanced, Quiver-Feature uses an **`application-aware, GPU-centric data placement algorithm`** to take full advantage of the GPU-centric multi-level memory hierarchy. 31 | 32 | **`Zero-Copy Data Access`:** Feature collection in GNN training involves massive data movement across the network, DRAM, PCIe and NVLink, and any extra memory copy hurts end-to-end performance. Quiver-Feature uses one-sided communication methods: `UVA` for local memory space access (Local HBM, Local DRAM, Neighbor HBM) and `RDMA READ` for remote memory space access (Remote DRAM), achieving zero copies and minimal CPU intervention. ([You can refer to this document for more RDMA details.](docs/rdma_details.md)) 33 | 34 | 35 | **`DistTensorPGAS`:** On top of those memory spaces, Quiver-Feature adopts the **[`PGAS`](https://en.wikipedia.org/wiki/Partitioned_global_address_space) memory model** and implements a 2-dimensional distributed tensor abstraction called `DistTensorPGAS`. Users can use a `DistTensorPGAS` just like a local torch.Tensor, e.g. querying its `shape` and performing `slicing operations`; a short sketch of this access pattern follows. 36 | 37 | ![pgas_tensor](docs/imgs/pgas_tensor_view.png) 38 | 39 |
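Since a `DistTensorPGAS` mimics a local `torch.Tensor` for `shape` queries and slicing, here is a minimal runnable sketch of the access pattern, using a plain `torch.Tensor` as a stand-in (a real `DistTensorPGAS`, built as in the Quick Start below, would be a drop-in replacement for `feature`; the shapes here are illustrative values, not benchmark data):

```python
import torch

# Stand-in for a 2-dimensional DistTensorPGAS of shape (num_nodes, feature_dim).
# With a real DistTensorPGAS the rows may live in Local HBM, Neighbor HBM,
# Local DRAM or Remote DRAM, but the access code below stays the same:
# local rows are gathered via UVA, remote rows via one-sided RDMA READ.
feature = torch.randn(1_000_000, 128)

print(feature.shape)                    # query shape like a local tensor

n_id = torch.tensor([0, 42, 999_999])   # global node ids from a sampler
collected = feature[n_id]               # slicing gathers the feature rows
print(collected.shape)                  # torch.Size([3, 128])
```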
40 | # Performance Benchmark 41 | 42 | As far as we know, there is no public GNN system that directly supports using RDMA for feature collection. `DGL` uses [TensorPipe](https://github.com/pytorch/tensorpipe) as its RPC backend; [TensorPipe](https://github.com/pytorch/tensorpipe) itself supports RDMA, but `DGL` has not integrated this feature. Since [TensorPipe](https://github.com/pytorch/tensorpipe) is also the [official RPC backend](https://pytorch.org/docs/stable/rpc.html#torch.distributed.rpc.init_rpc) of PyTorch, we compare the feature collection performance of `Quiver-Feature` with a `PyTorch-RPC based solution`. 43 | 44 | We use 2 machines with a 100Gbps IB network between them. We partition the data uniformly and start M GPU training processes on each machine (which we refer to as `2 Machines 2M GPUs` in the result chart below). We benchmark the feature collection performance of `Quiver-Feature` and the `PyTorch-RPC based solution`, and `Quiver-Feature` is 5x better than the `PyTorch-RPC based solution` in all settings. 45 | 46 | ![img](docs/imgs/e2e_feature_collection.png) 47 | 48 | # Install 49 | 50 | ## Install From Source (Recommended For Now) 51 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver). 52 | 53 | 2. Install Quiver-Feature from source: 54 | 55 |         $ git clone git@github.com:quiver-team/quiver-feature 56 |         $ cd quiver-feature/ 57 |         $ pip install . 58 | 59 | ## Pip Install 60 | 61 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver). 62 | 63 | 2. Install the `Quiver-Feature` pip package: 64 | 65 |         $ pip install quiver-feature 66 | 67 | We have tested Quiver-Feature with the following setup: 68 | 69 | - OS: Ubuntu 18.04, Ubuntu 20.04 70 | 71 | - CUDA: 10.2, 11.1 72 | 73 | - GPU: Nvidia P100, V100, Titan X, A6000 74 | 75 | ## Test Install 76 | 77 | You can download Quiver-Feature's examples to test the installation: 78 | 79 |     $ git clone git@github.com:quiver-team/quiver-feature.git 80 |     $ cd quiver-feature/examples/reddit 81 |     $ python3 distribute_training.py 82 | 83 | A successful run should contain the following line: 84 | 85 | `Starting Server With: xxxx` 86 | 87 | 88 | # Quick Start 89 | 90 | To use Quiver-Feature, you need to replace PyG's feature tensors with `quiver_feature.DistTensorPGAS`. This usually requires only a few changes in an existing PyG program, with the following 4 steps on each machine: 91 | 92 | - Load the feature partition and metadata that belong to the current machine. 93 | 94 | - Exchange feature partition metadata with the other processes using `quiver_feature.DistHelper`. 95 | 96 | - Create a `quiver_feature.DistTensorPGAS` from the local feature partition and metadata. 97 | 98 | - Pass the `quiver_feature.DistTensorPGAS` built above as a parameter to each training process for feature collection. 99 | 100 | Here is a simple example of using Quiver-Feature in a PyG program. You can check the [original script](examples/reddit/distribute_training.py) for more details. 101 | 102 | ```python 103 | 104 | def train_process(rank, dist_tensor): 105 |     ... 106 |     for batch_size, n_id, adjs in train_loader: 107 |         ... 108 |         # Use DistTensorPGAS just like a torch.Tensor 109 |         collected_feature = dist_tensor[n_id] 110 |         ... 111 | 112 | if __name__ == "__main__": 113 | 114 |     # Step 1: Load the local data partition 115 |     local_tensor, cached_range, local_range = load_partitioned_data(...) 116 | 117 |     # Step 2: Exchange TensorEndPoint information 118 |     dist_helper = DistHelper(...) 119 |     tensor_endpoints = dist_helper.exchange_tensor_endpoints_info() 120 | 121 | 122 |     # Step 3: Build DistTensorPGAS from the local feature partition 123 |     dist_tensor = DistTensorPGAS(...)
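    # (Hypothetical note, not part of the original script: the elided
    # DistTensorPGAS(...) arguments wire together the local partition from
    # Step 1 and the tensor_endpoints exchanged in Step 2; see
    # examples/reddit/distribute_training.py for the actual call.)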
124 | 125 | 126 |     # Step 4: Spawn training processes using the DistTensorPGAS as a parameter 127 |     mp.spawn( 128 |         train_process, 129 |         args=(..., dist_tensor, ...), 130 |         nprocs=args.device_per_node, 131 |         join=True 132 |     ) 133 |     ... 134 | 135 | ``` 136 | 137 | # License 138 | 139 | Quiver-Feature is licensed under the Apache License, Version 2.0. 140 | 141 | # Citation 142 | If you use Quiver-Feature in your publication, please cite it by using the following BibTeX entry. 143 | 144 |     @Misc{Quiver-Feature, 145 |       institution = {Quiver Team}, 146 |       title = {Quiver-Feature: A High Performance Feature Collection Component For Training GNN On Extremely Large Graphs}, 147 |       howpublished = {\url{https://github.com/quiver-team/quiver-feature}}, 148 |       year = {2022} 149 |     } -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | mkdir -p build 2 | cd build 3 | Torch_DIR=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \ 4 | cmake -DBUILD_TEST=1 -DCMAKE_INSTALL_PREFIX=. .. 5 | make install 6 | -------------------------------------------------------------------------------- /csrc/include/infinity/core/Configuration.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Core - Configuration 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef CORE_CONFIGURATION_H_ 10 | #define CORE_CONFIGURATION_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace core { 16 | 17 | class Configuration { 18 | 19 | public: 20 | 21 | 	/** 22 | 	 * Queue length settings 23 | 	 */ 24 | 25 | 	static const uint32_t SEND_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE 26 | 27 | 	static const uint32_t RECV_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE 28 | 29 | 	static const uint32_t SHARED_RECV_QUEUE_LENGTH = 8191; // Must be less than MAX_SRQ_WR 30 | 31 | 	static const uint32_t MAX_NUMBER_OF_OUTSTANDING_REQUESTS = 8191; // Must be less than (MAX_QP_WR * MAX_QP) 32 | 	// Since we use one single shared receive queue, 33 | 	// this number should be less than MAX_SRQ_WR 34 | 35 | 	static const uint32_t MAX_NUMBER_OF_SGE_ELEMENTS = 1; // Must be less than MAX_SGE 36 | 37 | public: 38 | 39 | 	/** 40 | 	 * System settings 41 | 	 */ 42 | 43 | 	static const uint32_t PAGE_SIZE = 4096; // Memory regions will be page aligned by the Infinity library 44 | 45 | 	static const uint32_t MAX_CONNECTION_USER_DATA_SIZE = 1024; // Size of the user data which can be transmitted when establishing a connection 46 | 47 | 	static constexpr const char* DEFAULT_IB_DEVICE = "ib0"; // Default name of IB device 48 | 49 | }; 50 | 51 | } /* namespace core */ 52 | } /* namespace infinity */ 53 | 54 | #endif /* CORE_CONFIGURATION_H_ */ 55 | -------------------------------------------------------------------------------- /csrc/include/infinity/core/Context.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Core - Context 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef CORE_CONTEXT_H_ 10 | #define CORE_CONTEXT_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace infinity { 18 | namespace memory { 19 | class Region; 20 | class Buffer; 21 | class Atomic; 22 | class RegisteredMemory; 23 | } 24 | } 25 | 26 | namespace infinity { 27 | namespace queues { 28 | class QueuePair; 29 |
class QueuePairFactory; 30 | } 31 | } 32 | 33 | namespace infinity { 34 | namespace requests { 35 | class RequestToken; 36 | } 37 | } 38 | 39 | namespace infinity { 40 | namespace core { 41 | 42 | typedef struct { 43 | infinity::memory::Buffer *buffer; 44 | uint32_t bytesWritten; 45 | uint32_t immediateValue; 46 | bool immediateValueValid; 47 | infinity::queues::QueuePair *queuePair; 48 | } receive_element_t; 49 | 50 | class Context { 51 | 52 | friend class infinity::memory::Region; 53 | friend class infinity::memory::Buffer; 54 | friend class infinity::memory::Atomic; 55 | friend class infinity::memory::RegisteredMemory; 56 | friend class infinity::queues::QueuePair; 57 | friend class infinity::queues::QueuePairFactory; 58 | friend class infinity::requests::RequestToken; 59 | 60 | public: 61 | 62 | /** 63 | * Constructors 64 | */ 65 | Context(uint16_t device = 0, uint16_t devicePort = 1); 66 | 67 | /** 68 | * Destructor 69 | */ 70 | ~Context(); 71 | 72 | public: 73 | 74 | /** 75 | * Check if receive operation completed 76 | */ 77 | bool receive(receive_element_t *receiveElement); 78 | bool receive(infinity::memory::Buffer **buffer, uint32_t *bytesWritten, uint32_t *immediateValue, bool *immediateValueValid, infinity::queues::QueuePair **queuePair = NULL); 79 | 80 | /** 81 | * Post a new buffer for receiving messages 82 | */ 83 | void postReceiveBuffer(infinity::memory::Buffer *buffer); 84 | 85 | /* 86 | Poll expected signal from completion queue 87 | */ 88 | int batchPollSendCompletionQueue(int poll_batch, int expected_num, ibv_wc* wc, bool force_all); 89 | 90 | public: 91 | 92 | infinity::requests::RequestToken * defaultRequestToken; 93 | infinity::memory::Atomic * defaultAtomic; 94 | 95 | protected: 96 | 97 | /** 98 | * Returns ibVerbs context 99 | */ 100 | ibv_context * getInfiniBandContext(); 101 | 102 | /** 103 | * Returns local device id 104 | */ 105 | uint16_t getLocalDeviceId(); 106 | 107 | /** 108 | * Returns device port 109 | */ 110 | uint16_t getDevicePort(); 111 | 112 | /** 113 | * Returns ibVerbs protection domain 114 | */ 115 | ibv_pd * getProtectionDomain(); 116 | 117 | protected: 118 | 119 | /** 120 | * Check if send operation completed 121 | */ 122 | bool pollSendCompletionQueue(); 123 | 124 | /** 125 | * Returns ibVerbs completion queue for sending 126 | */ 127 | ibv_cq * getSendCompletionQueue(); 128 | 129 | /** 130 | * Returns ibVerbs completion queue for receiving 131 | */ 132 | ibv_cq * getReceiveCompletionQueue(); 133 | 134 | /** 135 | * Returns ibVerbs shared receive queue 136 | */ 137 | ibv_srq * getSharedReceiveQueue(); 138 | 139 | protected: 140 | 141 | /** 142 | * IB context and protection domain 143 | */ 144 | ibv_context *ibvContext; 145 | ibv_pd *ibvProtectionDomain; 146 | 147 | /** 148 | * Local device id and port 149 | */ 150 | ibv_device *ibvDevice; 151 | uint16_t ibvLocalDeviceId; 152 | uint16_t ibvDevicePort; 153 | 154 | /** 155 | * IB send and receive completion queues 156 | */ 157 | ibv_cq *ibvSendCompletionQueue; 158 | ibv_cq *ibvReceiveCompletionQueue; 159 | ibv_srq *ibvSharedReceiveQueue; 160 | 161 | protected: 162 | 163 | void registerQueuePair(infinity::queues::QueuePair *queuePair); 164 | std::unordered_map queuePairMap; 165 | 166 | }; 167 | 168 | } /* namespace core */ 169 | } /* namespace infinity */ 170 | 171 | #endif /* CORE_CONTEXT_H_ */ 172 | -------------------------------------------------------------------------------- /csrc/include/infinity/infinity.h: -------------------------------------------------------------------------------- 1 | 
/** 2 | * Infinity - A C++ RDMA library for InfiniBand 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef INFINITY_H_ 10 | #define INFINITY_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #endif /* INFINITY_H_ */ 27 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Atomic.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Atomic 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Atomic.h" 10 | 11 | #include 12 | 13 | namespace infinity { 14 | namespace memory { 15 | 16 | Atomic::Atomic(infinity::core::Context* context) { 17 | 18 | this->context = context; 19 | this->sizeInBytes = sizeof(uint64_t); 20 | this->memoryRegionType = RegionType::ATOMIC; 21 | 22 | this->value = 0; 23 | this->data = &value; 24 | 25 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), &(this->value), this->sizeInBytes, 26 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); 27 | 28 | 29 | } 30 | 31 | uint64_t infinity::memory::Atomic::getValue() { 32 | 33 | return this->value; 34 | 35 | } 36 | 37 | void infinity::memory::Atomic::setValueNonAtomic(uint64_t value) { 38 | 39 | this->value = value; 40 | 41 | } 42 | 43 | 44 | Atomic::~Atomic() { 45 | 46 | ibv_dereg_mr(this->ibvMemoryRegion); 47 | 48 | } 49 | 50 | } /* namespace memory */ 51 | } /* namespace infinity */ 52 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Atomic.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Atomic 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_ATOMIC_H_ 10 | #define MEMORY_ATOMIC_H_ 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class Atomic : public Region { 22 | 23 | public: 24 | 25 | Atomic(infinity::core::Context *context); 26 | virtual ~Atomic(); 27 | 28 | public: 29 | 30 | uint64_t getValue(); 31 | 32 | void setValueNonAtomic(uint64_t value); 33 | 34 | protected: 35 | 36 | uint64_t value; 37 | 38 | 39 | }; 40 | 41 | } /* namespace memory */ 42 | } /* namespace infinity */ 43 | 44 | #endif /* MEMORY_ATOMIC_H_ */ 45 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Buffer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Buffer 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Buffer.h" 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) 18 | 19 | namespace infinity { 20 | namespace memory { 21 | 22 | Buffer::Buffer(infinity::core::Context* context, uint64_t sizeInBytes) { 23 | this->context = context; 24 | this->sizeInBytes = sizeInBytes; 25 | this->memoryRegionType = RegionType::BUFFER; 26 | 27 | int res = posix_memalign( 28 | &(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes); 29 | INFINITY_ASSERT( 30 | res == 0, 31 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n"); 32 | 33 | memset(this->data, 0, sizeInBytes); 34 | 35 | this->ibvMemoryRegion = ibv_reg_mr( 36 | this->context->getProtectionDomain(), this->data, this->sizeInBytes, 37 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 38 | IBV_ACCESS_REMOTE_READ); 39 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 40 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 41 | 42 | this->memoryAllocated = true; 43 | this->memoryRegistered = true; 44 | } 45 | 46 | Buffer::Buffer(infinity::core::Context* context, 47 | infinity::memory::RegisteredMemory* memory, 48 | uint64_t offset, 49 | uint64_t sizeInBytes) { 50 | this->context = context; 51 | this->sizeInBytes = sizeInBytes; 52 | this->memoryRegionType = RegionType::BUFFER; 53 | 54 | this->data = reinterpret_cast(memory->getData()) + offset; 55 | this->ibvMemoryRegion = memory->getRegion(); 56 | 57 | this->memoryAllocated = false; 58 | this->memoryRegistered = false; 59 | } 60 | 61 | Buffer::Buffer(infinity::core::Context* context, 62 | void* memory, 63 | uint64_t sizeInBytes) { 64 | this->context = context; 65 | this->sizeInBytes = sizeInBytes; 66 | this->memoryRegionType = RegionType::BUFFER; 67 | 68 | this->data = memory; 69 | this->ibvMemoryRegion = ibv_reg_mr( 70 | this->context->getProtectionDomain(), this->data, this->sizeInBytes, 71 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 72 | IBV_ACCESS_REMOTE_READ); 73 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 74 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 75 | 76 | this->memoryAllocated = false; 77 | this->memoryRegistered = true; 78 | } 79 | 80 | Buffer::Buffer(infinity::core::Context* context, 81 | uint64_t sizeInBytes, 82 | int device) { 83 | this->context = context; 84 | this->sizeInBytes = sizeInBytes; 85 | this->memoryRegionType = RegionType::BUFFER; 86 | 87 | cudaSetDevice(device); 88 | int cap = sizeInBytes + infinity::core::Configuration::PAGE_SIZE; 89 | int res = cudaMalloc(&this->data, cap); 90 | INFINITY_ASSERT( 91 | res == 0, 92 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n"); 93 | 94 | void* temp = this->data; 95 | if (uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE) { 96 | uint64_t head = 97 | infinity::core::Configuration::PAGE_SIZE - 98 | uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE; 99 | temp += head; 100 | } 101 | cudaMemset(this->data, 0, cap); 102 | 103 | this->ibvMemoryRegion = 104 | ibv_reg_mr(this->context->getProtectionDomain(), temp, this->sizeInBytes, 105 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 106 | IBV_ACCESS_REMOTE_READ); 107 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 108 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 109 | 110 | this->memoryAllocated = true; 111 | this->memoryRegistered = true; 112 | this->cuda = true; 113 | } 114 | 115 | Buffer::~Buffer() { 116 | if (this->memoryRegistered) { 117 | ibv_dereg_mr(this->ibvMemoryRegion); 118 | } 119 | if (this->memoryAllocated) { 120 | if (!this->cuda) { 121 | free(this->data); 122 | } else { 123 | cudaFree(this->data); 124 | } 125 | } 
126 | } 127 | 128 | void* Buffer::getData() { 129 | return reinterpret_cast(this->getAddress()); 130 | } 131 | 132 | void Buffer::resize(uint64_t newSize, void* newData) { 133 | void* oldData = this->data; 134 | uint32_t oldSize = this->sizeInBytes; 135 | 136 | if (newData == NULL) { 137 | newData = this->data; 138 | } 139 | 140 | if (oldData != newData) { 141 | uint64_t copySize = MIN(newSize, oldSize); 142 | memcpy(newData, oldData, copySize); 143 | } 144 | 145 | if (memoryRegistered) { 146 | ibv_dereg_mr(this->ibvMemoryRegion); 147 | this->ibvMemoryRegion = 148 | ibv_reg_mr(this->context->getProtectionDomain(), newData, newSize, 149 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 150 | IBV_ACCESS_REMOTE_READ); 151 | this->data = newData; 152 | this->sizeInBytes = newSize; 153 | } else { 154 | INFINITY_ASSERT(false, 155 | "[INFINITY][MEMORY][BUFFER] You can only resize memory " 156 | "which has registered by this buffer.\n"); 157 | } 158 | } 159 | 160 | } /* namespace memory */ 161 | } /* namespace infinity */ 162 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Buffer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Buffer 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_BUFFER_H_ 10 | #define MEMORY_BUFFER_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class Buffer : public Region { 22 | public: 23 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes); 24 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes, int device); 25 | Buffer(infinity::core::Context* context, 26 | infinity::memory::RegisteredMemory* memory, 27 | uint64_t offset, 28 | uint64_t sizeInBytes); 29 | Buffer(infinity::core::Context* context, void* memory, uint64_t sizeInBytes); 30 | ~Buffer(); 31 | 32 | public: 33 | void* getData(); 34 | void resize(uint64_t newSize, void* newData = NULL); 35 | 36 | protected: 37 | bool memoryRegistered; 38 | bool memoryAllocated; 39 | bool cuda; 40 | }; 41 | 42 | } /* namespace memory */ 43 | } /* namespace infinity */ 44 | 45 | #endif /* MEMORY_BUFFER_H_ */ 46 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Region.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Buffer.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace infinity { 15 | namespace memory { 16 | 17 | Region::~Region() { 18 | // To be overwritten in sub class 19 | } 20 | 21 | RegionToken* Region::createRegionToken() { 22 | return new RegionToken(this, getMemoryRegionType(), getSizeInBytes(), getAddress(), getLocalKey(), getRemoteKey()); 23 | } 24 | 25 | RegionToken * Region::createRegionToken(uint64_t offset) { 26 | return new RegionToken(this, getMemoryRegionType(), getRemainingSizeInBytes(offset), getAddressWithOffset(offset), getLocalKey(), getRemoteKey()); 27 | } 28 | 29 | RegionToken * Region::createRegionToken(uint64_t offset, uint64_t size) { 30 | return new RegionToken(this, getMemoryRegionType(), size, getAddressWithOffset(offset), getLocalKey(), getRemoteKey()); 31 | } 32 | 33 | RegionType Region::getMemoryRegionType() { 34 | return this->memoryRegionType; 35 | } 36 
| 37 | uint64_t Region::getSizeInBytes() { 38 | return this->sizeInBytes; 39 | } 40 | 41 | uint64_t Region::getRemainingSizeInBytes(uint64_t offset) { 42 | return this->sizeInBytes - offset; 43 | } 44 | 45 | uint64_t Region::getAddress() { 46 | return reinterpret_cast(this->data); 47 | } 48 | 49 | uint64_t Region::getAddressWithOffset(uint64_t offset) { 50 | return reinterpret_cast(this->data) + offset; 51 | } 52 | 53 | uint32_t Region::getLocalKey() { 54 | return this->ibvMemoryRegion->lkey; 55 | } 56 | 57 | uint32_t Region::getRemoteKey() { 58 | return this->ibvMemoryRegion->rkey; 59 | } 60 | 61 | } /* namespace memory */ 62 | } /* namespace infinity */ 63 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Region.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGION_H_ 10 | #define MEMORY_REGION_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class RegionToken; 22 | 23 | class Region { 24 | 25 | public: 26 | 27 | virtual ~Region(); 28 | 29 | RegionToken * createRegionToken(); 30 | RegionToken * createRegionToken(uint64_t offset); 31 | RegionToken * createRegionToken(uint64_t offset, uint64_t size); 32 | 33 | public: 34 | 35 | RegionType getMemoryRegionType(); 36 | uint64_t getSizeInBytes(); 37 | uint64_t getRemainingSizeInBytes(uint64_t offset); 38 | uint64_t getAddress(); 39 | uint64_t getAddressWithOffset(uint64_t offset); 40 | uint32_t getLocalKey(); 41 | uint32_t getRemoteKey(); 42 | 43 | protected: 44 | 45 | infinity::core::Context* context; 46 | RegionType memoryRegionType; 47 | ibv_mr *ibvMemoryRegion; 48 | 49 | protected: 50 | 51 | void * data; 52 | uint64_t sizeInBytes; 53 | 54 | }; 55 | 56 | } /* namespace memory */ 57 | } /* namespace infinity */ 58 | 59 | #endif /* MEMORY_REGION_H_ */ 60 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionToken.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | 11 | namespace infinity { 12 | namespace memory { 13 | 14 | RegionToken::RegionToken() : 15 | memoryRegion (NULL), 16 | memoryRegionType (UNKNOWN), 17 | sizeInBytes(0), 18 | address(0), 19 | localKey(0), 20 | remoteKey(0) { 21 | 22 | // Nothing to do here 23 | 24 | } 25 | 26 | RegionToken::RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey) : 27 | memoryRegion (memoryRegion), 28 | memoryRegionType (memoryRegionType), 29 | sizeInBytes(sizeInBytes), 30 | address(address), 31 | localKey(localKey), 32 | remoteKey(remoteKey) { 33 | 34 | // Nothing to do here 35 | 36 | } 37 | 38 | Region* RegionToken::getMemoryRegion() { 39 | return memoryRegion; 40 | } 41 | 42 | RegionType RegionToken::getMemoryRegionType() { 43 | return this->memoryRegionType; 44 | } 45 | 46 | uint64_t RegionToken::getSizeInBytes() { 47 | return this->sizeInBytes; 48 | } 49 | 50 | uint64_t RegionToken::getRemainingSizeInBytes(uint64_t offset) { 51 | return this->sizeInBytes-offset; 52 | } 53 | 54 | uint64_t RegionToken::getAddress() { 55 | return address; 
56 | } 57 | 58 | uint64_t RegionToken::getAddressWithOffset(uint64_t offset) { 59 | return address + offset; 60 | } 61 | 62 | uint32_t RegionToken::getLocalKey() { 63 | return this->localKey; 64 | } 65 | 66 | uint32_t RegionToken::getRemoteKey() { 67 | return this->remoteKey; 68 | } 69 | 70 | 71 | } /* namespace memory */ 72 | } /* namespace infinity */ 73 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionToken.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGIONTOKEN_H_ 10 | #define MEMORY_REGIONTOKEN_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace infinity { 17 | namespace memory { 18 | 19 | class RegionToken { 20 | 21 | public: 22 | 23 | RegionToken(); 24 | RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey); 25 | 26 | public: 27 | 28 | Region * getMemoryRegion(); 29 | RegionType getMemoryRegionType(); 30 | uint64_t getSizeInBytes(); 31 | uint64_t getRemainingSizeInBytes(uint64_t offset); 32 | uint64_t getAddress(); 33 | uint64_t getAddressWithOffset(uint64_t offset); 34 | uint32_t getLocalKey(); 35 | uint32_t getRemoteKey(); 36 | 37 | protected: 38 | 39 | Region *memoryRegion; 40 | const RegionType memoryRegionType; 41 | const uint64_t sizeInBytes; 42 | const uint64_t address; 43 | const uint32_t localKey; 44 | const uint32_t remoteKey; 45 | 46 | }; 47 | 48 | } /* namespace memory */ 49 | } /* namespace infinity */ 50 | 51 | #endif /* MEMORY_REGIONTOKEN_H_ */ 52 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionType.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Type 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGIONTYPE_H_ 10 | #define MEMORY_REGIONTYPE_H_ 11 | 12 | namespace infinity { 13 | namespace memory { 14 | 15 | enum RegionType {BUFFER, ATOMIC, UNKNOWN}; 16 | 17 | } /* namespace memory */ 18 | } /* namespace infinity */ 19 | 20 | #endif /* MEMORY_REGIONTYPE_H_ */ 21 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegisteredMemory.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Registered Memory 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "RegisteredMemory.h" 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | namespace infinity { 18 | namespace memory { 19 | 20 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, uint64_t sizeInBytes) { 21 | 22 | this->context = context; 23 | this->sizeInBytes = sizeInBytes; 24 | this->memoryAllocated = true; 25 | 26 | int res = posix_memalign(&(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes); 27 | INFINITY_ASSERT(res == 0, "[INFINITY][MEMORY][REGISTERED] Cannot allocate and align buffer.\n"); 28 | 29 | memset(this->data, 0, sizeInBytes); 30 | 31 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes, 32 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 
IBV_ACCESS_REMOTE_READ); 33 | 	INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n"); 34 | } 35 | 36 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, void *data, uint64_t sizeInBytes) { 37 | 38 | 	this->context = context; 39 | 	this->sizeInBytes = sizeInBytes; 40 | 	this->memoryAllocated = false; 41 | 42 | 	this->data = data; 43 | 44 | 	this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes, 45 | 			IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); 46 | 	INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n"); 47 | } 48 | 49 | 50 | RegisteredMemory::~RegisteredMemory() { 51 | 52 | 	ibv_dereg_mr(this->ibvMemoryRegion); 53 | 54 | 	if(this->memoryAllocated) { 55 | 		free(this->data); 56 | 	} 57 | 58 | } 59 | 60 | void* RegisteredMemory::getData() { 61 | 62 | 	return this->data; 63 | 64 | } 65 | 66 | uint64_t RegisteredMemory::getSizeInBytes() { 67 | 68 | 	return this->sizeInBytes; 69 | 70 | } 71 | 72 | ibv_mr* RegisteredMemory::getRegion() { 73 | 74 | 	return this->ibvMemoryRegion; 75 | 76 | } 77 | 78 | } /* namespace memory */ 79 | } /* namespace infinity */ 80 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegisteredMemory.h: -------------------------------------------------------------------------------- 1 | /* 2 |  * Memory - Registered Memory 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef INFINITY_MEMORY_REGISTEREDMEMORY_H_ 10 | #define INFINITY_MEMORY_REGISTEREDMEMORY_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace memory { 16 | 17 | class RegisteredMemory { 18 | 19 | public: 20 | 21 | 	RegisteredMemory(infinity::core::Context *context, uint64_t sizeInBytes); 22 | 	RegisteredMemory(infinity::core::Context *context, void *data, uint64_t sizeInBytes); 23 | 	~RegisteredMemory(); 24 | 25 | 	void * getData(); 26 | 27 | 	uint64_t getSizeInBytes(); 28 | 29 | 	ibv_mr * getRegion(); 30 | 31 | 32 | protected: 33 | 34 | 	infinity::core::Context* context; 35 | 36 | 	void *data; 37 | 	uint64_t sizeInBytes; 38 | 39 | 	ibv_mr *ibvMemoryRegion; 40 | 41 | protected: 42 | 43 | 	bool memoryAllocated; 44 | 45 | }; 46 | 47 | } /* namespace memory */ 48 | } /* namespace infinity */ 49 | 50 | #endif /* INFINITY_MEMORY_REGISTEREDMEMORY_H_ */ 51 | -------------------------------------------------------------------------------- /csrc/include/infinity/queues/QueuePair.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Queues - Queue Pair 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef QUEUES_QUEUEPAIR_H_ 10 | #define QUEUES_QUEUEPAIR_H_ 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace infinity { 22 | namespace queues { 23 | class QueuePairFactory; 24 | } 25 | }  // namespace infinity 26 | 27 | namespace infinity { 28 | namespace queues { 29 | struct SendRequestBuffer { 30 |   std::vector<ibv_sge> sges; 31 |   std::vector<ibv_send_wr> requests; 32 |   SendRequestBuffer() {} 33 |   SendRequestBuffer(int num) { 34 |     sges.resize(num); 35 |     requests.resize(num); 36 |   } 37 |   void resize(int num) { 38 |     sges.resize(num); 39 |     requests.resize(num); 40 |   } 41 |   void reset() { 42 |     memset(sges.data(), 0, sges.size() * sizeof(ibv_sge)); 43 |     memset(requests.data(), 0, requests.size() *
sizeof(ibv_send_wr)); 44 | } 45 | }; 46 | } // namespace queues 47 | } // namespace infinity 48 | 49 | namespace infinity { 50 | namespace queues { 51 | 52 | struct IbvWcBuffer { 53 | ibv_wc* wc; 54 | int size_; 55 | IbvWcBuffer() {} 56 | IbvWcBuffer(int size) { 57 | wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size); 58 | size_ = size; 59 | } 60 | void resize(int size) { 61 | wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size); 62 | size_ = size; 63 | } 64 | 65 | ibv_wc* ptr() { return wc; } 66 | int size() { return size_; } 67 | }; 68 | } // namespace queues 69 | } // namespace infinity 70 | 71 | namespace infinity { 72 | namespace queues { 73 | 74 | class OperationFlags { 75 | public: 76 | bool fenced; 77 | bool signaled; 78 | bool inlined; 79 | 80 | OperationFlags() : fenced(false), signaled(false), inlined(false){}; 81 | 82 | /** 83 | * Turn the bools into a bit field. 84 | */ 85 | int ibvFlags(); 86 | }; 87 | 88 | class QueuePair { 89 | friend class infinity::queues::QueuePairFactory; 90 | 91 | public: 92 | /** 93 | * Constructor 94 | */ 95 | QueuePair(infinity::core::Context* context); 96 | 97 | /** 98 | * Destructor 99 | */ 100 | ~QueuePair(); 101 | 102 | protected: 103 | /** 104 | * Activation methods 105 | */ 106 | 107 | void activate(uint16_t remoteDeviceId, 108 | uint32_t remoteQueuePairNumber, 109 | uint32_t remoteSequenceNumber); 110 | void setRemoteUserData(void* userData, uint32_t userDataSize); 111 | 112 | public: 113 | /** 114 | * User data received during connection setup 115 | */ 116 | 117 | bool hasUserData(); 118 | uint32_t getUserDataSize(); 119 | void* getUserData(); 120 | 121 | public: 122 | /** 123 | * Queue pair information 124 | */ 125 | 126 | uint16_t getLocalDeviceId(); 127 | uint32_t getQueuePairNumber(); 128 | uint32_t getSequenceNumber(); 129 | 130 | public: 131 | /** 132 | * Buffer operations 133 | */ 134 | 135 | void send(infinity::memory::Buffer* buffer, 136 | infinity::requests::RequestToken* requestToken = NULL); 137 | void send(infinity::memory::Buffer* buffer, 138 | uint32_t sizeInBytes, 139 | infinity::requests::RequestToken* requestToken = NULL); 140 | void send(infinity::memory::Buffer* buffer, 141 | uint64_t localOffset, 142 | uint32_t sizeInBytes, 143 | OperationFlags flags, 144 | infinity::requests::RequestToken* requestToken = NULL); 145 | 146 | void write(infinity::memory::Buffer* buffer, 147 | infinity::memory::RegionToken* destination, 148 | infinity::requests::RequestToken* requestToken = NULL); 149 | void write(infinity::memory::Buffer* buffer, 150 | infinity::memory::RegionToken* destination, 151 | uint32_t sizeInBytes, 152 | infinity::requests::RequestToken* requestToken = NULL); 153 | void write(infinity::memory::Buffer* buffer, 154 | uint64_t localOffset, 155 | infinity::memory::RegionToken* destination, 156 | uint64_t remoteOffset, 157 | uint32_t sizeInBytes, 158 | OperationFlags flags, 159 | infinity::requests::RequestToken* requestToken = NULL); 160 | 161 | void read(infinity::memory::Buffer* buffer, 162 | infinity::memory::RegionToken* source, 163 | infinity::requests::RequestToken* requestToken = NULL); 164 | void read(infinity::memory::Buffer* buffer, 165 | infinity::memory::RegionToken* source, 166 | uint32_t sizeInBytes, 167 | infinity::requests::RequestToken* requestToken = NULL); 168 | void read(infinity::memory::Buffer* buffer, 169 | uint64_t localOffset, 170 | infinity::memory::RegionToken* source, 171 | uint64_t remoteOffset, 172 | uint32_t sizeInBytes, 173 | OperationFlags flags, 174 | infinity::requests::RequestToken* requestToken = 
NULL); 175 | 176 | public: 177 | /** 178 | * Complex buffer operations 179 | */ 180 | 181 | void multiWrite(infinity::memory::Buffer** buffers, 182 | uint32_t* sizesInBytes, 183 | uint64_t* localOffsets, 184 | uint32_t numberOfElements, 185 | infinity::memory::RegionToken* destination, 186 | uint64_t remoteOffset, 187 | OperationFlags flags, 188 | infinity::requests::RequestToken* requestToken = NULL); 189 | 190 | void multiRead(uint32_t batch_size, 191 | infinity::memory::Buffer* buffer, 192 | int64_t* localOffset, 193 | infinity::memory::RegionToken* source, 194 | int64_t* remoteOffset, 195 | uint32_t sizeInBytes, 196 | OperationFlags send_flags, 197 | infinity::requests::RequestToken* requestToken, 198 | infinity::queues::SendRequestBuffer& send_buffer); 199 | 200 | void sendWithImmediate(infinity::memory::Buffer* buffer, 201 | uint64_t localOffset, 202 | uint32_t sizeInBytes, 203 | uint32_t immediateValue, 204 | OperationFlags flags, 205 | infinity::requests::RequestToken* requestToken = NULL); 206 | 207 | void writeWithImmediate( 208 | infinity::memory::Buffer* buffer, 209 | uint64_t localOffset, 210 | infinity::memory::RegionToken* destination, 211 | uint64_t remoteOffset, 212 | uint32_t sizeInBytes, 213 | uint32_t immediateValue, 214 | OperationFlags flags, 215 | infinity::requests::RequestToken* requestToken = NULL); 216 | 217 | void multiWriteWithImmediate( 218 | infinity::memory::Buffer** buffers, 219 | uint32_t* sizesInBytes, 220 | uint64_t* localOffsets, 221 | uint32_t numberOfElements, 222 | infinity::memory::RegionToken* destination, 223 | uint64_t remoteOffset, 224 | uint32_t immediateValue, 225 | OperationFlags flags, 226 | infinity::requests::RequestToken* requestToken = NULL); 227 | 228 | public: 229 | /** 230 | * Atomic value operations 231 | */ 232 | 233 | void compareAndSwap(infinity::memory::RegionToken* destination, 234 | uint64_t compare, 235 | uint64_t swap, 236 | infinity::requests::RequestToken* requestToken = NULL); 237 | void compareAndSwap(infinity::memory::RegionToken* destination, 238 | infinity::memory::Atomic* previousValue, 239 | uint64_t compare, 240 | uint64_t swap, 241 | OperationFlags flags, 242 | infinity::requests::RequestToken* requestToken = NULL); 243 | void fetchAndAdd(infinity::memory::RegionToken* destination, 244 | uint64_t add, 245 | infinity::requests::RequestToken* requestToken = NULL); 246 | void fetchAndAdd(infinity::memory::RegionToken* destination, 247 | infinity::memory::Atomic* previousValue, 248 | uint64_t add, 249 | OperationFlags flags, 250 | infinity::requests::RequestToken* requestToken = NULL); 251 | 252 | protected: 253 | infinity::core::Context* const context; 254 | 255 | ibv_qp* ibvQueuePair; 256 | uint32_t sequenceNumber; 257 | 258 | void* userData; 259 | uint32_t userDataSize; 260 | }; 261 | 262 | } /* namespace queues */ 263 | } /* namespace infinity */ 264 | 265 | #endif /* QUEUES_QUEUEPAIR_H_ */ 266 | -------------------------------------------------------------------------------- /csrc/include/infinity/queues/QueuePairFactory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Queues - Queue Pair Factory 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef QUEUES_QUEUEPAIRFACTORY_H_ 10 | #define QUEUES_QUEUEPAIRFACTORY_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace queues { 20 | 21 | class QueuePairFactory { 22 | public: 23 | 24 | 
QueuePairFactory(infinity::core::Context *context); 25 | ~QueuePairFactory(); 26 | 27 | /** 28 | * Bind to port for listening to incoming connections 29 | */ 30 | void bindToPort(uint16_t port); 31 | 32 | /** 33 | * Accept incoming connection request (passive side) 34 | */ 35 | QueuePair * acceptIncomingConnection(void *userData = NULL, uint32_t userDataSizeInBytes = 0); 36 | 37 | /** 38 | * Connect to remote machine (active side) 39 | */ 40 | QueuePair * connectToRemoteHost(const char* hostAddress, uint16_t port, void *userData = NULL, uint32_t userDataSizeInBytes = 0); 41 | 42 | /** 43 | * Create loopback queue pair 44 | */ 45 | QueuePair * createLoopback(void *userData = NULL, uint32_t userDataSizeInBytes = 0); 46 | 47 | protected: 48 | 49 | infinity::core::Context * context; 50 | 51 | int32_t serverSocket; 52 | 53 | }; 54 | 55 | } /* namespace queues */ 56 | } /* namespace infinity */ 57 | 58 | #endif /* QUEUES_QUEUEPAIRFACTORY_H_ */ 59 | -------------------------------------------------------------------------------- /csrc/include/infinity/requests/RequestToken.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Requests - Request Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "RequestToken.h" 10 | 11 | namespace infinity { 12 | namespace requests { 13 | 14 | RequestToken::RequestToken(infinity::core::Context *context) : 15 | context(context) { 16 | this->success.store(false); 17 | this->completed.store(false); 18 | this->region = NULL; 19 | this->userData = NULL; 20 | this->userDataValid = false; 21 | this->userDataSize = 0; 22 | this->immediateValue = 0; 23 | this->immediateValueValid = false; 24 | } 25 | 26 | void RequestToken::setCompleted(bool success) { 27 | this->success.store(success); 28 | this->completed.store(true); 29 | } 30 | 31 | bool RequestToken::checkIfCompleted() { 32 | if (this->completed.load()) { 33 | return true; 34 | } else { 35 | this->context->pollSendCompletionQueue(); 36 | return this->completed.load(); 37 | } 38 | } 39 | 40 | void RequestToken::waitUntilCompleted() { 41 | while (!this->completed.load()) { 42 | this->context->pollSendCompletionQueue(); 43 | } 44 | } 45 | 46 | bool RequestToken::wasSuccessful() { 47 | return this->success.load(); 48 | } 49 | 50 | void RequestToken::reset() { 51 | this->success.store(false); 52 | this->completed.store(false); 53 | this->region = NULL; 54 | this->userData = NULL; 55 | this->userDataValid = false; 56 | this->userDataSize = 0; 57 | this->immediateValue = 0; 58 | this->immediateValueValid = false; 59 | } 60 | 61 | void RequestToken::setRegion(infinity::memory::Region* region) { 62 | this->region = region; 63 | } 64 | 65 | infinity::memory::Region* RequestToken::getRegion() { 66 | return this->region; 67 | } 68 | 69 | void RequestToken::setUserData(void* userData, uint32_t userDataSize) { 70 | this->userData = userData; 71 | this->userDataSize = userDataSize; 72 | this->userDataValid = true; 73 | } 74 | 75 | void* RequestToken::getUserData() { 76 | return this->userData; 77 | } 78 | 79 | bool RequestToken::hasUserData() { 80 | return this->userDataValid; 81 | } 82 | 83 | uint32_t RequestToken::getUserDataSize() { 84 | return this->userDataSize; 85 | } 86 | 87 | void RequestToken::setImmediateValue(uint32_t immediateValue) { 88 | this->immediateValue = immediateValue; 89 | this->immediateValueValid = true; 90 | } 91 | 92 | uint32_t RequestToken::getImmediateValue() { 93 | return 
this->immediateValue; 94 | } 95 | 96 | bool RequestToken::hasImmediateValue() { 97 | return this->immediateValueValid; 98 | } 99 | 100 | } /* namespace requests */ 101 | } /* namespace infinity */ 102 | -------------------------------------------------------------------------------- /csrc/include/infinity/requests/RequestToken.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Requests - Request Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef REQUESTS_REQUESTTOKEN_H_ 10 | #define REQUESTS_REQUESTTOKEN_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace requests { 20 | 21 | class RequestToken { 22 | 23 | public: 24 | 25 | RequestToken(infinity::core::Context *context); 26 | 27 | void reset(); 28 | 29 | void setRegion(infinity::memory::Region * region); 30 | infinity::memory::Region * getRegion(); 31 | 32 | void setCompleted(bool success); 33 | bool wasSuccessful(); 34 | 35 | bool checkIfCompleted(); 36 | void waitUntilCompleted(); 37 | 38 | void setImmediateValue(uint32_t immediateValue); 39 | bool hasImmediateValue(); 40 | uint32_t getImmediateValue(); 41 | 42 | void setUserData(void* userData, uint32_t userDataSize); 43 | bool hasUserData(); 44 | void* getUserData(); 45 | uint32_t getUserDataSize(); 46 | 47 | protected: 48 | 49 | infinity::core::Context * const context; 50 | infinity::memory::Region * region; 51 | 52 | std::atomic completed; 53 | std::atomic success; 54 | 55 | void *userData; 56 | uint32_t userDataSize; 57 | bool userDataValid; 58 | 59 | uint32_t immediateValue; 60 | bool immediateValueValid; 61 | 62 | }; 63 | 64 | } /* namespace requests */ 65 | } /* namespace infinity */ 66 | 67 | #endif /* REQUESTS_REQUESTTOKEN_H_ */ 68 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Address.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Address 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Address.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | namespace infinity { 21 | namespace utils { 22 | 23 | char* Address::getIpAddressOfInterface(const char* interfaceName) { 24 | 25 | struct ifaddrs *ifAddr; 26 | struct ifaddrs *ifa; 27 | char *ipAddress = (char*) calloc(16, sizeof(char)); 28 | 29 | int returnValue = getifaddrs(&ifAddr); 30 | INFINITY_ASSERT(returnValue != -1, "[INFINITY][UTILS][ADDRESS] Cannot read interface list.\n"); 31 | 32 | for (ifa = ifAddr; ifa != NULL; ifa = ifa->ifa_next) { 33 | if (ifa->ifa_addr == NULL) { 34 | continue; 35 | } 36 | if ((ifa->ifa_addr->sa_family == AF_INET) && (strcasecmp(interfaceName, ifa->ifa_name) == 0)) { 37 | sprintf(ipAddress, "%s", inet_ntoa(((struct sockaddr_in *) ifa->ifa_addr)->sin_addr)); 38 | break; 39 | } 40 | } 41 | INFINITY_ASSERT(ifa != NULL, "[INFINITY][UTILS][ADDRESS] Cannot find interface named %s.\n", interfaceName); 42 | 43 | freeifaddrs(ifAddr); 44 | 45 | return ipAddress; 46 | 47 | } 48 | 49 | uint32_t Address::getIpAddressAsUint32(const char* ipAddress) { 50 | 51 | uint32_t ipAddressNumbers[4]; 52 | sscanf(ipAddress, "%d.%d.%d.%d", &ipAddressNumbers[3], &ipAddressNumbers[2], &ipAddressNumbers[1], &ipAddressNumbers[0]); 53 | uint32_t ipAddressNumber(ipAddressNumbers[0] | 
ipAddressNumbers[1] << 8 | ipAddressNumbers[2] << 16 | ipAddressNumbers[3] << 24); 54 | return ipAddressNumber; 55 | } 56 | 57 | } /* namespace utils */ 58 | } /* namespace infinity */ 59 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Address.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Address 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef UTILS_ADDRESS_H_ 10 | #define UTILS_ADDRESS_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace utils { 16 | 17 | class Address { 18 | 19 | public: 20 | 21 | static char * getIpAddressOfInterface(const char *interfaceName); 22 | static uint32_t getIpAddressAsUint32(const char *ipAddress); 23 | 24 | }; 25 | 26 | } /* namespace utils */ 27 | } /* namespace infinity */ 28 | 29 | #endif /* UTILS_ADDRESS_H_ */ 30 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Debug.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Debug 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef UTILS_DEBUG_H_ 10 | #define UTILS_DEBUG_H_ 11 | 12 | #include 13 | #include 14 | 15 | #ifdef INFINITY_DEBUG_ON 16 | #define INFINITY_DEBUG(X, ...) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout);} 17 | #else 18 | #define INFINITY_DEBUG(X, ...) {} 19 | #endif 20 | 21 | #ifdef INFINITY_ASSERT_ON 22 | #define INFINITY_ASSERT(B, X, ...) {if(!(B)) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout); exit(-1);}} 23 | #else 24 | #define INFINITY_ASSERT(B, X, ...) {} 25 | #endif 26 | 27 | #endif /* UTILS_DEBUG_H_ */ 28 | -------------------------------------------------------------------------------- /csrc/include/qvf/com_endpoint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | namespace qvf { 4 | class ComEndPoint { 5 | private: 6 | std::string ip_address; 7 | int port; 8 | int rank; 9 | 10 | public: 11 | ComEndPoint() {} 12 | 13 | ComEndPoint(int rank, std::string ip_address, int port) 14 | : rank(rank), ip_address(ip_address), port(port) {} 15 | 16 | ComEndPoint& operator=(const ComEndPoint& other) { 17 | this->rank = other.rank; 18 | this->ip_address = other.ip_address; 19 | this->port = other.port; 20 | return *this; 21 | } 22 | 23 | void set_data(int rank, std::string ip_address, int port) { 24 | this->rank = rank; 25 | this->ip_address = ip_address; 26 | this->port = port; 27 | } 28 | 29 | std::string get_address(void) { return ip_address; } 30 | int get_port(void) { return port; } 31 | int get_rank(void) { return rank; } 32 | }; 33 | } // namespace qvf 34 | -------------------------------------------------------------------------------- /csrc/include/qvf/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #define QUIVER_FEATURE_ASSERT(B, X, ...) 
\ 6 | { \ 7 | if (!(B)) { \ 8 | fprintf(stdout, X, ##__VA_ARGS__); \ 9 | fflush(stdout); \ 10 | exit(-1); \ 11 | } \ 12 | } 13 | -------------------------------------------------------------------------------- /csrc/include/qvf/dist_tensor_client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace qvf { 21 | struct CollectionTask { 22 | public: 23 | void* base_address; 24 | int collect_from; 25 | int64_t* local_offsets; 26 | int64_t* remote_offsets; 27 | int64_t size; 28 | 29 | public: 30 | CollectionTask() {} 31 | CollectionTask(void* base_address, 32 | int64_t* local_offsets, 33 | int64_t* remote_offsets, 34 | int64_t size, 35 | int collect_from) 36 | : base_address(base_address), 37 | local_offsets(local_offsets), 38 | remote_offsets(remote_offsets), 39 | size(size), 40 | collect_from(collect_from) {} 41 | }; 42 | class DistTensorClient { 43 | public: 44 | std::vector pipes; 45 | std::vector com_endpoints; 46 | 47 | // About communication 48 | PipeParam pipe_param; 49 | int server_size; 50 | int server_rank; 51 | 52 | // About IB 53 | infinity::core::Context* context; 54 | infinity::queues::QueuePairFactory* qpFactory; 55 | 56 | infinity::memory::Buffer* tensor_buffer; 57 | infinity::memory::RegionToken* tensor_token; 58 | 59 | // about feature client 60 | std::deque task_queue; 61 | 62 | public: 63 | DistTensorClient(int server_rank, 64 | std::vector com_endpoints, 65 | PipeParam pipe_param) { 66 | this->server_rank = server_rank; 67 | this->com_endpoints = com_endpoints; 68 | this->pipe_param = pipe_param; 69 | server_size = com_endpoints.size(); 70 | init_connection(); 71 | } 72 | 73 | void init_connection() { 74 | context = new infinity::core::Context(); 75 | qpFactory = new infinity::queues::QueuePairFactory(context); 76 | pipes.resize(server_size); 77 | for (int idx = 0; idx < server_size; idx++) { 78 | if (com_endpoints[idx].get_rank() == server_rank) { 79 | continue; 80 | } 81 | pipes[com_endpoints[idx].get_rank()] = 82 | new Pipe(context, qpFactory, com_endpoints[idx], pipe_param); 83 | pipes[com_endpoints[idx].get_rank()]->connect(); 84 | } 85 | } 86 | 87 | torch::Tensor create_registered_float32_tensor( 88 | std::vector tensor_shape) { 89 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2, 90 | "Only support 2-dimensional tensor"); 91 | auto tensor_option = torch::TensorOptions().dtype(torch::kFloat32); 92 | uint64_t size_in_bytes = 4; 93 | for (int index = 0; index < tensor_shape.size(); index++) { 94 | size_in_bytes *= tensor_shape[index]; 95 | } 96 | tensor_buffer = new infinity::memory::Buffer(context, size_in_bytes); 97 | tensor_token = tensor_buffer->createRegionToken(); 98 | return torch::from_blob(tensor_buffer->getData(), 99 | {tensor_shape[0], tensor_shape[1]}, tensor_option); 100 | } 101 | 102 | void register_float_tensor(torch::Tensor& float_tensor) { 103 | QUIVER_FEATURE_ASSERT( 104 | float_tensor.dim() == 2, 105 | "Only support 2-dimensional tensor, But got %d-dimensional tensor\n", 106 | float_tensor.dim()); 107 | 108 | uint64_t size_in_bytes = float_tensor.element_size() * float_tensor.numel(); 109 | 110 | tensor_buffer = new infinity::memory::Buffer( 111 | context, float_tensor.data_ptr(), size_in_bytes); 112 | 113 | tensor_token = tensor_buffer->createRegionToken(); 114 | } 115 | 116 
| torch::Tensor create_registered_float32_tensor_cuda( 117 | std::vector tensor_shape, 118 | int device) { 119 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2, 120 | "Only support 2-dimensional tensor"); 121 | uint64_t size_in_bytes = 4; 122 | for (int index = 0; index < tensor_shape.size(); index++) { 123 | size_in_bytes *= tensor_shape[index]; 124 | } 125 | tensor_buffer = 126 | new infinity::memory::Buffer(context, size_in_bytes, device); 127 | tensor_token = tensor_buffer->createRegionToken(); 128 | auto tensor_option = torch::TensorOptions() 129 | .dtype(torch::kFloat32) 130 | .device(torch::kCUDA, device); 131 | return torch::from_blob(tensor_buffer->getData(), 132 | {tensor_shape[0], tensor_shape[1]}, tensor_option); 133 | } 134 | 135 | void sync_read(int server_rank, 136 | torch::Tensor& res_tensor, 137 | torch::Tensor& local_offsets, 138 | torch::Tensor& remote_offsets) { 139 | QUIVER_FEATURE_ASSERT( 140 | reinterpret_cast(res_tensor.data_ptr()) == 141 | tensor_buffer->getAddress(), 142 | "Result Tensor is not created from registered buffer"); 143 | 144 | pipes[server_rank]->read(tensor_buffer, local_offsets, remote_offsets, 145 | res_tensor.size(1) * res_tensor.element_size()); 146 | } 147 | 148 | void collect_inner(CollectionTask collection_task) { 149 | task_queue.push_back(collection_task); 150 | } 151 | 152 | void start_feature_client() {} 153 | }; 154 | } // namespace qvf 155 | -------------------------------------------------------------------------------- /csrc/include/qvf/dist_tensor_server.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | namespace qvf { 22 | class DistTensorServer { 23 | private: 24 | int port; 25 | int world_size; 26 | int qp_per_pipe; 27 | 28 | infinity::core::Context* context; 29 | infinity::queues::QueuePairFactory* qpFactory; 30 | infinity::memory::Buffer* feature_buffer; 31 | infinity::memory::RegionToken* bufferToken; 32 | 33 | std::thread server_thread; 34 | 35 | public: 36 | DistTensorServer(int port, int world_size, int qp_per_pipe) 37 | : port(port), world_size(world_size), qp_per_pipe(qp_per_pipe) { 38 | context = new infinity::core::Context(); 39 | qpFactory = new infinity::queues::QueuePairFactory(context); 40 | qpFactory->bindToPort(port); 41 | } 42 | 43 | void join() { server_thread.join(); } 44 | 45 | void serve(void* data, int64_t size_in_bytes) { 46 | feature_buffer = 47 | new infinity::memory::Buffer(context, data, (uint64_t)size_in_bytes); 48 | bufferToken = feature_buffer->createRegionToken(); 49 | server_thread = 50 | std::thread(run, qpFactory, bufferToken, qp_per_pipe * world_size); 51 | } 52 | 53 | void serve_tensor(torch::Tensor& data) { 54 | std::cout << "Registering Buffer, Please Wait..." << std::endl; 55 | uint64_t size_in_bytes = data.numel() * data.element_size(); 56 | 57 | feature_buffer = new infinity::memory::Buffer( 58 | context, data.data_ptr(), size_in_bytes); 59 | bufferToken = feature_buffer->createRegionToken(); 60 | server_thread = std::thread(run, qpFactory, bufferToken, 61 | qp_per_pipe * (world_size - 1)); 62 | } 63 | 64 | static void run(infinity::queues::QueuePairFactory* qpFactory, 65 | infinity::memory::RegionToken* bufferToken, 66 | int total_qp_num) { 67 | std::cout << "Buffer Registration Done!
Ready To Receive Connections, " 68 | "Start Your Clients Now" 69 | << std::endl; 70 | for (int qp_index = 0; qp_index < total_qp_num; qp_index++) { 71 | qpFactory->acceptIncomingConnection( 72 | bufferToken, sizeof(infinity::memory::RegionToken)); 73 | } 74 | 75 | while (1) { 76 | std::this_thread::sleep_for(std::chrono::seconds(10)); // 10s 77 | } 78 | } 79 | }; 80 | 81 | } // namespace qvf 82 | -------------------------------------------------------------------------------- /csrc/include/qvf/pipe.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | namespace qvf { 19 | 20 | // Pipe are used for single side RDMA read to remote data servers 21 | struct PipeParam { 22 | int qp_num; 23 | int ctx_poll_batch; 24 | int tx_depth; 25 | int post_list_size; 26 | PipeParam() {} 27 | PipeParam(int qp_num, 28 | int ctx_poll_batch, 29 | int tx_depth, 30 | int post_list_size) { 31 | this->qp_num = qp_num; 32 | this->ctx_poll_batch = ctx_poll_batch; 33 | this->tx_depth = tx_depth; 34 | this->post_list_size = post_list_size; 35 | } 36 | void set_params(int qp_num, 37 | int ctx_poll_batch, 38 | int tx_depth, 39 | int post_list_size) { 40 | this->qp_num = qp_num; 41 | this->ctx_poll_batch = ctx_poll_batch; 42 | this->tx_depth = tx_depth; 43 | this->post_list_size = post_list_size; 44 | } 45 | void set_param_vec(std::vector param_vec){ 46 | qp_num = param_vec[0]; 47 | ctx_poll_batch = param_vec[1]; 48 | tx_depth = param_vec[2]; 49 | post_list_size = param_vec[3]; 50 | } 51 | 52 | std::vector get_param_vec(){ 53 | std::vector params; 54 | params.push_back(qp_num); 55 | params.push_back(ctx_poll_batch); 56 | params.push_back(tx_depth); 57 | params.push_back(post_list_size); 58 | return params; 59 | } 60 | 61 | PipeParam& operator=(const PipeParam& pipe_param) { 62 | set_params(pipe_param.qp_num, pipe_param.ctx_poll_batch, 63 | pipe_param.tx_depth, pipe_param.post_list_size); 64 | return *this; 65 | } 66 | }; 67 | 68 | class Pipe { 69 | private: 70 | ComEndPoint remote_end; 71 | PipeParam pipe_param; 72 | std::vector remote_buffer_tokens; 73 | std::vector qps; 74 | std::vector requests; 75 | infinity::queues::SendRequestBuffer send_buffer; 76 | infinity::core::Context* context; 77 | infinity::queues::QueuePairFactory* qpFactory; 78 | infinity::queues::IbvWcBuffer wc_buffer; 79 | int requests_size; 80 | bool connected; 81 | 82 | public: 83 | Pipe() : connected(false) {} 84 | Pipe(infinity::core::Context* context, 85 | infinity::queues::QueuePairFactory* qpFactory, 86 | ComEndPoint com_endpoint, 87 | PipeParam pipe_param) { 88 | this->context = context; 89 | this->qpFactory = qpFactory; 90 | this->remote_end = com_endpoint; 91 | this->pipe_param = pipe_param; 92 | connected = false; 93 | } 94 | 95 | Pipe& operator=(const Pipe& pipe) { 96 | if (pipe.connected) { 97 | fprintf(stderr, "Pipe can only be assigned before connect"); 98 | } 99 | this->remote_end = pipe.remote_end; 100 | this->pipe_param = pipe.pipe_param; 101 | this->context = pipe.context; 102 | this->qpFactory = pipe.qpFactory; 103 | connected = false; 104 | return *this; 105 | } 106 | 107 | void connect() { 108 | qps.resize(pipe_param.qp_num); 109 | remote_buffer_tokens.resize(pipe_param.qp_num); 110 | requests_size = 111 | pipe_param.tx_depth / pipe_param.post_list_size; 112 | requests.resize(requests_size); 113 | 
send_buffer.resize(pipe_param.post_list_size); 114 | wc_buffer.resize(pipe_param.ctx_poll_batch); 115 | for (int qp_index = 0; qp_index < pipe_param.qp_num; qp_index++) { 116 | qps[qp_index] = qpFactory->connectToRemoteHost( 117 | remote_end.get_address().c_str(), remote_end.get_port()); 118 | remote_buffer_tokens[qp_index] = 119 | (infinity::memory::RegionToken*)qps[qp_index]->getUserData(); 120 | } 121 | 122 | for (int request_index = 0; request_index < requests.size(); 123 | request_index++) { 124 | requests[request_index] = new infinity::requests::RequestToken(context); 125 | } 126 | connected = true; 127 | } 128 | 129 | void read(infinity::memory::Buffer* local_buffer, 130 | std::vector local_offsets, 131 | std::vector remote_offsets, 132 | uint64_t stride) { 133 | uint64_t post_list_cnt = 134 | (local_offsets.size() + pipe_param.post_list_size - 1) / 135 | pipe_param.post_list_size; 136 | 137 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check 138 | // Local_Offset_Size "<< remote_offsets.size()<multiRead( 150 | batch_read_size, local_buffer, 151 | &local_offsets[post_index * pipe_param.post_list_size], 152 | remote_buffer_tokens[post_index % pipe_param.qp_num], 153 | &remote_offsets[post_index * pipe_param.post_list_size], stride, 154 | infinity::queues::OperationFlags(), requests[epoch_scnt], 155 | send_buffer); 156 | epoch_scnt += 1; 157 | 158 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) { 159 | context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch, 160 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1); 161 | epoch_scnt = 0; 162 | } 163 | } 164 | } 165 | 166 | void read(infinity::memory::Buffer* local_buffer, 167 | torch::Tensor& local_offsets_tensor, 168 | torch::Tensor& remote_offsets_tensor, 169 | uint64_t stride) { 170 | QUIVER_FEATURE_ASSERT(local_offsets_tensor.dim() == 1, 171 | "local_offsets should be 1-dimensional tensor"); 172 | QUIVER_FEATURE_ASSERT(remote_offsets_tensor.dim() == 1, 173 | "local_offsets should be 1-dimensional tensor"); 174 | QUIVER_FEATURE_ASSERT( 175 | remote_offsets_tensor.size(0) == local_offsets_tensor.size(0), 176 | "local_offsets and remote_offsets should have the same length"); 177 | 178 | int64_t* local_offsets = local_offsets_tensor.data_ptr(); 179 | int64_t* remote_offsets = remote_offsets_tensor.data_ptr(); 180 | 181 | uint64_t post_list_cnt = 182 | (local_offsets_tensor.size(0) + pipe_param.post_list_size - 1) / 183 | pipe_param.post_list_size; 184 | 185 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check 186 | // Local_Offset_Size "<< remote_offsets.size()<multiRead( 203 | batch_read_size, local_buffer, 204 | &local_offsets[post_index * pipe_param.post_list_size], 205 | remote_buffer_tokens[post_index % pipe_param.qp_num], 206 | &remote_offsets[post_index * pipe_param.post_list_size], stride, 207 | infinity::queues::OperationFlags(), requests[epoch_scnt], 208 | send_buffer); 209 | epoch_scnt += 1; 210 | 211 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) { 212 | int cq_num = context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch, 213 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1); 214 | epoch_scnt -= cq_num; 215 | } 216 | } 217 | } 218 | }; 219 | } // namespace qvf 220 | -------------------------------------------------------------------------------- /csrc/include/qvf/qvf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 
#include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | -------------------------------------------------------------------------------- /csrc/include/qvf/range.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace qvf { 5 | class Range { 6 | private: 7 | int64_t start; 8 | int64_t end; 9 | 10 | public: 11 | Range() {} 12 | Range(int64_t start, int64_t end) : start(start), end(end) {} 13 | void set_params(int64_t start, int64_t end) { 14 | this->start = start; 15 | this->end = end; 16 | } 17 | Range& operator=(const Range& other) { 18 | this->start = other.start; 19 | this->end = other.end; 20 | return *this; 21 | } 22 | int64_t range_start() { return start; } 23 | int64_t range_end() { return end; } 24 | }; 25 | } // namespace qvf 26 | -------------------------------------------------------------------------------- /csrc/include/qvf/shared_loader.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by joker on 2022/5/15. 3 | // 4 | 5 | #ifndef QUIVER_FEATURE_SHAREDLOADER_H 6 | #define QUIVER_FEATURE_SHAREDLOADER_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace qvf { 14 | 15 | using caffe2::serialize::PyTorchStreamReader; 16 | using caffe2::serialize::ReadAdapterInterface; 17 | 18 | template 19 | struct Rob { 20 | friend typename Tag::type get(Tag) { return M; } 21 | }; 22 | 23 | #define ROB_FIELD_FROM_READER(FieldType, FieldName) \ 24 | struct PyTorchStreamReader_##FieldName { \ 25 | typedef FieldType PyTorchStreamReader::*type; \ 26 | friend type get(PyTorchStreamReader_##FieldName); \ 27 | }; \ 28 | template struct Rob 30 | 31 | ROB_FIELD_FROM_READER(std::string, archive_name_plus_slash_); 32 | ROB_FIELD_FROM_READER(std::unique_ptr, ar_); 33 | ROB_FIELD_FROM_READER(std::mutex, reader_lock_); 34 | 35 | struct TORCH_API SharedLoader { 36 | PyTorchStreamReader reader; 37 | explicit SharedLoader(const std::string& file_name) : reader(file_name) {} 38 | explicit SharedLoader(std::istream* in) : reader(in) {} 39 | explicit SharedLoader(std::shared_ptr in) 40 | : reader(in) {} 41 | void valid(const char* what, const char* info = ""); 42 | std::tuple getRecord(const std::string& name); 43 | size_t getRecordID(const std::string& name); 44 | size_t getRecordOffset(const std::string& name) { 45 | return reader.getRecordOffset(name); 46 | } 47 | bool hasRecord(const std::string& name) { return reader.hasRecord(name); } 48 | std::vector getAllRecords() { return reader.getAllRecords(); } 49 | }; 50 | 51 | } // namespace qvf 52 | #endif // QUIVER_FEATURE_SHAREDLOADER_H 53 | -------------------------------------------------------------------------------- /csrc/include/qvf/tensor_endpoint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | namespace qvf { 5 | class TensorEndPoint { 6 | public: 7 | ComEndPoint com_endpoint; 8 | Range range; 9 | 10 | public: 11 | TensorEndPoint(ComEndPoint com_endpoint, Range range) { 12 | this->com_endpoint = com_endpoint; 13 | this->range = range; 14 | } 15 | 16 | TensorEndPoint(int rank, 17 | std::string ip, 18 | int port, 19 | int64_t range_start, 20 | int64_t range_end) { 21 | this->com_endpoint = ComEndPoint(rank, ip, port); 22 | this->range = Range(range_start, range_end); 23 | } 24 | 25 | TensorEndPoint(std::string ip, int port, int rank, Range range) { 26 | this->com_endpoint = ComEndPoint(rank, ip, 
port); 27 | this->range = range; 28 | } 29 | 30 | TensorEndPoint& operator=(const TensorEndPoint& other) { 31 | this->com_endpoint = other.com_endpoint; 32 | this->range = other.range; 33 | return *this; 34 | } 35 | }; 36 | } // namespace qvf 37 | -------------------------------------------------------------------------------- /csrc/src/module.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void register_TensorEndPoint(pybind11::module& m); 8 | void register_DistTensorServer(pybind11::module& m); 9 | void register_PipeParam(pybind11::module& m); 10 | void register_DistTensorClient(pybind11::module& m); 11 | void register_ComEndPoint(pybind11::module& m); 12 | void register_SharedStorageReader(pybind11::module& m); 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | register_TensorEndPoint(m); 15 | register_DistTensorServer(m); 16 | register_PipeParam(m); 17 | register_DistTensorClient(m); 18 | register_ComEndPoint(m); 19 | register_SharedStorageReader(m); 20 | } 21 | -------------------------------------------------------------------------------- /csrc/src/register.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void register_TensorEndPoint(pybind11::module& m) { 10 | // define TensorEndPoint 11 | py::class_(m, "TensorEndPoint") 12 | .def(py::init()); 13 | } 14 | 15 | void register_ComEndPoint(pybind11::module& m) { 16 | // define ComEndPoint 17 | py::class_(m, "ComEndPoint") 18 | .def(py::init()) 19 | .def(py::init<>()) 20 | .def("rank", &qvf::ComEndPoint::get_rank, py::call_guard()) 21 | .def("address", &qvf::ComEndPoint::get_address, py::call_guard()) 22 | .def("port", &qvf::ComEndPoint::get_port, py::call_guard()); 23 | } 24 | 25 | void register_DistTensorServer(pybind11::module& m) { 26 | // define TensorEndPoint 27 | py::class_(m, "DistTensorServer") 28 | .def(py::init()) 29 | .def("serve_tensor", &qvf::DistTensorServer::serve_tensor, 30 | py::call_guard()) 31 | .def("join", &qvf::DistTensorServer::join, 32 | py::call_guard()); 33 | } 34 | 35 | void register_PipeParam(pybind11::module& m) { 36 | py::class_(m, "PipeParam") 37 | .def(py::init()) 38 | .def(py::init<>()) 39 | .def("get_param_vec", &qvf::PipeParam::get_param_vec, py::call_guard()) 40 | .def("set_param_vec", &qvf::PipeParam::set_param_vec, py::call_guard()) 41 | ; 42 | } 43 | 44 | void register_DistTensorClient(pybind11::module& m) { 45 | py::class_(m, "DistTensorClient") 46 | .def(py::init, qvf::PipeParam>()) 47 | .def("create_registered_float32_tensor", 48 | &qvf::DistTensorClient::create_registered_float32_tensor, 49 | py::call_guard()) 50 | .def("register_float_tensor", 51 | &qvf::DistTensorClient::register_float_tensor, 52 | py::call_guard()) 53 | .def("create_registered_float32_tensor_cuda", 54 | &qvf::DistTensorClient::create_registered_float32_tensor_cuda, 55 | py::call_guard()) 56 | 57 | .def("sync_read", &qvf::DistTensorClient::sync_read, 58 | py::call_guard()); 59 | } 60 | 61 | void register_SharedStorageReader(pybind11::module& m) { 62 | class BufferAdapter : public caffe2::serialize::ReadAdapterInterface { 63 | public: 64 | BufferAdapter(const py::object& buffer) : buffer_(buffer) { 65 | // Jump to the end of the buffer to get its size 66 | auto current = buffer.attr("tell")(); 67 | start_offset_ = py::cast(current); 68 | buffer.attr("seek")(current, 
py::module::import("os").attr("SEEK_END")); 69 | size_ = py::cast(buffer.attr("tell")()) - start_offset_; 70 | buffer.attr("seek")(current); 71 | 72 | // If we can read directly into a buffer, do that instead of an extra copy 73 | use_readinto_ = py::hasattr(buffer, "readinto"); 74 | } 75 | 76 | size_t size() const override { return size_; } 77 | 78 | THPObjectPtr getMemview(void* buf, size_t n) const { 79 | THPObjectPtr memview(PyMemoryView_FromMemory(reinterpret_cast(buf), 80 | n, PyBUF_WRITE)); 81 | if (!memview) { 82 | throw python_error(); 83 | } 84 | return memview; 85 | } 86 | 87 | size_t read(uint64_t pos, 88 | void* buf, 89 | size_t n, 90 | const char* what) const override { 91 | // Seek to desired position (NB: this has to be a Py_ssize_t or Python 92 | // throws a weird error) 93 | Py_ssize_t absolute_pos = start_offset_ + pos; 94 | buffer_.attr("seek")(absolute_pos); 95 | 96 | if (use_readinto_) { 97 | auto memview = getMemview(buf, n); 98 | auto res = 99 | PyObject_CallMethod(buffer_.ptr(), "readinto", "O", memview.get()); 100 | if (res) { 101 | int64_t i = static_cast(PyLong_AsLongLong(res)); 102 | if (i > 0) { 103 | return i; 104 | } 105 | } 106 | } 107 | 108 | // Read bytes into `buf` from the buffer 109 | std::string bytes = py::cast(buffer_.attr("read")(n)); 110 | std::copy(bytes.data(), bytes.data() + bytes.size(), 111 | reinterpret_cast(buf)); 112 | return bytes.size(); 113 | } 114 | 115 | py::object buffer_; 116 | size_t size_; 117 | size_t start_offset_; 118 | bool use_readinto_; 119 | }; 120 | py::class_>( 121 | m, "SharedTensorLoader") 122 | .def(py::init()) 123 | .def(py::init([](const py::object& buffer) { 124 | auto adapter = std::make_unique(buffer); 125 | return std::make_shared(std::move(adapter)); 126 | })) 127 | .def("get_record", 128 | [](qvf::SharedLoader& self, const std::string& key) { 129 | at::DataPtr data; 130 | size_t size = 0; 131 | std::tie(data, size) = self.getRecord(key); 132 | return py::bytes(reinterpret_cast(data.get()), size); 133 | }) 134 | .def("has_record", 135 | [](qvf::SharedLoader& self, const std::string& key) { 136 | return self.hasRecord(key); 137 | }) 138 | .def("get_storage_from_record", 139 | [](qvf::SharedLoader& self, const std::string& key, size_t numel, 140 | py::object data_type_obj) { 141 | at::DataPtr data(std::get<0>(self.getRecord(key))); 142 | auto scalar_type = 143 | reinterpret_cast(data_type_obj.ptr())->scalar_type; 144 | 145 | c10::Storage storage(c10::Storage::use_byte_size_t(), 146 | numel * elementSize(scalar_type), 147 | std::move(data), 148 | /*allocator=*/nullptr, 149 | /*resizable=*/false); 150 | auto ptr = 151 | c10::make_intrusive( 152 | std::move(storage), at::DispatchKeySet(), 153 | at::CPU(scalar_type).typeMeta()); 154 | return at::Tensor(std::move(ptr)); 155 | }) 156 | .def("get_all_records", 157 | [](qvf::SharedLoader& self) { return self.getAllRecords(); }); 158 | } -------------------------------------------------------------------------------- /csrc/src/shared_loader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | extern "C" { 5 | #include 6 | } 7 | 8 | #define RB(x) get(PyTorchStreamReader_##x()) 9 | 10 | at::DataPtr new_fd_storage(ptrdiff_t size) { 11 | int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE | 12 | at::ALLOCATOR_MAPPED_KEEPFD | at::ALLOCATOR_MAPPED_UNLINK; 13 | std::string handle = at::NewProcessWideShmHandle(); 14 | auto sptr = at::MapAllocator::makeDataPtr(handle.c_str(), flags, 
15 | size * sizeof(uint8_t), nullptr); 16 | 17 | return sptr; 18 | } 19 | 20 | size_t qvf::SharedLoader::getRecordID(const std::string& name) { 21 | std::string ss = reader.*RB(archive_name_plus_slash_) + name; 22 | size_t result = mz_zip_reader_locate_file((reader.*RB(ar_)).get(), ss.c_str(), 23 | nullptr, 0); 24 | valid("locating file ", name.c_str()); 25 | return result; 26 | } 27 | 28 | std::tuple qvf::SharedLoader::getRecord( 29 | const std::string& name) { 30 | std::lock_guard guard(reader.*RB(reader_lock_)); 31 | size_t key = getRecordID(name); 32 | mz_zip_archive_file_stat stat; 33 | mz_zip_reader_file_stat((reader.*RB(ar_)).get(), key, &stat); 34 | valid("retrieving file meta-data for ", name.c_str()); 35 | at::DataPtr retval = new_fd_storage(stat.m_uncomp_size); 36 | mz_zip_reader_extract_to_mem((reader.*RB(ar_)).get(), key, retval.get(), 37 | stat.m_uncomp_size, 0); 38 | valid("reading file ", name.c_str()); 39 | 40 | return std::make_tuple(std::move(retval), stat.m_uncomp_size); 41 | } 42 | 43 | void qvf::SharedLoader::valid(const char* what, const char* info) { 44 | const auto err = mz_zip_get_last_error((reader.*RB(ar_)).get()); 45 | TORCH_CHECK(err == MZ_ZIP_NO_ERROR, "PytorchStreamReader failed ", what, info, 46 | ": ", mz_zip_get_error_string(err)); 47 | } -------------------------------------------------------------------------------- /docs/imgs/Network Bandwidth Under 100Gbps IB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/Network Bandwidth Under 100Gbps IB.png -------------------------------------------------------------------------------- /docs/imgs/consistent_memory_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/consistent_memory_view.png -------------------------------------------------------------------------------- /docs/imgs/e2e_feature_collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection.png -------------------------------------------------------------------------------- /docs/imgs/e2e_feature_collection_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection_performance.png -------------------------------------------------------------------------------- /docs/imgs/gpu0_centered_access_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/gpu0_centered_access_performance.png -------------------------------------------------------------------------------- /docs/imgs/memory_usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/memory_usage.png -------------------------------------------------------------------------------- /docs/imgs/multi_qp.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/multi_qp.png -------------------------------------------------------------------------------- /docs/imgs/one_batch_feature_collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/one_batch_feature_collection.png -------------------------------------------------------------------------------- /docs/imgs/peak_memory_footprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/peak_memory_footprint.png -------------------------------------------------------------------------------- /docs/imgs/pgas_tensor_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_access.png -------------------------------------------------------------------------------- /docs/imgs/pgas_tensor_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_view.png -------------------------------------------------------------------------------- /docs/imgs/range_partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/range_partition.png -------------------------------------------------------------------------------- /docs/imgs/rdma_mtt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/rdma_mtt.png -------------------------------------------------------------------------------- /docs/imgs/shared_load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/shared_load.png -------------------------------------------------------------------------------- /docs/imgs/subset_signaled_requests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/subset_signaled_requests.png -------------------------------------------------------------------------------- /docs/imgs/train_gnn_on_large_graphs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/train_gnn_on_large_graphs.png -------------------------------------------------------------------------------- /docs/memory.md: -------------------------------------------------------------------------------- 1 | # Peak Memory Footprint Optimization In Quiver-Feature 2 | 3 | By default, Quiver-Feature uses the `range partition` method to partition the original giant feature array onto
different machines. This is easy to understand; let's take a closer look at memory usage on each machine. 4 | 5 | ![range_partition](imgs/range_partition.png) 6 | 7 | On each machine: 8 | 1. The feature tensor needs to be pinned so that the RNIC and the GPU can access its memory directly. 9 | 10 | 2. The feature tensor should live in SHM because multiple processes need to access its data. 11 | 12 | ![memory_usage](imgs/memory_usage.png) 13 | 14 | Pinning memory doesn't consume extra memory, but moving a torch.Tensor to SHM causes a peak memory usage of 15 | 2x the original data size. 16 | 17 | To solve this problem, we implement `quiver_feature.shared_load` as a replacement for the original `torch.load`. **`quiver_feature.shared_load` behaves almost exactly like `torch.load` except that it loads data directly into SHM**. So the peak memory while creating `DistTensorPGAS` with `quiver_feature.shared_load` is only around the original data size, **half of that when using torch.load**. 18 | 19 | ![shared_load](imgs/shared_load.png) 20 | 21 | ![peak_memory](imgs/peak_memory_footprint.png) 22 | 23 | You can check our [test script](../tests/python/test_SharedLoader.py) for more details. 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/partition_methods.md: -------------------------------------------------------------------------------- 1 | # Partition Methods 2 | 3 | This doc mainly describes the feature partition methods we use in `Quiver-Feature`. 4 | 5 | # Metadata Of Each Partition 6 | 7 | The default metadata for each partition is `TensorEndPoint`, which records the `Range` information of each server. 8 | 9 | ```python 10 | Range = namedtuple("Range", ["start", "end"]) 11 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"]) 12 | 13 | ``` 14 | For example, in the following partition setting, we have a list of `TensorEndPoint` as shown below. With this list, we can easily compute the `server_rank` and `local offset` of a certain node idx. 15 | 16 | ```python 17 | [ 18 | TensorEndPoint(server_rank=0, ip=ip0, port=port0, range=Range(start=0, end=M)), 19 | TensorEndPoint(server_rank=1, ip=ip1, port=port1, range=Range(start=M, end=N)) 20 | ] 21 | ``` 22 | 23 | ![](imgs/range_partition.png) 24 | 25 | # Range Partition 26 | Range partition is the default partition method we support for now. Taking the following partition setting as an example, we simply assign [0, M) to Machine0 and [M, N) to Machine1. 27 | 28 | ![](imgs/range_partition.png) -------------------------------------------------------------------------------- /docs/rdma_details.md: -------------------------------------------------------------------------------- 1 | # RDMA Details 2 | 3 | This doc mainly describes how we use RDMA for remote data access and summarizes the techniques we use to get the best RDMA performance. 4 | 5 | Before we start, we would like to show our appreciation to [@claudebarthels](https://github.com/claudebarthels) for developing [infinity](https://github.com/claudebarthels/infinity), a lightweight C++ RDMA library for IB which is also the code base for our RDMA implementation. 6 | 7 | 8 | ## Use RDMA READ for Feature Collection 9 | 10 | As we mentioned in the [README](../README.md), `quiver_feature.DistTensorPGAS` is a 2-dimensional distributed tensor abstraction over different memory spaces using the `PGAS` (Partitioned Global Address Space) model, and **`quiver_feature.DistTensorPGAS` is partitioned by row onto different machines**.
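To make the row-to-server arithmetic concrete, here is a minimal sketch of the lookup. The `Range` and `TensorEndPoint` tuples mirror `quiver_feature.common`; the `locate` helper itself is an illustration made up for this doc, not the library's internal code.

```python
from collections import namedtuple

Range = namedtuple("Range", ["start", "end"])
TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])

def locate(row_idx, tensor_endpoints, feature_dim, element_size=4):
    # Find the server whose [start, end) range holds row_idx, then turn the
    # global row index into a local byte offset on that server.
    for ep in tensor_endpoints:
        if ep.range.start <= row_idx < ep.range.end:
            local_offset = (row_idx - ep.range.start) * feature_dim * element_size
            return ep.server_rank, local_offset
    raise IndexError(f"row {row_idx} is not covered by any partition")

endpoints = [
    TensorEndPoint(0, "10.0.0.1", 3344, Range(0, 1000)),
    TensorEndPoint(1, "10.0.0.2", 3344, Range(1000, 2000)),
]
assert locate(1200, endpoints, feature_dim=128) == (1, 200 * 128 * 4)
```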
11 | ![memory_view](imgs/pgas_tensor_view.png) 12 | 13 | By default, we use `range partition`. When we want to access a certain row of `quiver_feature.DistTensorPGAS`, **we can compute the target machine's index and the memory offset of this row on that target machine directly from the row index**, as sketched above. 14 | 15 | ![range_partition](imgs/range_partition.png) 16 | 17 | 18 | Since each row's data size is known in advance, **we can use one single `RDMA READ` to fetch the wanted row's data (which corresponds to a single node's feature)**. 19 | 20 | ![memory_view](imgs/pgas_tensor_view.png) 21 | 22 | So **each batch's feature collection involves millions of `RDMA READ`s**, one `READ` per node feature. 23 | 24 | ![feature_collection](imgs/one_batch_feature_collection.png) 25 | 26 | ## 4 Techniques We Use 27 | Feature collection involves millions of small `RDMA READ`s (each `READ` may read just 2KB of data), and we use the following 4 techniques to get the best performance. 28 | 29 | ### Rule 1: Use Multiple QPs Per Client 30 | 31 | RDMA hosts use Queue Pairs (QPs) to communicate with each other. Nowadays, RNICs contain a pool of processing units (PUs), and we believe that requests in the same QP are always processed by the same PU to avoid cross-PU synchronization. But a CPU is much more powerful than a single PU, so if we only use one QP per RDMA client, performance can easily be bottlenecked by that PU. We therefore use multiple QPs per RDMA client and dispatch READ requests evenly across these QPs to take full advantage of the RNIC's parallel processing ability. 32 | 33 | ![multi_qp](imgs/multi_qp.png) 34 | 35 | 36 | ### Rule 2: Only Set A Subset Of All Requests as Signaled 37 | 38 | Each RDMA read request can be set as signaled or unsignaled. Signaled requests need CPU intervention, but users can check their result status by polling CQs (Completion Queues). Unsignaled requests don't involve the CPU, but users have to find their own way to check whether these requests completed successfully. 39 | 40 | As we said before, each batch's feature collection involves millions of `RDMA READ` requests. For each QP, we send these requests sequentially but set only one request out of every `CQ_MOD` (which we usually set to 128) as signaled, i.e. we set only 1/128 of all requests as signaled and check their result status. We also set the last request as signaled and wait for its completion to make sure that all requests in this QP have completed. If all signaled requests report success, we assume that all requests completed successfully. 41 | 42 | In the future we may add more mechanisms for handling failures: if a signaled request has failed, we will retry its group of `CQ_MOD` requests. Even with that, we cannot guarantee that all requests complete successfully. 43 | 44 | ![subset_signaled](imgs/subset_signaled_requests.png) 45 | 46 |
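The signaling policy of Rule 2 can be summarized by the following schematic Python sketch. It is pseudocode for exposition only, not our C++ implementation; `qp.post_read` and `qp.poll_completions` are hypothetical stand-ins for the underlying verbs calls.

```python
CQ_MOD = 128  # signal one request out of every CQ_MOD

def post_reads(qp, requests):
    # Post all READs on one QP, marking only every CQ_MOD-th request (and
    # always the final one) as signaled. Completions are polled only for
    # those signaled markers, which keeps CPU intervention to a minimum.
    outstanding_signals = 0
    for i, req in enumerate(requests):
        signaled = (i % CQ_MOD == CQ_MOD - 1) or (i == len(requests) - 1)
        qp.post_read(req, signaled=signaled)
        if signaled:
            outstanding_signals += 1
    while outstanding_signals > 0:
        # poll_completions() returns the number of signaled completions seen.
        outstanding_signals -= qp.poll_completions()
```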
47 | ### Rule 3: Set QP's max_rd_atomic as the RNIC's max_qp_rd_atom 48 | 49 | `max_rd_atomic` is a crucial QP attribute for performance: it is the number of outstanding RDMA Reads and atomic operations that an RC QP can handle as an initiator at any time. We suggest you set it to the RNIC's `max_qp_rd_atom`, which you can get by calling `ibv_query_device()`. You can refer to [our code](https://github.com/quiver-team/quiver-feature/blob/main/csrc/include/infinity/queues/QueuePair.cpp#L38) to see how to set this attribute. 50 | 51 | ### Rule 4: Reduce Address Translation Overhead 52 | 53 | The RNIC uses DMA to access system memory. Since DMA can only handle physical addresses, the memory region exposed to the RNIC must be registered so that the RNIC stores the virtual-to-physical mapping of this region in its MTT (Memory Translation Table). The MTT is stored in system memory, but the RNIC's SRAM caches part of it. Every time the RNIC receives an RDMA read/write request, it first translates the user's virtual address to a physical address by looking it up in its MTT cache; if the cache misses, it sends a request through PCIe to fetch the mapping from system memory, which may bring severe overhead and thus degrade RDMA performance. 54 | 55 | ![rdma_mtt](imgs/rdma_mtt.png) 56 | 57 | To reduce this address translation overhead, we sort the requested node ids before sending RDMA requests. This increases memory access locality, so the RNIC's MTT cache gets a higher hit rate. 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /examples/mag240m/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The distributed training setup on the MAG240M dataset is almost the same as the [official example in DGL](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) except that we use `Quiver-Feature` for distributed feature collection. 4 | 5 | Our implementation is much faster than DGL's official example while achieving similar accuracy. 6 | 7 | # Data Preprocess & Partition 8 | 9 | First, please run [preprocess.py](./preprocess.py) to generate `graph.dgl` and `full.npy`; you can check [DGL's official guide](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) for more details. 10 | 11 | Then we use [Range Partition](../../docs/partition_methods.md) to partition the feature data. It is very easy to understand; you can check [preprocess_quiver.py](./preprocess_quiver.py) for more details. 12 | 13 | ![](../../docs/imgs/range_partition.png) 14 | 15 | 16 | # Running The Training Script 17 | 18 | On each machine, please run: 19 | 20 | python3 distribute_training.py \ 21 | --rootdir . \ 22 | --graph-path ./graph.dgl \ 23 | --feature-partition-path ./feature_part.pt \ 24 | --server_world_size 2 \ 25 | --server_rank 0 26 | 27 | Remember to: 28 | 29 | - Set the SHM size limit as large as your physical memory size. You can set it with: 30 | 31 | sudo mount -o remount,size=300G /dev/shm 32 | 33 | - Set `MASTER_IP` to your master node's IP 34 | 35 | 36 | The validation accuracy is 0.680. We do not have ground-truth test labels, so we do not report test accuracy. 37 | 38 | # Performance 39 | 40 | With 2 machines and 1 GPU per machine, each epoch needs 2 minutes 10 seconds for training and 15 seconds for validation. This is 3x faster than [DGL's performance result](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M). 41 | 42 | From the logs we can see that most of the training time of each iteration is spent on model computation. 43 | 44 | Avg_Sample: 0.0051s, Avg_Feature: 0.0176s, Avg_Model: 0.1801s, Avg_Feature_BandWidth = 14588.4937 MB/s 45 | 46 | # Hardware Configurations 47 | 48 | We have 2 machines, each with 377GB of memory, connected by 100Gbps IB. Running the training script consumes around 256GB of memory.
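For orientation, here is a minimal sketch of how the pieces of this example fit together, based on the constructor signatures in `quiver_feature` (`DistHelper`, `Range`, `PipeParam`, `DistTensorPGAS`). The IP, ports, and tensor sizes below are made-up placeholders, not the values used by this example.

```python
from quiver_feature import DistHelper, DistTensorPGAS, PipeParam, Range

MASTER_IP = "127.0.0.1"                    # placeholder: your master node's IP
SERVER_WORLD_SIZE, SERVER_RANK = 2, 0
ROWS_PER_SERVER, FEATURE_DIM = 1000, 768   # placeholder sizes

# Exchange each server's tensor range through the TCPStore on the master.
helper = DistHelper(MASTER_IP, 5678, SERVER_WORLD_SIZE, SERVER_RANK)
local_range = Range(start=SERVER_RANK * ROWS_PER_SERVER,
                    end=(SERVER_RANK + 1) * ROWS_PER_SERVER)
endpoints = helper.exchange_tensor_endpoints_info(local_range)

# qp_num, ctx_poll_batch, tx_depth, post_list_size (see config.py)
pipe_param = PipeParam(8, 16, 2048, 128)
dist_tensor = DistTensorPGAS(SERVER_RANK, endpoints, pipe_param,
                             buffer_tensor_shape=[1024, FEATURE_DIM])
```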
49 | -------------------------------------------------------------------------------- /examples/mag240m/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "155.198.152.17" 3 | #MASTER_IP = "127.0.0.1" 4 | HLPER_PORT = 5678 5 | NODE_COUNT = 1200000 6 | FEATURE_DIM = 128 7 | FEATURE_TYPE_SIZE = 4 8 | SAMPLE_NUM = 80000 9 | ITER_NUM = 10 10 | POST_LIST_SIZE = 128 11 | QP_NUM = 8 12 | TX_DEPTH = 2048 13 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 14 | TEST_TLB_OPTIMIZATION = True 15 | 16 | # For MAG240M Training 17 | SAMPLE_PARAM = [15, 25] 18 | BATCH_SIZE = 1024 19 | -------------------------------------------------------------------------------- /examples/mag240m/preprocess.py: -------------------------------------------------------------------------------- 1 | import ogb 2 | from ogb.lsc import MAG240MDataset 3 | import tqdm 4 | import numpy as np 5 | import torch 6 | import dgl 7 | import dgl.function as fn 8 | import argparse 9 | import os 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--rootdir', type=str, default='.', help='Directory to download the OGB dataset.') 13 | parser.add_argument('--author-output-path', type=str, help='Path to store the author features.') 14 | parser.add_argument('--inst-output-path', type=str, 15 | help='Path to store the institution features.') 16 | parser.add_argument('--graph-output-path', type=str, help='Path to store the graph.') 17 | parser.add_argument('--graph-format', type=str, default='csc', help='Graph format (coo, csr or csc).') 18 | parser.add_argument('--graph-as-homogeneous', action='store_true', help='Store the graph as DGL homogeneous graph.') 19 | parser.add_argument('--full-output-path', type=str, 20 | help='Path to store features of all nodes. Effective only when graph is homogeneous.') 21 | args = parser.parse_args() 22 | 23 | print('Building graph') 24 | dataset = MAG240MDataset(root=args.rootdir) 25 | ei_writes = dataset.edge_index('author', 'writes', 'paper') 26 | ei_cites = dataset.edge_index('paper', 'paper') 27 | ei_affiliated = dataset.edge_index('author', 'institution') 28 | 29 | # We sort the nodes starting with the papers, then the authors, then the institutions. 30 | author_offset = 0 31 | inst_offset = author_offset + dataset.num_authors 32 | paper_offset = inst_offset + dataset.num_institutions 33 | 34 | g = dgl.heterograph({ 35 | ('author', 'write', 'paper'): (ei_writes[0], ei_writes[1]), 36 | ('paper', 'write-by', 'author'): (ei_writes[1], ei_writes[0]), 37 | ('author', 'affiliate-with', 'institution'): (ei_affiliated[0], ei_affiliated[1]), 38 | ('institution', 'affiliate', 'author'): (ei_affiliated[1], ei_affiliated[0]), 39 | ('paper', 'cite', 'paper'): (np.concatenate([ei_cites[0], ei_cites[1]]), np.concatenate([ei_cites[1], ei_cites[0]])) 40 | }) 41 | 42 | paper_feat = dataset.paper_feat 43 | author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features)) 44 | inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features)) 45 | 46 | # Iteratively process author features along the feature dimension. 
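# The column-blocked loop below bounds peak memory: for BLOCK_COLS feature
# columns at a time, author features are computed as the mean over the papers
# each author writes (via the reverse 'write-by' edges), and institution
# features as the mean over their affiliated authors (via 'affiliate-with').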
47 | BLOCK_COLS = 16 48 | with tqdm.trange(0, dataset.num_paper_features, BLOCK_COLS) as tq: 49 | for start in tq: 50 | tq.set_postfix_str('Reading paper features...') 51 | g.nodes['paper'].data['x'] = torch.FloatTensor(paper_feat[:, start:start + BLOCK_COLS].astype('float32')) 52 | # Compute author features... 53 | tq.set_postfix_str('Computing author features...') 54 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='write-by') 55 | # Then institution features... 56 | tq.set_postfix_str('Computing institution features...') 57 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='affiliate-with') 58 | tq.set_postfix_str('Writing author features...') 59 | author_feat[:, start:start + BLOCK_COLS] = g.nodes['author'].data['x'].numpy().astype('float16') 60 | tq.set_postfix_str('Writing institution features...') 61 | inst_feat[:, start:start + BLOCK_COLS] = g.nodes['institution'].data['x'].numpy().astype('float16') 62 | del g.nodes['paper'].data['x'] 63 | del g.nodes['author'].data['x'] 64 | del g.nodes['institution'].data['x'] 65 | author_feat.flush() 66 | inst_feat.flush() 67 | 68 | # Convert to homogeneous if needed. (The RGAT baseline needs homogeneous graph) 69 | if args.graph_as_homogeneous: 70 | # Process graph 71 | g = dgl.to_homogeneous(g) 72 | # DGL ensures that nodes with the same type are put together with the order preserved. 73 | # DGL also ensures that the node types are sorted in ascending order. 74 | assert torch.equal( 75 | g.ndata[dgl.NTYPE], 76 | torch.cat([torch.full((dataset.num_authors,), 0), 77 | torch.full((dataset.num_institutions,), 1), 78 | torch.full((dataset.num_papers,), 2)])) 79 | assert torch.equal( 80 | g.ndata[dgl.NID], 81 | torch.cat([torch.arange(dataset.num_authors), 82 | torch.arange(dataset.num_institutions), 83 | torch.arange(dataset.num_papers)])) 84 | g.edata['etype'] = g.edata[dgl.ETYPE].byte() 85 | del g.edata[dgl.ETYPE] 86 | del g.ndata[dgl.NTYPE] 87 | del g.ndata[dgl.NID] 88 | 89 | # Process feature 90 | full_feat = np.memmap( 91 | args.full_output_path, mode='w+', dtype='float16', 92 | shape=(dataset.num_authors + dataset.num_institutions + dataset.num_papers, dataset.num_paper_features)) 93 | BLOCK_ROWS = 100000 94 | for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS): 95 | end = min(dataset.num_authors, start + BLOCK_ROWS) 96 | full_feat[author_offset + start:author_offset + end] = author_feat[start:end] 97 | for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS): 98 | end = min(dataset.num_institutions, start + BLOCK_ROWS) 99 | full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end] 100 | for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS): 101 | end = min(dataset.num_papers, start + BLOCK_ROWS) 102 | full_feat[paper_offset + start:paper_offset + end] = paper_feat[start:end] 103 | 104 | # Convert the graph to the given format and save. 
(The RGAT baseline needs CSC graph) 105 | g = g.formats(args.graph_format) 106 | dgl.save_graphs(args.graph_output_path, g) -------------------------------------------------------------------------------- /examples/mag240m/preprocess_quiver.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | meta = torch.load("/data/mag/mag240m_kddcup2021/meta.pt") 6 | 7 | print("Dataset Loading Finished") 8 | 9 | paper_offset = meta["author"] + meta["institution"] 10 | num_nodes = paper_offset + meta["paper"] 11 | num_features = 768 12 | 13 | feats = np.memmap("/data/dalong/full.npy", mode='r', dtype='float16', shape=(num_nodes, num_features)) 14 | 15 | print("Paper Loading Finished") 16 | 17 | print("Creating Float16 Tensor") 18 | tensor_feature = torch.HalfTensor(feats[num_nodes//2: ]) 19 | 20 | torch.save(tensor_feature, "/data/dalong/second_half.pt") 21 | 22 | 23 | -------------------------------------------------------------------------------- /examples/ogb-products/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For OGB-Products Training 16 | SAMPLE_PARAM = [15, 10, 5] 17 | BATCH_SIZE = 1024 18 | -------------------------------------------------------------------------------- /examples/reddit/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For Reddit Training 16 | SAMPLE_PARAM = [25, 10] 17 | BATCH_SIZE = 256 18 | -------------------------------------------------------------------------------- /quiver_feature/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .
--------------------------------------------------------------------------------
/quiver_feature/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from . import multiprocessing
3 | from .dist_tensor_rpc import DistTensorRPC
4 | from .common import Range, TensorEndPoint, DistTensorDeviceParam, DistTensorServerParam
5 | from .dist_tensor_pgas import DistTensor as DistTensorPGAS
6 | from .dist_helper import DistHelper
7 | from .local_tensor_pgas import LocalTensorPGAS
8 | from .tensor_loader import shared_load
9 | from .utils import serve_tensor_for_remote_access
10 | from qvf import PipeParam, DistTensorServer
11 |
12 | __all__ = ["DistTensorRPC", "DistTensorPGAS", "LocalTensorPGAS", "Range", "TensorEndPoint", "DistHelper",
13 |            "shared_load", "PipeParam", "DistTensorServer", "serve_tensor_for_remote_access", "DistTensorServerParam", "DistTensorDeviceParam"]
14 |
--------------------------------------------------------------------------------
/quiver_feature/common.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | Range = namedtuple("Range", ["start", "end"])
3 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])
4 | DistTensorServerParam = namedtuple("DistTensorServerParam", ["port_num", "server_world_size", "device_per_server"])
5 | DistTensorServerParam.__new__.__defaults__ = (3344, 1, 1)
6 | DistTensorDeviceParam = namedtuple("DistTensorDeviceParam", ["device_list", "device_cache_size", "cache_policy"])
7 | DistTensorDeviceParam.__new__.__defaults__ = ([], 0, "device_replicate")
--------------------------------------------------------------------------------
/quiver_feature/dist_helper.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as torch_dist
2 | import socket
3 | import pickle
4 | from datetime import timedelta
5 | from .common import TensorEndPoint, Range
6 |
7 | def resolve_my_ip():
8 |     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
9 |     s.connect(("8.8.8.8", 80))
10 |     my_ip = s.getsockname()[0]
11 |     return my_ip
12 |
13 | class DistHelper:
14 |     def __init__(self, master_ip: str, master_port: int, world_size: int, my_rank: int):
15 |         self.tcp_store = torch_dist.TCPStore(master_ip, master_port, world_size, my_rank == 0, wait_for_workers=True, multi_tenant=True)
16 |         self.my_server_rank = my_rank
17 |         self.server_world_size = world_size
18 |         self.sync_point = 0
19 |
20 |     def exchange_tensor_endpoints_info(self, local_tensor_range: Range, dist_tensor_server_port=3344):
21 |         my_ip = resolve_my_ip()
22 |
23 |         local_tensor_endpoint = TensorEndPoint(server_rank=self.my_server_rank, ip=my_ip, port=dist_tensor_server_port, range=local_tensor_range)
24 |         pickled_data = pickle.dumps(local_tensor_endpoint)
25 |         self.tcp_store.set(f"worker{self.my_server_rank}_data", pickled_data)
26 |
27 |
28 |         tensor_endpoints = [0] * self.server_world_size
29 |         tensor_endpoints[self.my_server_rank] = local_tensor_endpoint
30 |         for rank in range(self.server_world_size):
31 |             if rank != self.my_server_rank:
32 |                 tensor_endpoints[rank] = pickle.loads(self.tcp_store.get(f"worker{rank}_data"))
33 |
34 |         self.tcp_store.set(f"worker{self.my_server_rank}_status", "DONE")
35 |
36 |         keys = [f"worker{rank}_status" for rank in range(self.server_world_size)]
37 |         if self.my_server_rank == 0:
38 |             while True:
39 |                 try:
40 |                     self.tcp_store.wait(keys, timedelta(seconds=1))
41 |                     break
42 |                 except Exception:
43 |                     pass
44 |
45 |
46 |         return tensor_endpoints
47 |
48 |     def sync_all(self):
49 |         self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", "SYNC1")
50 |
51 |         keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)]
[f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)] 52 | while True: 53 | try: 54 | self.tcp_store.wait(keys, timedelta(seconds=1)) 55 | break 56 | except: 57 | pass 58 | 59 | 60 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1") 61 | 62 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)] 63 | if self.my_server_rank == 0: 64 | while True: 65 | try: 66 | self.tcp_store.wait(keys, timedelta(seconds=1)) 67 | break 68 | except: 69 | pass 70 | 71 | 72 | # TODO Delete Keys 73 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}") 74 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}") 75 | self.sync_point += 1 76 | 77 | def sync_start(self): 78 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", f"SYNC") 79 | 80 | def sync_end(self): 81 | 82 | 83 | keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)] 84 | while True: 85 | try: 86 | self.tcp_store.wait(keys, timedelta(seconds=1)) 87 | break 88 | except: 89 | pass 90 | 91 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1") 92 | 93 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)] 94 | if self.my_server_rank == 0: 95 | while True: 96 | try: 97 | self.tcp_store.wait(keys, timedelta(seconds=1)) 98 | break 99 | except: 100 | pass 101 | # TODO Delete Keys 102 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}") 103 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}") 104 | self.sync_point += 1 105 | -------------------------------------------------------------------------------- /quiver_feature/dist_tensor_pgas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | from typing import List 4 | from .common import Range, TensorEndPoint, DistTensorServerParam, DistTensorDeviceParam 5 | from .dist_helper import DistHelper 6 | from .local_tensor_pgas import LocalTensorPGAS 7 | from .utils import serve_tensor_for_remote_access 8 | 9 | FloatType = [torch.float32, torch.float64, torch.float16, torch.bfloat16] 10 | IntType = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] 11 | 12 | class DistTensor: 13 | def __init__(self, server_rank, tensor_endpoints: List[TensorEndPoint], pipe_param: qvf.PipeParam, buffer_tensor_shape, cached_range: Range= Range(start=0, end=0), order_transform:torch.Tensor=None, dtype=torch.float32)-> None: 14 | 15 | # About DistTensorClient 16 | self.server_rank = server_rank 17 | self.world_size = len(tensor_endpoints) 18 | self.tensor_endpoints = sorted(tensor_endpoints, key= lambda x: x.server_rank) 19 | self.buffer_tensor_shape = buffer_tensor_shape 20 | self.pipe_param = pipe_param 21 | self.com_endpoints = [qvf.ComEndPoint(item.server_rank, item.ip, item.port) for item in tensor_endpoints] 22 | 23 | self.data_type = dtype 24 | 25 | # About Lazy Init 26 | self.inited = False 27 | 28 | # About ShardTensor 29 | self.local_tensor_pgas = None 30 | self.cached_range = cached_range 31 | self.device_rank = -1 32 | self.order_transform = order_transform 33 | 34 | @property 35 | def dtype(self): 36 | return self.data_type 37 | 38 | def lazy_init(self): 39 | if self.inited: 40 | return 41 | self.inited = True 42 | 43 | self.device_rank = 
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_pgas.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import qvf
3 | from typing import List
4 | from .common import Range, TensorEndPoint, DistTensorServerParam, DistTensorDeviceParam
5 | from .dist_helper import DistHelper
6 | from .local_tensor_pgas import LocalTensorPGAS
7 | from .utils import serve_tensor_for_remote_access
8 |
9 | FloatType = [torch.float32, torch.float64, torch.float16, torch.bfloat16]
10 | IntType = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
11 |
12 | class DistTensor:
13 |     def __init__(self, server_rank, tensor_endpoints: List[TensorEndPoint], pipe_param: qvf.PipeParam, buffer_tensor_shape, cached_range: Range = Range(start=0, end=0), order_transform: torch.Tensor = None, dtype=torch.float32) -> None:
14 |
15 |         # About DistTensorClient
16 |         self.server_rank = server_rank
17 |         self.world_size = len(tensor_endpoints)
18 |         self.tensor_endpoints = sorted(tensor_endpoints, key=lambda x: x.server_rank)
19 |         self.buffer_tensor_shape = buffer_tensor_shape
20 |         self.pipe_param = pipe_param
21 |         self.com_endpoints = [qvf.ComEndPoint(item.server_rank, item.ip, item.port) for item in tensor_endpoints]
22 |
23 |         self.data_type = dtype
24 |
25 |         # About Lazy Init
26 |         self.inited = False
27 |
28 |         # About ShardTensor
29 |         self.local_tensor_pgas = None
30 |         self.cached_range = cached_range
31 |         self.device_rank = -1
32 |         self.order_transform = order_transform
33 |
34 |     @property
35 |     def dtype(self):
36 |         return self.data_type
37 |
38 |     def lazy_init(self):
39 |         if self.inited:
40 |             return
41 |         self.inited = True
42 |
43 |         self.device_rank = torch.cuda.current_device()
44 |
45 |         # Create DistTensorClient
46 |         self.dist_tensor_client = qvf.DistTensorClient(self.server_rank, self.com_endpoints, self.pipe_param)
47 |         self.registered_tensor = torch.zeros(self.buffer_tensor_shape, dtype=self.dtype).pin_memory()
48 |         self.dist_tensor_client.register_float_tensor(self.registered_tensor)
49 |
50 |         if self.order_transform is not None:
51 |             self.order_transform = self.order_transform.to(self.device_rank)
52 |
53 |
54 |     def from_cpu_tensor(self, cpu_tensor, dist_helper: DistHelper, server_param: DistTensorServerParam = None, device_param: DistTensorDeviceParam = None):
55 |
56 |         self.data_type = cpu_tensor.dtype
57 |
58 |         server_param: DistTensorServerParam = server_param or DistTensorServerParam()
59 |         device_param: DistTensorDeviceParam = device_param or DistTensorDeviceParam()
60 |
61 |         cpu_tensor.share_memory_()
62 |
63 |         # Start Server
64 |         serve_tensor_for_remote_access(server_param.port_num, self.pipe_param.get_param_vec()[0], server_param.server_world_size, server_param.device_per_server, cpu_tensor, dist_helper)
65 |
66 |         # Build Local Tensor
67 |         self.local_tensor_pgas = LocalTensorPGAS(device_param.device_list, device_param.device_cache_size, device_param.cache_policy)
68 |         self.local_tensor_pgas.from_cpu_tensor(cpu_tensor)
69 |
70 |
71 |     def to(self, device_rank):
72 |         self.device_rank = device_rank
73 |         if self.order_transform is not None:
74 |             self.order_transform = self.order_transform.to(device_rank)
75 |
76 |         return self
77 |
78 |     def size(self, dim):
79 |         assert dim < 2, "DistTensorPGAS is 2-dimensional"
80 |         if dim == 1:
81 |             return self.buffer_tensor_shape[1]
82 |         if dim == 0:
83 |             all_ends = [item.range.end for item in self.tensor_endpoints]
84 |             all_ends.sort()
85 |             return all_ends[-1]
86 |
87 |     @property
88 |     def shape(self):
89 |         return [self.size(0), self.size(1)]
90 |
91 |     def collect(self, nodes):
92 |         nodes -= self.tensor_endpoints[self.server_rank].range.start
93 |         nodes += self.cached_range.end
94 |         data = self.local_tensor_pgas[nodes]
95 |         return data
96 |
97 |     def collect_cached_data(self, nodes):
98 |         data = self.local_tensor_pgas[nodes]
99 |         return data
100 |
101 |     def cal_remote_offsets(self, nodes, server_rank):
102 |         remote_offsets = (nodes - self.tensor_endpoints[server_rank].range.start + self.cached_range.end) * self.buffer_tensor_shape[1] * self.registered_tensor.element_size()
103 |         return remote_offsets
104 |
105 |     def __getitem__(self, nodes):
106 |
107 |         self.lazy_init()
108 |         nodes = nodes.cuda()
109 |         if self.order_transform is not None:
110 |             nodes = self.order_transform[nodes]
111 |
112 |         input_orders = torch.arange(nodes.size(0), dtype=torch.long, device=nodes.device)
113 |
114 |         feature = torch.empty(nodes.shape[0], self.shape[1], device=nodes.device, dtype=self.dtype)
115 |
116 |         cache_nodes_mask = None
117 |         local_nodes_mask = None
118 |
119 |
120 |         # Load cache data
121 |         if self.cached_range.end > 0:
122 |             cache_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
123 |             cache_request_nodes = torch.masked_select(nodes, cache_nodes_mask)
124 |             cache_part_orders = torch.masked_select(input_orders, cache_nodes_mask)
125 |             if cache_request_nodes.shape[0] > 0:
126 |                 feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes)
127 |
128 |
129 |
130 |
131 |         # Load local data
132 |         range_item = self.tensor_endpoints[self.server_rank].range
133 |         local_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
134 |         local_request_nodes = torch.masked_select(nodes, local_nodes_mask)
135 |         local_part_orders = torch.masked_select(input_orders, local_nodes_mask)
136 |         if local_request_nodes.shape[0] > 0:
137 |             feature[local_part_orders] = self.collect(local_request_nodes)
138 |
139 |
140 |         # Collect Remote Data
141 |         if cache_nodes_mask is None:
142 |             all_remote_nodes_mask = torch.logical_not(local_nodes_mask)
143 |         else:
144 |             all_remote_nodes_mask = torch.logical_not(torch.logical_or(local_nodes_mask, cache_nodes_mask))
145 |
146 |         all_remote_nodes = torch.masked_select(nodes, all_remote_nodes_mask)
147 |         all_remote_orders = torch.masked_select(input_orders, all_remote_nodes_mask)
148 |
149 |         assert all_remote_nodes.shape[0] <= self.registered_tensor.shape[0], "Collected Data Exceeds Buffer Size"
150 |
151 |         for server_rank in range(self.world_size):
152 |
153 |             range_item = self.tensor_endpoints[server_rank].range
154 |             if server_rank != self.server_rank:
155 |                 request_nodes_mask = (all_remote_nodes >= range_item.start) & (all_remote_nodes < range_item.end)
156 |                 request_nodes = torch.masked_select(all_remote_nodes, request_nodes_mask)
157 |                 if request_nodes.shape[0] > 0:
158 |                     local_orders = torch.masked_select(input_orders[:all_remote_nodes.shape[0]], request_nodes_mask)
159 |                     local_offsets = local_orders * self.registered_tensor.shape[1] * self.registered_tensor.element_size()
160 |                     remote_offsets = self.cal_remote_offsets(request_nodes, server_rank)
161 |                     self.dist_tensor_client.sync_read(server_rank, self.registered_tensor, local_offsets.cpu(), remote_offsets.cpu())
162 |
163 |         feature[all_remote_orders] = self.registered_tensor[:all_remote_nodes.shape[0]].to(self.device_rank)
164 |         return feature
165 |
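With endpoints exchanged and a pipe configured, the client-side pieces compose as below. This is a sketch under assumptions: cpu_feature, the buffer shape, and the node-id batch are placeholders, while dist_helper, tensor_endpoints, and pipe_param are taken from the earlier sketches:

    import torch
    from quiver_feature import DistTensorPGAS, Range

    dist_tensor = DistTensorPGAS(my_rank, tensor_endpoints, pipe_param,
                                 buffer_tensor_shape=[80000, 128],
                                 cached_range=Range(start=0, end=0))
    # Serves the local shard for remote readers and builds the local tensor.
    dist_tensor.from_cpu_tensor(cpu_feature, dist_helper)

    nodes = torch.randint(0, dist_tensor.size(0), (1024,), device="cuda")
    feature = dist_tensor[nodes]  # cached/local rows come from LocalTensorPGAS,
                                  # remote rows arrive via one-sided RDMA reads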
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_rpc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed.rpc as rpc
3 | from typing import List
4 | from .common import Range
5 |
6 | class Task:
7 |     def __init__(self, prev_order, fut):
8 |         self.prev_order_ = prev_order
9 |         self.fut_ = fut
10 |         self.data_ = None
11 |
12 |     def wait(self):
13 |         self.data_ = self.fut_.wait()
14 |
15 |     @property
16 |     def data(self):
17 |         return self.data_
18 |
19 |     @property
20 |     def prev_order(self):
21 |         return self.prev_order_
22 |
23 | class Singleton(object):
24 |     def __init__(self, cls):
25 |         self._cls = cls
26 |         self._instance = {}
27 |     def __call__(self, *args, **kwargs):
28 |         if self._cls not in self._instance:
29 |             self._instance[self._cls] = self._cls()
30 |         self._instance[self._cls].init(*args, **kwargs)
31 |         return self._instance[self._cls]
32 |
33 |
34 | def collect(nodes):
35 |     dist_tensor = DistTensorRPC()
36 |     return dist_tensor.collect(nodes)
37 |
38 |
39 | @Singleton
40 | class DistTensorRPC(object):
41 |
42 |     def __init__(self):
43 |         pass
44 |
45 |     def init(self, world_size, rank, local_size, local_rank, shard_tensor, range_list: List[Range], rpc_option, cached_range=Range(start=0, end=0), order_transform=None, **debug_params) -> None:
46 |         self.shard_tensor = shard_tensor
47 |         self.range_list = range_list
48 |         self.cached_range = cached_range
49 |         self.order_transform = None
50 |         if order_transform is not None:
51 |             self.order_transform = order_transform.to(local_rank)
52 |         self.rank = rank
53 |         self.local_rank = local_rank
54 |         self.world_size = world_size
55 |         self.local_size = local_size
56 |         self.debug_params = debug_params
57 |
58 |         rpc.init_rpc(f"worker{rank}", rank=self.rank, world_size=world_size, rpc_backend_options=rpc_option)
59 |
60 |     def collect(self, nodes):
61 |
62 |         # TODO Just For Debugging
63 |         if nodes.is_cuda:
64 |             torch.cuda.set_device(self.local_rank)
65 |         nodes -= self.range_list[self.rank].start
66 |         nodes += self.cached_range.end
67 |         data = self.shard_tensor[nodes]
68 |
69 |         return data
70 |
71 |     def collect_cached_data(self, nodes):
72 |         # TODO Just For Debugging
73 |         if nodes.is_cuda:
74 |             torch.cuda.set_device(self.local_rank)
75 |         data = self.shard_tensor[nodes]
76 |
77 |         return data
78 |
79 |     def __getitem__(self, nodes):
80 |
81 |         task_list: List[Task] = []
82 |         if self.order_transform is not None:
83 |             nodes = self.order_transform[nodes]
84 |         input_orders = torch.arange(nodes.size(0), dtype=torch.long, device=nodes.device)
85 |
86 |         remote_collect = 0
87 |         for worker_id in range(self.local_rank, self.world_size, self.local_size):
88 |             range_item = self.range_list[worker_id]
89 |             if worker_id != self.rank:
90 |                 request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
91 |                 request_nodes = torch.masked_select(nodes, request_nodes_mask)
92 |                 if request_nodes.shape[0] > 0:
93 |                     remote_collect += request_nodes.shape[0]
94 |                     part_orders = torch.masked_select(input_orders, request_nodes_mask)
95 |                     fut = rpc.rpc_async(f"worker{worker_id}", collect, args=(request_nodes,))
96 |                     task_list.append(Task(part_orders, fut))
97 |
98 |         feature = torch.zeros(nodes.shape[0], self.shard_tensor.shape[1], device=f"cuda:{self.local_rank}")
99 |
100 |         # Load Cached Data
101 |         if self.cached_range.end > 0:
102 |             request_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
103 |             cache_request_nodes = torch.masked_select(nodes, request_nodes_mask)
104 |             cache_part_orders = torch.masked_select(input_orders, request_nodes_mask)
105 |             if cache_request_nodes.shape[0] > 0:
106 |                 feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes).to(self.local_rank)
107 |
108 |
109 |         # Load local data
110 |         range_item = self.range_list[self.rank]
111 |         request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
112 |         local_request_nodes = torch.masked_select(nodes, request_nodes_mask)
113 |         local_part_orders = torch.masked_select(input_orders, request_nodes_mask)
114 |         if local_request_nodes.shape[0] > 0:
115 |             feature[local_part_orders] = self.collect(local_request_nodes).to(self.local_rank)
116 |
117 |         for task in task_list:
118 |             task.wait()
119 |             feature[task.prev_order] = task.data.to(self.local_rank)
120 |         return feature
121 |
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .reductions import init_reductions
2 |
3 | init_reductions()
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/reductions.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.reduction import ForkingPickler
2 | import qvf
3 | from ..local_tensor_pgas import LocalTensorPGAS
4 |
5 | def rebuild_qvf_pipeparam(ipc_handle):
6 |
7 |     pipe_param = qvf.PipeParam()
8 |     pipe_param.set_param_vec(ipc_handle)
9 |     return pipe_param
10 |
11 | def reduce_qvf_pipeparam(pipe_param):
12 |     param_vec = pipe_param.get_param_vec()
13 |     return (rebuild_qvf_pipeparam, (param_vec,))
14 |
15 |
16 | def rebuild_qvf_comendpoint(ipc_handle):
17 |
18 |     com_endpoint = qvf.ComEndPoint(ipc_handle[0], ipc_handle[1], ipc_handle[2])
19 |     return com_endpoint
20 |
21 | def reduce_qvf_comendpoint(com_endpoint):
22 |     param_vec = (com_endpoint.rank(), com_endpoint.address(), com_endpoint.port())
23 |     return (rebuild_qvf_comendpoint, (param_vec,))
24 |
25 | def rebuild_localtensorpgas(ipc_handle):
26 |
27 |     feature = LocalTensorPGAS.lazy_from_ipc_handle(ipc_handle)
28 |     return feature
29 |
30 |
31 | def reduce_localtensorpgas(feature):
32 |
33 |     ipc_handle = feature.share_ipc()
34 |     return (rebuild_localtensorpgas, (ipc_handle,))
35 |
36 | def init_reductions():
37 |     ForkingPickler.register(qvf.PipeParam, reduce_qvf_pipeparam)
38 |     ForkingPickler.register(qvf.ComEndPoint, reduce_qvf_comendpoint)
39 |     ForkingPickler.register(LocalTensorPGAS, reduce_localtensorpgas)
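These ForkingPickler hooks are what let the otherwise non-picklable qvf objects and LocalTensorPGAS cross process boundaries, so a DistTensorPGAS built in the parent process can be handed to trainer workers. A sketch under assumptions; train is a hypothetical worker function and nprocs is arbitrary:

    import torch.multiprocessing as mp
    import quiver_feature  # importing the package runs init_reductions()

    def train(rank, dist_tensor):
        ...  # sample n_id, then call dist_tensor[n_id] in the training loop

    # PipeParam / ComEndPoint / LocalTensorPGAS attributes are rebuilt in each
    # child from param vectors and IPC handles instead of being deep-copied.
    mp.spawn(train, args=(dist_tensor,), nprocs=4)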
--------------------------------------------------------------------------------
/quiver_feature/tensor_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import qvf
3 | import torch.serialization as se
4 | from torch.serialization import *
5 |
6 |
7 | class _open_zipfile_reader(torch.serialization._opener):
8 |     def __init__(self, name_or_buffer) -> None:
9 |         super(_open_zipfile_reader, self).__init__(qvf.SharedTensorLoader(name_or_buffer))
10 |
11 |
12 | def shared_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
13 |     se._check_dill_version(pickle_module)
14 |
15 |     if 'encoding' not in pickle_load_args.keys():
16 |         pickle_load_args['encoding'] = 'utf-8'
17 |
18 |     with se._open_file_like(f, 'rb') as opened_file:
19 |         if se._is_zipfile(opened_file):
20 |             # The zipfile reader is going to advance the current file position.
21 |             # If we want to actually tail call to torch.jit.load, we need to
22 |             # reset back to the original position.
23 |             orig_position = opened_file.tell()
24 |             with _open_zipfile_reader(opened_file) as opened_zipfile:
25 |                 if se._is_torchscript_zip(opened_zipfile):
26 |                     warnings.warn("'torch.load' received a zip file that looks like a TorchScript archive"
27 |                                   " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to"
28 |                                   " silence this warning)", UserWarning)
29 |                     opened_file.seek(orig_position)
30 |                     return torch.jit.load(opened_file)
31 |                 return se._load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
32 |         return se._legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
33 |
--------------------------------------------------------------------------------
/quiver_feature/utils.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from qvf import DistTensorServer
3 |
4 | def server_thread(port_number, qp_num, world_size, tensor, dist_helper):
5 |     dist_tensor_server = DistTensorServer(port_number, world_size, qp_num)
6 |     dist_tensor_server.serve_tensor(tensor)
7 |     dist_helper.sync_start()
8 |     dist_tensor_server.join()
9 |
10 | def serve_tensor_for_remote_access(port_number, qp_num, server_world_size, device_per_server, cpu_tensor, dist_helper):
11 |     server = threading.Thread(target=server_thread, args=(port_number, qp_num, server_world_size * device_per_server, cpu_tensor, dist_helper))
12 |     server.daemon = True
13 |     server.start()
14 |     dist_helper.sync_end()
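serve_tensor_for_remote_access is the standalone server-side entry point: it registers a shared CPU tensor with a DistTensorServer on a daemon thread, and the sync_start/sync_end key exchange above makes the call return only once every server in the group is serving. A short sketch, reusing dist_helper from the earlier snippet (the port and queue-pair count are illustrative):

    from quiver_feature import serve_tensor_for_remote_access

    cpu_tensor.share_memory_()
    serve_tensor_for_remote_access(3344, 8, server_world_size=2,
                                   device_per_server=1,
                                   cpu_tensor=cpu_tensor, dist_helper=dist_helper)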
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import os.path as osp
5 | from itertools import product
6 | from setuptools import setup, find_packages
7 | import platform
8 |
9 | import torch
10 | from torch.__config__ import parallel_info
11 | from torch.utils.cpp_extension import BuildExtension
12 | from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
13 | import torch.utils.cpp_extension as cpp_extension
14 |
15 | WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
16 | suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
17 | if os.getenv('FORCE_CUDA', '0') == '1':
18 |     suffices = ['cuda', 'cpu']
19 | if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
20 |     suffices = ['cuda']
21 | if os.getenv('FORCE_ONLY_CPU', '0') == '1':
22 |     suffices = ['cpu']
23 |
24 | BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
25 |
26 | WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1'
27 |
28 |
29 | def get_torch_includes():
30 |     lib_include = os.path.join(cpp_extension._TORCH_PATH, 'include')
31 |     paths = [
32 |         osp.join(lib_include, 'ATen'),
33 |         osp.join(lib_include, 'c10'),
34 |         osp.join(lib_include, 'caffe2'),
35 |     ]
36 |
37 |     return paths
38 |
39 |
40 | def get_extensions():
41 |     extensions = []
42 |     libraries = ['ibverbs']
43 |
44 |     extensions_dir = osp.join('csrc')
45 |
46 |     srcs = glob.glob(osp.join(extensions_dir, 'src', '*.cpp'))
47 |     srcs += glob.glob(osp.join(extensions_dir, 'src', '*.cu'))
48 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/core", '*.cpp'))
49 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/memory", '*.cpp'))
50 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/queues", '*.cpp'))
51 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/requests", '*.cpp'))
52 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/utils", '*.cpp'))
53 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "miniz", '*.c'))
54 |     includes = osp.join(extensions_dir, 'include/')
55 |
56 |     define_macros = [('WITH_PYTHON', None)]
57 |     extra_compile_args = {
58 |         'cxx': ['-O3', '-std=c++17'],
59 |         'nvcc': ['-O3', '--expt-extended-lambda', '-std=c++17']}
60 |     extra_link_args = [] if WITH_SYMBOLS else ['-s']
61 |
62 |     Extension = CUDAExtension
63 |     extension = Extension(
64 |         'qvf',
65 |         srcs,
66 |         include_dirs=[includes] + get_torch_includes(),
67 |         define_macros=define_macros,
68 |         extra_compile_args=extra_compile_args,
69 |         extra_link_args=extra_link_args,
70 |         libraries=libraries,
71 |     )
72 |     extensions += [extension]
73 |     return extensions
74 |
75 |
76 | install_requires = []
77 | setup_requires = []
78 | tests_require = ['pytest', 'pytest-runner', 'pytest-cov']
79 |
80 | setup(
81 |     name='quiver_feature',
82 |     version='0.0.1',
83 |     author='quiver-team',
84 |     author_email='',
85 |     url='https://github.com/quiver-team/quiver_feature',
86 |     description='A PyTorch library for distributed feature collection in graph learning',
87 |     keywords=['pytorch', 'sparse', 'graph'],
88 |     license='Apache',
89 |     python_requires='>=3.6',
90 |     install_requires=install_requires,
91 |     setup_requires=setup_requires,
92 |     tests_require=tests_require,
93 |     extras_require={'test': tests_require},
94 |     ext_modules=get_extensions() if not BUILD_DOCS else [],
95 |     cmdclass={
96 |         'build_ext':
97 |         BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
98 |     },
99 |     packages=find_packages(),
100 | )
101 |
--------------------------------------------------------------------------------
/tests/cpp/test_DistTensorClient.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #define PORT_NUMBER 3344
12 | #define SERVER_IP "155.198.152.17"
13 |
14 | #define NODE_COUNT 120000LL
15 | #define FEATURE_DIM 256LL
16 | #define FEATURE_TYPE_SIZE 4LL
17 | #define SAMPLE_NUM 80960LL
18 | #define TEST_COUNT 8192LL
19 | #define ITER_NUM 10LL
20 | #define POST_LIST_SIZE 16LL
21 | #define CQ_MOD 16LL
22 | #define QP_NUM 2LL
23 | #define TX_DEPTH 2048LL
24 | #define CTX_POLL_BATCH 16LL
25 |
26 | int min(int a, int b);
27 |
28 | void print_tensor_res(torch::Tensor& res_tensor) {
29 |   float* res = res_tensor.data_ptr<float>();
30 |   for (int col = 0; col < res_tensor.size(1); col++) {
31 |     std::cout << res[0 * res_tensor.size(1) + col] << " ";
32 |   }
33 |   std::cout << std::endl;
34 | }
35 | void check_tensor_res(torch::Tensor& res_tensor,
36 |                       torch::Tensor& remote_offsets) {
37 |   float* res = res_tensor.data_ptr<float>();
38 |   int stride = res_tensor.size(1);
39 |   int64_t* offsets = remote_offsets.data_ptr<int64_t>();
40 |   for (int row = 0; row < remote_offsets.size(0); row++) {
41 |     for (int col = 0; col < res_tensor.size(1); col++) {
42 |       float expected_value =
43 |           float(offsets[row]) / (FEATURE_DIM * FEATURE_TYPE_SIZE);
44 |       QUIVER_FEATURE_ASSERT(
45 |           res[row * stride + col] == expected_value,
46 |           "Result Check Failed At (%d, %d)!, Expected %f, Got %f\n", row, col,
47 |           expected_value, res[row * stride + col]);
48 |     }
49 |   }
50 |   printf("Result Check Passed, Congrats!\n");
51 | }
52 |
53 | void test_dist_tensor_client(int argc, char** argv) {
54 |   qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH,
55 |                             POST_LIST_SIZE);
56 |
57 |   qvf::ComEndPoint local_com_end_point(0, SERVER_IP, PORT_NUMBER);
58 |   qvf::ComEndPoint
remote_com_end_point(1, SERVER_IP, PORT_NUMBER); 59 | std::vector com_endpoints{local_com_end_point, 60 | remote_com_end_point}; 61 | qvf::DistTensorClient dist_tensor_client(0, com_endpoints, pipe_param); 62 | std::vector shape{SAMPLE_NUM, FEATURE_DIM}; 63 | 64 | torch::Tensor registered_tensor = 65 | dist_tensor_client.create_registered_float32_tensor(shape); 66 | 67 | std::vector local_offsets(SAMPLE_NUM); 68 | std::vector remote_offsets(SAMPLE_NUM); 69 | 70 | for (int index = 0; index < SAMPLE_NUM; index++) { 71 | local_offsets[index] = index * FEATURE_DIM * FEATURE_TYPE_SIZE; 72 | remote_offsets[index] = 73 | rand() % NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE; 74 | // remote_offsets[index] = FEATURE_DIM * FEATURE_TYPE_SIZE; 75 | } 76 | 77 | for (int index = 0; index < min(1, SAMPLE_NUM); index++) { 78 | std::cout << "Collect Node " 79 | << remote_offsets[index] / (FEATURE_DIM * FEATURE_TYPE_SIZE) 80 | << ": " << local_offsets[index] << "<-" << remote_offsets[index] 81 | << std::endl; 82 | } 83 | std::cout << std::endl; 84 | 85 | auto tensor_option = torch::TensorOptions().dtype(torch::kInt64); 86 | torch::Tensor local_offsets_tensor = 87 | torch::from_blob(&local_offsets[0], {SAMPLE_NUM}, tensor_option); 88 | torch::Tensor remote_offsets_tensor = 89 | torch::from_blob(&remote_offsets[0], {SAMPLE_NUM}, tensor_option); 90 | 91 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets_tensor, 92 | remote_offsets_tensor); 93 | // print_tensor_res(registered_tensor); 94 | check_tensor_res(registered_tensor, remote_offsets_tensor); 95 | } 96 | -------------------------------------------------------------------------------- /tests/cpp/test_DistTensorServer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define PORT_NUMBER 3344 5 | #define SERVER_IP "155.198.152.17" 6 | 7 | #define NODE_COUNT 120000LL 8 | #define FEATURE_DIM 256LL 9 | #define FEATURE_TYPE_SIZE 4LL 10 | #define TEST_COUNT 8192LL 11 | #define ITER_NUM 10LL 12 | #define POST_LIST_SIZE 16LL 13 | #define CQ_MOD 16LL 14 | #define QP_NUM 2LL 15 | #define TX_DEPTH 2048LL 16 | #define CTX_POLL_BATCH 16LL 17 | 18 | float* allocate_float_feature(bool set_value) { 19 | float* buffer = (float*)malloc(NODE_COUNT * FEATURE_DIM * sizeof(float)); 20 | float index = 0; 21 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 22 | for (int dim = 0; dim < FEATURE_DIM; dim++) { 23 | if (set_value) 24 | buffer[start * FEATURE_DIM + dim] = index; 25 | else 26 | buffer[start * FEATURE_DIM + dim] = 0; 27 | } 28 | index += 1; 29 | } 30 | return buffer; 31 | } 32 | 33 | void test_dist_tensor_server(int argc, char** argv) { 34 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH, 35 | POST_LIST_SIZE); 36 | qvf::DistTensorServer dist_tensor_server(PORT_NUMBER, 1, 1); 37 | float* server_data_buffer = allocate_float_feature(true); 38 | dist_tensor_server.serve(server_data_buffer, 39 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 40 | } 41 | -------------------------------------------------------------------------------- /tests/cpp/test_Pipe.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 
#include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #define PORT_NUMBER 3344 35 | #define SERVER_IP "155.198.152.17" 36 | 37 | #define NODE_COUNT 120000LL 38 | #define FEATURE_DIM 256LL 39 | #define FEATURE_TYPE_SIZE 4LL 40 | #define TEST_COUNT 8192LL 41 | #define ITER_NUM 10LL 42 | #define POST_LIST_SIZE 16LL 43 | #define CQ_MOD 16LL 44 | #define QP_NUM 2LL 45 | #define TX_DEPTH 2048LL 46 | #define CTX_POLL_BATCH 16LL 47 | 48 | int min(int a, int b) { 49 | if (a < b) { 50 | return a; 51 | } 52 | return b; 53 | } 54 | 55 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 56 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 57 | (start.tv_sec * 1000000L + start.tv_usec); 58 | } 59 | 60 | float* allocate_float_feature(bool set_value); 61 | 62 | bool mem_check(float* data_buffer) { 63 | float index = 0; 64 | bool have_valid_data = false; 65 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 66 | for (int dim = 0; dim < FEATURE_DIM; dim++) { 67 | if (data_buffer[start * FEATURE_DIM + dim] != 0) { 68 | have_valid_data = true; 69 | } 70 | } 71 | } 72 | QUIVER_FEATURE_ASSERT(have_valid_data == true, "No valid data is copied") 73 | 74 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 75 | float expected_value = 76 | (data_buffer[start * FEATURE_DIM] == 0) ? 0 : float(start); 77 | std::cout << data_buffer[start * FEATURE_DIM] << " "; 78 | for (u_int64_t dim = 0; dim < FEATURE_DIM; dim++) { 79 | QUIVER_FEATURE_ASSERT( 80 | data_buffer[start * FEATURE_DIM + dim] == expected_value, 81 | "Result Check Failed At (%lld, %lld)!, Expected %f, Got %f\n", start, 82 | dim, expected_value, data_buffer[start * FEATURE_DIM + dim]); 83 | } 84 | } 85 | return true; 86 | } 87 | 88 | void test_pipe(int argc, char** argv) { 89 | bool random = true; 90 | bool sort_index = false; 91 | 92 | while (argc > 1) { 93 | if (argv[1][0] == '-') { 94 | switch (argv[1][1]) { 95 | case 'l': { 96 | random = false; 97 | break; 98 | } 99 | case 't': { 100 | sort_index = true; 101 | break; 102 | } 103 | } 104 | } 105 | ++argv; 106 | --argc; 107 | } 108 | if (random) { 109 | printf("Test Random Data Access \n"); 110 | } else { 111 | printf("Test Sequential Data Access \n"); 112 | } 113 | if (sort_index) { 114 | printf("Test Data Access With TLB Optimization\n"); 115 | } 116 | 117 | std::vector qps; 118 | infinity::core::Context* context = new infinity::core::Context(); 119 | infinity::queues::QueuePairFactory* qpFactory = 120 | new infinity::queues::QueuePairFactory(context); 121 | 122 | qps.resize(QP_NUM); 123 | qvf::ComEndPoint endpoint(0, SERVER_IP, PORT_NUMBER); 124 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH, 125 | POST_LIST_SIZE); 126 | qvf::Pipe quiver_pipe(context, qpFactory, endpoint, pipe_param); 127 | quiver_pipe.connect(); 128 | 129 | printf("Creating buffers\n"); 130 | std::vector buffers; 131 | float* client_data_buffer = allocate_float_feature(false); 132 | infinity::memory::Buffer* buffer1Sided = new infinity::memory::Buffer( 133 | context, client_data_buffer, 134 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 135 | infinity::memory::Buffer* buffer2Sided = 136 | new infinity::memory::Buffer(context, 128 * sizeof(char)); 137 | 138 | printf("Reading content from remote buffer\n"); 139 | infinity::requests::RequestToken requestToken(context); 140 | 141 | printf("Start Real Test \n"); 142 | // auto start = std::chrono::system_clock::now(); 143 | struct timeval start, stop; 144 | 
uint64_t time_consumed = 0; 145 | std::vector local_offsets(TEST_COUNT * POST_LIST_SIZE); 146 | std::vector remote_offsets(TEST_COUNT * POST_LIST_SIZE); 147 | if (sort_index) { 148 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) { 149 | std::vector all_request_nodes(TEST_COUNT * POST_LIST_SIZE); 150 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) { 151 | all_request_nodes[i] = rand() % NODE_COUNT; 152 | } 153 | std::sort(all_request_nodes.begin(), all_request_nodes.end()); 154 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) { 155 | uint64_t remote_node_offset = 156 | all_request_nodes[i] * FEATURE_DIM * FEATURE_TYPE_SIZE; 157 | local_offsets[i] = remote_node_offset; 158 | remote_offsets[i] = remote_node_offset; 159 | } 160 | gettimeofday(&start, NULL); 161 | 162 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets, 163 | FEATURE_DIM * FEATURE_TYPE_SIZE); 164 | gettimeofday(&stop, NULL); 165 | time_consumed += timeDiff(stop, start); 166 | } 167 | } else { 168 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) { 169 | for (int k = 0; k < TEST_COUNT * POST_LIST_SIZE; k++) { 170 | int request_node = k % NODE_COUNT; 171 | if (random) { 172 | request_node = rand() % NODE_COUNT; 173 | } 174 | uint64_t remote_node_offset = 175 | request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 176 | local_offsets[k] = remote_node_offset; 177 | remote_offsets[k] = remote_node_offset; 178 | } 179 | gettimeofday(&start, NULL); 180 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets, 181 | FEATURE_DIM * FEATURE_TYPE_SIZE); 182 | gettimeofday(&stop, NULL); 183 | time_consumed += timeDiff(stop, start); 184 | } 185 | } 186 | 187 | printf("Avg Bandwidth is %f MB/s\n", 188 | (POST_LIST_SIZE * TEST_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE * 189 | ITER_NUM / (1024.0 * 1024.0)) / 190 | (((double)time_consumed) / 1000000L)); 191 | 192 | printf("Memory checking..., Please wait...\n"); 193 | if (!mem_check(client_data_buffer)) { 194 | fprintf(stderr, "Memory Check Failed, Benchmark Failed!\n"); 195 | } else { 196 | printf("Memory check success! 
Congrats!\n"); 197 | } 198 | 199 | delete buffer1Sided; 200 | delete buffer2Sided; 201 | 202 | for (int index = 0; index < QP_NUM; index++) { 203 | delete qps[index]; 204 | } 205 | delete qpFactory; 206 | delete context; 207 | } 208 | -------------------------------------------------------------------------------- /tests/cpp/test_main.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Usage: ./progam -s for server and ./program for client component 3 | 4 | #include 5 | #include 6 | void test_pipe(int argc, char** argv); 7 | void test_dist_tensor_server(int argc, char** argv); 8 | void test_dist_tensor_client(int argc, char** argv); 9 | int main(int argc, char** argv) { 10 | int test_case = 0; 11 | switch (argv[1][0]) { 12 | case '0': { 13 | test_case = 0; 14 | break; 15 | } 16 | case '1': { 17 | test_case = 1; 18 | break; 19 | } 20 | case '2': { 21 | test_case = 2; 22 | break; 23 | } 24 | } 25 | 26 | ++argv; 27 | --argc; 28 | 29 | if (test_case == 0) { 30 | printf("Testing Pipe ...\n"); 31 | test_pipe(argc, argv); 32 | } else if (test_case == 1) { 33 | printf("Testing DistTensorClient ...\n"); 34 | test_dist_tensor_client(argc, argv); 35 | } else if (test_case == 2) { 36 | printf("Testing DistTensorServer ...\n"); 37 | test_dist_tensor_server(argc, argv); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /tests/infinity/read-write-send.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define PORT_NUMBER 8011 22 | #define SERVER_IP "127.0.0.1" 23 | 24 | // Usage: ./progam -s for server and ./program for client component 25 | int main(int argc, char **argv) { 26 | 27 | bool isServer = false; 28 | 29 | while (argc > 1) { 30 | if (argv[1][0] == '-') { 31 | switch (argv[1][1]) { 32 | 33 | case 's': { 34 | isServer = true; 35 | break; 36 | } 37 | 38 | } 39 | } 40 | ++argv; 41 | --argc; 42 | } 43 | 44 | infinity::core::Context *context = new infinity::core::Context(); 45 | infinity::queues::QueuePairFactory *qpFactory = new infinity::queues::QueuePairFactory(context); 46 | infinity::queues::QueuePair *qp; 47 | 48 | if(isServer) { 49 | 50 | printf("Creating buffers to read from and write to\n"); 51 | infinity::memory::Buffer *bufferToReadWrite = new infinity::memory::Buffer(context, 128 * sizeof(char)); 52 | infinity::memory::RegionToken *bufferToken = bufferToReadWrite->createRegionToken(); 53 | 54 | printf("Creating buffers to receive a message\n"); 55 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 56 | context->postReceiveBuffer(bufferToReceive); 57 | 58 | printf("Setting up connection (blocking)\n"); 59 | qpFactory->bindToPort(PORT_NUMBER); 60 | qp = qpFactory->acceptIncomingConnection(bufferToken, sizeof(infinity::memory::RegionToken)); 61 | 62 | printf("Waiting for message (blocking)\n"); 63 | infinity::core::receive_element_t receiveElement; 64 | while(!context->receive(&receiveElement)); 65 | 66 | printf("Message received\n"); 67 | delete bufferToReadWrite; 68 | delete bufferToReceive; 69 | 70 | } else { 71 | 72 | printf("Connecting to remote node\n"); 73 | qp = 
qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 74 | infinity::memory::RegionToken *remoteBufferToken = (infinity::memory::RegionToken *) qp->getUserData(); 75 | 76 | 77 | printf("Creating buffers\n"); 78 | infinity::memory::Buffer *buffer1Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 79 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 80 | 81 | printf("Reading content from remote buffer\n"); 82 | infinity::requests::RequestToken requestToken(context); 83 | qp->read(buffer1Sided, remoteBufferToken, &requestToken); 84 | requestToken.waitUntilCompleted(); 85 | 86 | printf("Writing content to remote buffer\n"); 87 | qp->write(buffer1Sided, remoteBufferToken, &requestToken); 88 | requestToken.waitUntilCompleted(); 89 | 90 | printf("Sending message to remote host\n"); 91 | qp->send(buffer2Sided, &requestToken); 92 | requestToken.waitUntilCompleted(); 93 | 94 | delete buffer1Sided; 95 | delete buffer2Sided; 96 | 97 | } 98 | 99 | delete qp; 100 | delete qpFactory; 101 | delete context; 102 | 103 | return 0; 104 | 105 | } 106 | -------------------------------------------------------------------------------- /tests/infinity/send-performance.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Send Performance 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define PORT_NUMBER 8011 24 | #define SERVER_IP "192.0.0.1" 25 | #define BUFFER_COUNT 128 26 | #define MAX_BUFFER_SIZE 4096 27 | #define OPERATIONS_COUNT 1024 28 | 29 | uint64_t timeDiff(struct timeval stop, struct timeval start); 30 | 31 | // Usage: ./progam -s for server and ./program for client component 32 | int main(int argc, char **argv) { 33 | 34 | bool isServer = false; 35 | 36 | while (argc > 1) { 37 | if (argv[1][0] == '-') { 38 | switch (argv[1][1]) { 39 | 40 | case 's': { 41 | isServer = true; 42 | break; 43 | } 44 | 45 | } 46 | } 47 | ++argv; 48 | --argc; 49 | } 50 | 51 | infinity::core::Context *context = new infinity::core::Context(); 52 | infinity::queues::QueuePairFactory *qpFactory = new infinity::queues::QueuePairFactory(context); 53 | infinity::queues::QueuePair *qp; 54 | 55 | if (isServer) { 56 | 57 | printf("Creating buffers to receive a messages\n"); 58 | infinity::memory::Buffer **receiveBuffers = new infinity::memory::Buffer *[BUFFER_COUNT]; 59 | for (uint32_t i = 0; i < BUFFER_COUNT; ++i) { 60 | receiveBuffers[i] = new infinity::memory::Buffer(context, MAX_BUFFER_SIZE * sizeof(char)); 61 | context->postReceiveBuffer(receiveBuffers[i]); 62 | } 63 | 64 | printf("Waiting for incoming connection\n"); 65 | qpFactory->bindToPort(PORT_NUMBER); 66 | qp = qpFactory->acceptIncomingConnection(); 67 | 68 | printf("Waiting for first message (first message has additional setup costs)\n"); 69 | infinity::core::receive_element_t receiveElement; 70 | while (!context->receive(&receiveElement)); 71 | context->postReceiveBuffer(receiveElement.buffer); 72 | 73 | printf("Performing measurement\n"); 74 | 75 | uint32_t messageSize = 1; 76 | uint32_t rounds = (uint32_t) log2(MAX_BUFFER_SIZE); 77 | 78 | for(uint32_t sizeIndex = 0; sizeIndex <= rounds; ++sizeIndex) { 79 | 80 | printf("Receiving messages of size %d bytes\n", messageSize); 81 | fflush(stdout); 82 | 83 | uint32_t 
numberOfReceivedMessages = 0; 84 | while (numberOfReceivedMessages < OPERATIONS_COUNT) { 85 | while (!context->receive(&receiveElement)); 86 | ++numberOfReceivedMessages; 87 | context->postReceiveBuffer(receiveElement.buffer); 88 | } 89 | 90 | messageSize *= 2; 91 | } 92 | 93 | printf("All messages received\n"); 94 | 95 | printf("Sending notification to client\n"); 96 | infinity::memory::Buffer *sendBuffer = new infinity::memory::Buffer(context, sizeof(char)); 97 | qp->send(sendBuffer, context->defaultRequestToken); 98 | context->defaultRequestToken->waitUntilCompleted(); 99 | 100 | printf("Clean up\n"); 101 | for (uint32_t i = 0; i < BUFFER_COUNT; ++i) { 102 | delete receiveBuffers[i]; 103 | } 104 | delete receiveBuffers; 105 | delete sendBuffer; 106 | 107 | } else { 108 | 109 | printf("Connecting to remote node\n"); 110 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 111 | 112 | printf("Creating buffers\n"); 113 | infinity::memory::Buffer *sendBuffer = new infinity::memory::Buffer(context, MAX_BUFFER_SIZE * sizeof(char)); 114 | infinity::memory::Buffer *receiveBuffer = new infinity::memory::Buffer(context, sizeof(char)); 115 | context->postReceiveBuffer(receiveBuffer); 116 | 117 | printf("Sending first message\n"); 118 | qp->send(sendBuffer, sizeof(char), context->defaultRequestToken); 119 | context->defaultRequestToken->waitUntilCompleted(); 120 | 121 | printf("Performing measurement\n"); 122 | uint32_t rounds = (uint32_t) log2(MAX_BUFFER_SIZE); 123 | uint32_t messageSize = 1; 124 | 125 | for(uint32_t sizeIndex = 0; sizeIndex <= rounds; ++sizeIndex) { 126 | 127 | printf("Sending messages of size %d bytes\t", messageSize); 128 | fflush(stdout); 129 | 130 | struct timeval start; 131 | gettimeofday(&start, NULL); 132 | 133 | for(uint32_t i=0; isend(sendBuffer, messageSize, &requestToken); 138 | requestToken.waitUntilCompleted(); 139 | 140 | } else { 141 | 142 | qp->send(sendBuffer, messageSize, NULL); 143 | 144 | } 145 | } 146 | 147 | struct timeval stop; 148 | gettimeofday(&stop, NULL); 149 | 150 | uint64_t time = timeDiff(stop, start); 151 | double msgRate = ((double)(OPERATIONS_COUNT * 1000000L)) / time; 152 | double bandwidth = ((double) (OPERATIONS_COUNT * messageSize)) / (1024*1024) / (((double) time) / 1000000L); 153 | printf("%.3f msg/sec\t%.3f MB/sec\n", msgRate, bandwidth); 154 | fflush(stdout); 155 | 156 | messageSize *= 2; 157 | 158 | } 159 | 160 | printf("Waiting for notification from server\n"); 161 | infinity::core::receive_element_t receiveElement; 162 | while (!context->receive(&receiveElement)); 163 | 164 | delete receiveBuffer; 165 | delete sendBuffer; 166 | } 167 | 168 | delete qp; 169 | delete qpFactory; 170 | delete context; 171 | 172 | return 0; 173 | 174 | } 175 | 176 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 177 | return (stop.tv_sec * 1000000L + stop.tv_usec) - (start.tv_sec * 1000000L + start.tv_usec); 178 | } 179 | -------------------------------------------------------------------------------- /tests/infinity/test_multiread.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #define PORT_NUMBER 3344 29 
| #define SERVER_IP "155.198.152.17" 30 | 31 | #define NODE_COUNT 1000000 32 | #define FEATURE_DIM 128 33 | #define FEATURE_TYPE_SIZE 4 34 | #define TEST_COUNT 350000 35 | #define MAX_OUTSTANDING_REQ 1 36 | #define POST_LIST_SIZE 20 37 | #define CQ_MOD 25 38 | 39 | int min(int a, int b){ 40 | if(a < b){ 41 | return a; 42 | } 43 | return b; 44 | } 45 | 46 | 47 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 48 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 49 | (start.tv_sec * 1000000L + start.tv_usec); 50 | } 51 | 52 | // Usage: ./progam -s for server and ./program for client component 53 | int main(int argc, char **argv) { 54 | 55 | bool isServer = false; 56 | bool random = true; 57 | 58 | while (argc > 1) { 59 | if (argv[1][0] == '-') { 60 | switch (argv[1][1]) { 61 | case 's': { 62 | isServer = true; 63 | break; 64 | } 65 | case 'l': { 66 | random = false; 67 | break; 68 | } 69 | } 70 | } 71 | ++argv; 72 | --argc; 73 | } 74 | if(random){ 75 | printf("Test Random Data Access \n"); 76 | }else{ 77 | printf("Test Sequential Data Access \n"); 78 | } 79 | 80 | infinity::core::Context *context = new infinity::core::Context(); 81 | infinity::queues::QueuePairFactory *qpFactory = 82 | new infinity::queues::QueuePairFactory(context); 83 | infinity::queues::QueuePair *qp; 84 | 85 | if (isServer) { 86 | 87 | printf("Creating buffers to read from and write to\n"); 88 | std::cout << "Server Buffer Size " << NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 89 | infinity::memory::Buffer *bufferToReadWrite = 90 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 91 | infinity::memory::RegionToken *bufferToken = 92 | bufferToReadWrite->createRegionToken(); 93 | 94 | printf("Creating buffers to receive a message\n"); 95 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 96 | context->postReceiveBuffer(bufferToReceive); 97 | 98 | 99 | printf("Setting up connection (blocking)\n"); 100 | qpFactory->bindToPort(PORT_NUMBER); 101 | qp = qpFactory->acceptIncomingConnection( 102 | bufferToken, sizeof(infinity::memory::RegionToken)); 103 | 104 | printf("Waiting for message (blocking)\n"); 105 | infinity::core::receive_element_t receiveElement; 106 | while (!context->receive(&receiveElement)) 107 | ; 108 | 109 | printf("Message received\n"); 110 | delete bufferToReadWrite; 111 | delete bufferToReceive; 112 | 113 | } else { 114 | 115 | std::vector local_offsets(POST_LIST_SIZE, 0); 116 | std::vector remote_offsets(POST_LIST_SIZE, 0); 117 | int start_request = 0; 118 | int end_request = 0; 119 | infinity::queues::SendRequestBuffer send_buffer(POST_LIST_SIZE); 120 | 121 | printf("Connecting to remote node\n"); 122 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 123 | infinity::memory::RegionToken *remoteBufferToken = 124 | (infinity::memory::RegionToken *)qp->getUserData(); 125 | 126 | printf("Creating buffers\n"); 127 | std::vector buffers; 128 | infinity::memory::Buffer *buffer1Sided = 129 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 130 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 131 | 132 | 133 | printf("Reading content from remote buffer\n"); 134 | infinity::requests::RequestToken requestToken(context); 135 | 136 | // warm up 137 | 138 | printf("Warm up\n"); 139 | for (int k = 0; k < 10; k++) { 140 | int request_node = rand() % NODE_COUNT; 141 | uint64_t offset = request_node * 
FEATURE_DIM * FEATURE_TYPE_SIZE; 142 | //std::cout << "Getting Data From " << offset << " To " << offset + FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 143 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 144 | infinity::queues::OperationFlags(), &requestToken); 145 | requestToken.waitUntilCompleted(); 146 | } 147 | 148 | printf("Start Real Test \n"); 149 | auto start = std::chrono::system_clock::now(); 150 | int avaliable = MAX_OUTSTANDING_REQ; 151 | for (int k = 0; k < TEST_COUNT; k++) { 152 | for(int multi_read_index = 0; multi_read_index < POST_LIST_SIZE; multi_read_index ++){ 153 | int request_node = (k + multi_read_index) % NODE_COUNT; 154 | if(random){ 155 | request_node = rand() % NODE_COUNT; 156 | } 157 | uint64_t remote_node_offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 158 | local_offsets[multi_read_index] = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 159 | remote_offsets[multi_read_index] = remote_node_offset; 160 | } 161 | 162 | 163 | if(k % CQ_MOD == CQ_MOD -1){ 164 | qp->multiRead(buffer1Sided, local_offsets, remoteBufferToken, remote_offsets, FEATURE_DIM * FEATURE_TYPE_SIZE, 165 | infinity::queues::OperationFlags(), &requestToken, send_buffer); 166 | requestToken.waitUntilCompleted(); 167 | }else{ 168 | qp->multiRead(buffer1Sided, local_offsets, remoteBufferToken, remote_offsets, FEATURE_DIM * FEATURE_TYPE_SIZE, 169 | infinity::queues::OperationFlags(), nullptr, send_buffer); 170 | 171 | } 172 | } 173 | 174 | auto end = std::chrono::system_clock::now(); 175 | std::chrono::duration diff = end - start; 176 | printf("Avg Bandwidth is %f MB/s\n", (POST_LIST_SIZE * TEST_COUNT * FEATURE_DIM/ (1024.0 * 1024.0 ) ) * FEATURE_TYPE_SIZE / diff.count() ); 177 | 178 | printf("Sending message to remote host\n"); 179 | qp->send(buffer2Sided, &requestToken); 180 | requestToken.waitUntilCompleted(); 181 | 182 | delete buffer1Sided; 183 | delete buffer2Sided; 184 | } 185 | 186 | delete qp; 187 | delete qpFactory; 188 | delete context; 189 | 190 | return 0; 191 | } -------------------------------------------------------------------------------- /tests/infinity/test_read.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #define PORT_NUMBER 3344 29 | #define SERVER_IP "155.198.152.17" 30 | 31 | #define NODE_COUNT 1 32 | #define FEATURE_DIM 512 33 | #define FEATURE_TYPE_SIZE 4 34 | #define TEST_COUNT 10 35 | #define MAX_OUTSTANDING_REQ 1 36 | 37 | 38 | 39 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 40 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 41 | (start.tv_sec * 1000000L + start.tv_usec); 42 | } 43 | 44 | // Usage: ./progam -s for server and ./program for client component 45 | int main(int argc, char **argv) { 46 | 47 | bool isServer = false; 48 | bool random = false; 49 | 50 | while (argc > 1) { 51 | if (argv[1][0] == '-') { 52 | switch (argv[1][1]) { 53 | case 's': { 54 | isServer = true; 55 | break; 56 | } 57 | case 'r': { 58 | random = true; 59 | break; 60 | } 61 | } 62 | } 63 | ++argv; 64 | --argc; 65 | } 66 | 67 | infinity::core::Context *context = new infinity::core::Context(); 68 
| infinity::queues::QueuePairFactory *qpFactory = 69 | new infinity::queues::QueuePairFactory(context); 70 | infinity::queues::QueuePair *qp; 71 | 72 | if (isServer) { 73 | 74 | printf("Creating buffers to read from and write to\n"); 75 | std::cout << "Server Buffer Size " << NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 76 | infinity::memory::Buffer *bufferToReadWrite = 77 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 78 | infinity::memory::RegionToken *bufferToken = 79 | bufferToReadWrite->createRegionToken(); 80 | 81 | printf("Creating buffers to receive a message\n"); 82 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 83 | context->postReceiveBuffer(bufferToReceive); 84 | 85 | 86 | printf("Setting up connection (blocking)\n"); 87 | qpFactory->bindToPort(PORT_NUMBER); 88 | qp = qpFactory->acceptIncomingConnection( 89 | bufferToken, sizeof(infinity::memory::RegionToken)); 90 | 91 | printf("Waiting for message (blocking)\n"); 92 | infinity::core::receive_element_t receiveElement; 93 | while (!context->receive(&receiveElement)) 94 | ; 95 | 96 | printf("Message received\n"); 97 | delete bufferToReadWrite; 98 | delete bufferToReceive; 99 | 100 | } else { 101 | 102 | printf("Connecting to remote node\n"); 103 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 104 | infinity::memory::RegionToken *remoteBufferToken = 105 | (infinity::memory::RegionToken *)qp->getUserData(); 106 | 107 | printf("Creating buffers\n"); 108 | std::vector buffers; 109 | infinity::memory::Buffer *buffer1Sided = 110 | new infinity::memory::Buffer(context, FEATURE_DIM * FEATURE_TYPE_SIZE); 111 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 112 | 113 | 114 | printf("Reading content from remote buffer\n"); 115 | std::vector requests; 116 | for (int i = 0; i < 1000; i++) { 117 | requests.push_back(new infinity::requests::RequestToken(context)); 118 | } 119 | 120 | // warm up 121 | 122 | printf("A little Warmup \n"); 123 | for (int k = 0; k < 10; k++) { 124 | int request_node = rand() % NODE_COUNT; 125 | uint64_t offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 126 | //std::cout << "Getting Data From " << offset << " To " << offset + FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 127 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 128 | infinity::queues::OperationFlags(), requests[k]); 129 | requests[k]->waitUntilCompleted(); 130 | } 131 | 132 | printf("Start Real Test \n"); 133 | auto start = std::chrono::system_clock::now(); 134 | int avaliable = MAX_OUTSTANDING_REQ; 135 | for (int k = 0; k < TEST_COUNT; k++) { 136 | int request_node = k; 137 | if(random){ 138 | request_node = rand() % NODE_COUNT; 139 | } 140 | 141 | uint64_t offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 142 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 143 | infinity::queues::OperationFlags(), requests[k % 1000]); 144 | avaliable -= 1; 145 | if(avaliable == 0){ 146 | requests[k % MAX_OUTSTANDING_REQ]->waitUntilCompleted(); 147 | avaliable += 1; 148 | } 149 | } 150 | 151 | // make sure all finished 152 | for (int k = 0; k < MAX_OUTSTANDING_REQ; k++) { 153 | requests[k % MAX_OUTSTANDING_REQ]->waitUntilCompleted(); 154 | } 155 | 156 | 157 | auto end = std::chrono::system_clock::now(); 158 | std::chrono::duration diff = end - start; 159 | printf("Avg Bandwidth is %f MB/s\n", TEST_COUNT 
* FEATURE_DIM * FEATURE_TYPE_SIZE / (1024.0 * 1024.0 ) / diff.count() ); 160 | 161 | printf("Sending message to remote host\n"); 162 | qp->send(buffer2Sided, requests[0]); 163 | requests[0]->waitUntilCompleted(); 164 | 165 | delete buffer1Sided; 166 | delete buffer2Sided; 167 | } 168 | 169 | delete qp; 170 | delete qpFactory; 171 | delete context; 172 | 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /tests/python/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "155.198.152.17"#"127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For Reddit Training 16 | SAMPLE_PARAM = [25, 10] 17 | BATCH_SIZE = 256 18 | -------------------------------------------------------------------------------- /tests/python/preprocess_Dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import quiver 3 | from torch_geometric.datasets import Reddit 4 | import os.path as osp 5 | 6 | def reindex_with_random(adj_csr, graph_feature=None, hot_ratio=0): 7 | 8 | node_count = adj_csr.indptr.shape[0] - 1 9 | total_range = torch.arange(node_count, dtype=torch.long) 10 | cold_ratio = 1 - hot_ratio 11 | cold_part = int(node_count * cold_ratio) 12 | hot_part = node_count - cold_part 13 | perm_range = torch.randperm(cold_part) + hot_part 14 | # sort and shuffle 15 | degree = adj_csr.indptr[1:] - adj_csr.indptr[:-1] 16 | _, prev_order = torch.sort(degree, descending=True) 17 | new_order = torch.zeros_like(prev_order) 18 | prev_order[hot_part:] = prev_order[perm_range] 19 | new_order[prev_order] = total_range 20 | if graph_feature is not None: 21 | graph_feature = graph_feature[prev_order] 22 | 23 | return graph_feature, new_order 24 | 25 | def reindex_with_certain(adj_csr, graph_feature=None, hot_ratio=0): 26 | node_count = adj_csr.indptr.shape[0] - 1 27 | total_range = torch.arange(node_count, dtype=torch.long) 28 | print("node count", node_count) 29 | cold_ratio = 1 - hot_ratio 30 | cold_part = int(node_count * cold_ratio) 31 | hot_part = node_count - cold_part 32 | 33 | # sort 34 | degree = adj_csr.indptr[1:] - adj_csr.indptr[:-1] 35 | _, prev_order = torch.sort(degree, descending=True) 36 | hot_part_order = prev_order[:hot_part] 37 | 38 | total_range_set = set(total_range.tolist()) 39 | hot_part_set = set(hot_part_order.tolist()) 40 | cold_part_set = total_range_set - hot_part_set 41 | 42 | cold_part_order = torch.LongTensor(list(cold_part_set)) 43 | new_order = torch.zeros_like(prev_order) 44 | print(hot_part_order.shape, cold_part_order.shape, total_range.shape) 45 | 46 | 47 | 48 | new_order[torch.cat([hot_part_order, cold_part_order])] = total_range 49 | 50 | new_feature = torch.cat((graph_feature[hot_part_order], graph_feature[cold_part_order])) 51 | 52 | return new_feature, new_order 53 | 54 | 55 | 56 | 57 | 58 | def load_topo_paper100M(): 59 | indptr = torch.load("/data/papers/ogbn_papers100M/csr/indptr.pt") 60 | indices = torch.load("/data/papers/ogbn_papers100M/csr/indices.pt") 61 | train_idx = torch.load("/data/papers/ogbn_papers100M/index/train_idx.pt") 62 | csr_topo = quiver.CSRTopo(indptr=indptr, indices=indices) 63 | quiver_sampler = 
quiver.pyg.GraphSageSampler(csr_topo, [15, 10, 5], 0, mode="UVA") 64 | print(f"Graph Stats:\tNodes:{csr_topo.node_count}\tEdges:{csr_topo.edge_count}\tAvg_Deg:{csr_topo.edge_count / csr_topo.node_count}") 65 | return train_idx, csr_topo, quiver_sampler 66 | 67 | def load_feat_paper100M(): 68 | feat = torch.load("/data/papers/ogbn_papers100M/feat/feature.pt") 69 | print(f"Feature Stats:\tDim:{feat.shape[1]}") 70 | return feat 71 | 72 | def load_topo_mag240m(): 73 | pass 74 | 75 | def load_feat_mag240m(): 76 | pass 77 | 78 | 79 | def load_topo_reddit(): 80 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 81 | dataset = Reddit(path) 82 | data = dataset[0] 83 | csr_topo = quiver.CSRTopo(edge_index=data.edge_index) 84 | quiver_sampler = quiver.pyg.GraphSageSampler(csr_topo, [25, 10], 0, mode="UVA") 85 | train_idx = data.train_mask.nonzero(as_tuple=False).view(-1) 86 | return train_idx, csr_topo, quiver_sampler 87 | 88 | def load_feat_reddit(): 89 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 90 | dataset = Reddit(path) 91 | data = dataset[0] 92 | return data.x 93 | 94 | 95 | def preprocess_dataset(dataset="paper100m", cache_ratio = 0.0, method="certain"): 96 | if dataset == "paper100m": 97 | _, csr_topo, _ = load_topo_paper100M() 98 | 99 | feat = load_feat_paper100M() 100 | elif dataset == "mag240m": 101 | _, csr_topo, _ = load_topo_mag240m() 102 | feat = load_feat_mag240m() 103 | else: 104 | _, csr_topo, _ = load_topo_reddit() 105 | feat = load_feat_reddit() 106 | 107 | if method == "random": 108 | sorted_feature, sorted_order = reindex_with_random(csr_topo, feat, cache_ratio) 109 | else: 110 | sorted_feature, sorted_order = reindex_with_certain(csr_topo, feat, cache_ratio) 111 | 112 | 113 | torch.save(sorted_feature, f"/data/dalong/sorted_feature_{dataset}_{method}_{cache_ratio:.2f}.pt") 114 | torch.save(sorted_order, f"/data/dalong/sorted_order_{dataset}_{method}_{cache_ratio:.2f}.pt") 115 | 116 | 117 | def test_curve(dataset, method="certain", cache_ratio=0, partition_size=2): 118 | if dataset == "reddit": 119 | train_idx, csr_topo, quiver_sampler = load_topo_reddit() 120 | elif dataset == "paper100m": 121 | train_idx, csr_topo, quiver_sampler = load_topo_paper100M() 122 | 123 | 124 | sorted_order_path = f"/data/dalong/sorted_order_{dataset}_{method}_{cache_ratio:.2f}.pt" 125 | order_transform = None 126 | if sorted_order_path is not None: 127 | order_transform = torch.load(sorted_order_path) 128 | order_transform = order_transform.cuda() 129 | dataloader = torch.utils.data.DataLoader(train_idx, batch_size=256) 130 | hot_part = int(cache_ratio * csr_topo.node_count) 131 | cold_part = (csr_topo.node_count - int(cache_ratio * csr_topo.node_count)) // partition_size 132 | 133 | col_part_hit_count = 0 134 | hot_part_hit_count = 0 135 | total_count = 0 136 | for seeds in dataloader: 137 | n_id, _, _ = quiver_sampler.sample(seeds) 138 | n_id = n_id.cuda() 139 | feature_n_id = order_transform[n_id] 140 | col_part_hit_count += feature_n_id[torch.logical_and(feature_n_id > hot_part, feature_n_id < (hot_part + cold_part))].shape[0] 141 | hot_part_hit_count += feature_n_id[feature_n_id < hot_part].shape[0] 142 | total_count += feature_n_id.shape[0] 143 | print(f"Hot Part Hit Ratio:\t{hot_part_hit_count / total_count}\nCold Partition Hit Rate:\t{col_part_hit_count / total_count}") 144 | 145 | 146 | CACHE_RATIO = 0.0 147 | PARTITION_SIZE = 2 148 | METHOD = "random" 149 | DATASET = "paper100m" 150 | #preprocess_dataset(dataset=DATASET, 
cache_ratio=CACHE_RATIO, method=METHOD) 151 | test_curve(DATASET, cache_ratio = CACHE_RATIO, partition_size=PARTITION_SIZE, method=METHOD) 152 | -------------------------------------------------------------------------------- /tests/python/test_DGLUnifiedTensor.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import dgl 4 | import torch 5 | import numpy as np 6 | from texttable import Texttable 7 | 8 | NUM_ELEMENT = 400000000 9 | FEATURE_DIM = 128 10 | SAMPLE_SIZE = 80000 11 | LOOP_NUM = 10 12 | 13 | features = torch.empty((NUM_ELEMENT, FEATURE_DIM)) 14 | features = dgl.contrib.UnifiedTensor(features, device=torch.device('cuda')) 15 | 16 | results = np.empty([1, 3], dtype=int) 17 | for idx in range(LOOP_NUM): 18 | sample_idx = torch.randint(0, high=NUM_ELEMENT - 1, size=(SAMPLE_SIZE, )).to('cuda') 19 | 20 | torch.cuda.synchronize() 21 | start = time.time() 22 | 23 | data = features[sample_idx] 24 | 25 | torch.cuda.synchronize() 26 | end = time.time() 27 | consumed = end - start 28 | 29 | results = np.append(results, [[idx, NUM_ELEMENT * FEATURE_DIM * 4 / 1024 / 1024 / 1024, data.numel() * 4 / 1024 / 1024 / consumed]], axis=0) 30 | 31 | results = np.append(results, [np.mean(results[1:], axis=0)], axis=0) 32 | results = results.tolist() 33 | 34 | results[0] = ['', 'Tensor Size (GB)', 'Throughput (MB/s)'] 35 | results[-1][0] = 'Avg' 36 | 37 | table = Texttable() 38 | table.set_deco(Texttable.HEADER) 39 | table.set_cols_dtype(['a', 't', 't']) 40 | table.add_rows(results) 41 | print(table.draw()) -------------------------------------------------------------------------------- /tests/python/test_DistHelper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | 4 | import os 5 | import quiver_feature 6 | from quiver_feature import Range 7 | from quiver_feature import DistHelper 8 | 9 | MASTER_ADDR = '155.198.152.17' 10 | MASTER_PORT = 5678 11 | 12 | MY_SERVER_RANK = 1 13 | SERVER_WORLD_SIZE = 2 14 | 15 | 16 | dist_helper = DistHelper(MASTER_ADDR, MASTER_PORT, SERVER_WORLD_SIZE, MY_SERVER_RANK) 17 | LOCAL_RANGE = Range(MY_SERVER_RANK * 100, MY_SERVER_RANK * 200) 18 | tensor_endpoints = dist_helper.exchange_tensor_endpoints_info(LOCAL_RANGE) 19 | 20 | print("Check TensorEndPoint", tensor_endpoints) 21 | time.sleep(MY_SERVER_RANK * 5 + 1) 22 | print(f"Rank {MY_SERVER_RANK} Finished, Begin To Sync") 23 | dist_helper.sync_all() 24 | print(f"Rank {MY_SERVER_RANK} Finished, Bye Bye") 25 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorClient.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | import config 5 | 6 | import time 7 | 8 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 9 | local_com_endpoint = qvf.ComEndPoint(0, config.MASTER_IP, config.PORT_NUMBER) 10 | remote_com_endpoint = qvf.ComEndPoint(1, config.MASTER_IP, config.PORT_NUMBER) 11 | dist_tensor_client = qvf.DistTensorClient(0, [local_com_endpoint, remote_com_endpoint], pipe_param) 12 | registered_tensor = dist_tensor_client.create_registered_float32_tensor([config.SAMPLE_NUM, config.FEATURE_DIM]) 13 | 14 | print("Before Collect, Check RegisteredTensor Shape ", registered_tensor.shape) 15 | local_idx = torch.arange(0, config.SAMPLE_NUM, dtype=torch.int64) 16 | remote_idx = torch.randint(0, 
config.NODE_COUNT, (config.SAMPLE_NUM, ), dtype=torch.int64) 17 | 18 | if config.TEST_TLB_OPTIMIZATION: 19 | print("Using TLB Optimization") 20 | remote_idx, _= torch.sort(remote_idx) 21 | 22 | local_offsets = local_idx * config.FEATURE_DIM * config.FEATURE_TYPE_SIZE 23 | remote_offsets = remote_idx * config.FEATURE_DIM * config.FEATURE_TYPE_SIZE 24 | 25 | # warm up 26 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets, remote_offsets) 27 | #registered_tensor[:] = 0 28 | 29 | start_time = time.time() 30 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets, remote_offsets) 31 | consumed = time.time() - start_time 32 | 33 | print("Begin To Check Result...") 34 | registered_tensor = registered_tensor.to('cpu') 35 | for row in range(config.SAMPLE_NUM): 36 | if not all(registered_tensor[row] == remote_idx[row]): 37 | print(f"Result Check Failed At {row}, Expected {remote_idx[row]}, But got {registered_tensor[row]}, Local Offsets {local_offsets[row]}, Remote Offsets {remote_offsets[row]}") 38 | exit() 39 | print(f"Result Check Passed!, Throughput = {registered_tensor.numel() * 4 / 1024 / 1024 / consumed} MB/s") 40 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import time 5 | from typing import List 6 | import config 7 | from quiver_feature import TensorEndPoint, Range, DistTensorDeviceParam, DistTensorServerParam, PipeParam 8 | from quiver_feature import DistHelper 9 | from quiver_feature import DistTensorPGAS 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('-rank', type=int, default=0, help='rank') 13 | parser.add_argument('-device', type=int, default=0, help="device idx") 14 | parser.add_argument('-world_size', type=int, default=1, help="world size") 15 | parser.add_argument('-start_server', type=int, default=1, help='whether to start server') 16 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 17 | 18 | args = parser.parse_args() 19 | 20 | NUM_ELEMENT = 1000000 21 | FEATURE_DIM = 600 22 | SAMPLE_SIZE = 80000 23 | 24 | DEVICE_RANK = args.device 25 | WORLD_SIZE = args.world_size 26 | START_SERVER = args.start_server 27 | CACHE_RATIO = args.cache_ratio 28 | LOCAL_SERVER_RANK = args.rank 29 | 30 | 31 | torch.cuda.set_device(DEVICE_RANK) 32 | 33 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * WORLD_SIZE)) 34 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * WORLD_SIZE - cached_range.end) // WORLD_SIZE 35 | 36 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 37 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 38 | 39 | tensor = torch.from_numpy(host_tensor).type(torch.float32).share_memory_() 40 | 41 | 42 | 43 | range_list = [] 44 | for idx in range(WORLD_SIZE): 45 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 46 | range_list.append(range_item) 47 | 48 | 49 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, WORLD_SIZE, LOCAL_SERVER_RANK) 50 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 51 | 52 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 53 | 54 | host_indice = np.random.randint(0, high= WORLD_SIZE * 
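NUM_ELEMENT - 1, size=(SAMPLE_SIZE, ))

# A sketch (ours, not part of this test) of how DistTensorPGAS resolves a global
# index under the layout built above: indices below cached_range.end fall in the
# hot region replicated on every server, and everything else belongs to the
# server whose Range covers it, shifted past the cached prefix in that server's
# local tensor. DistTensorPGAS performs this lookup internally; the helper below
# is only illustrative.
def _locate_sketch(global_idx, tensor_endpoints, cached):
    if global_idx < cached.end:
        # served from the locally replicated cache
        return LOCAL_SERVER_RANK, global_idx
    for tep in tensor_endpoints:
        if tep.range.start <= global_idx < tep.range.end:
            # local row = cached prefix + position inside this server's shard
            return tep.server_rank, cached.end + (global_idx - tep.range.start)
    raise IndexError(f"index {global_idx} outside every server range")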
55 | indices = torch.from_numpy(host_indice).type(torch.long) 56 | indices_device = indices.to(DEVICE_RANK) 57 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * WORLD_SIZE) 58 | 59 | device_param = DistTensorDeviceParam(device_list=[DEVICE_RANK], device_cache_size="8G", cache_policy="device_replicate") 60 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size= WORLD_SIZE) 61 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, tensor.shape[1]] 62 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 63 | 64 | dist_tensor = DistTensorPGAS(LOCAL_SERVER_RANK, tensor_endpoints_list, pipe_param, buffer_shape, cached_range) 65 | dist_tensor.from_cpu_tensor(tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 66 | 67 | 68 | start = time.time() 69 | data = dist_tensor[indices_device] 70 | consumed = time.time() - start 71 | 72 | data = data.cpu() 73 | data_gt = whole_tensor[indices] 74 | 75 | assert torch.equal(data, data_gt), "Result Check Failed!" 76 | 77 | print(f"Result Check Succeeded! Throughput = {data.numel() * 4 / 1024 / 1024 / consumed} MB/s") 78 | 79 | dist_helper.sync_all() 80 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorRPC.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed.rpc as rpc 3 | #from tmp import DistTensorRPC 4 | from tmp import DistTensorRPC 5 | from quiver_feature import Range 6 | 7 | import numpy as np 8 | from quiver.shard_tensor import ShardTensorConfig, ShardTensor 9 | import argparse 10 | import os 11 | import time 12 | 13 | 14 | 15 | """ 16 | 1. CPU & IB 17 | 2. Komodo1,2,3 18 | 3. 
can we do some GPU sampling when waiting for network 19 | """ 20 | os.environ['MASTER_ADDR'] = '155.198.152.17' 21 | os.environ['MASTER_PORT'] = '5678' 22 | 23 | os.environ["NCCL_SOCKET_IFNAME"] = "eth0" 24 | os.environ["TP_SOCKET_IFNAME"] = "eth0" 25 | os.environ["GLOO_SOCKET_IFNAME"] = "eth0" 26 | os.environ["TP_VERBOSE_LOGGING"] = "0" 27 | 28 | 29 | 30 | 31 | 32 | parser = argparse.ArgumentParser(description='python3 test.py -rank x -world_size x -cpu_collect True for test CPU') 33 | parser.add_argument('-rank', type=int, help='rank') 34 | parser.add_argument('-local_rank', type=int, default=0, help="local rank") 35 | parser.add_argument('-world_size', type=int, help="world size") 36 | parser.add_argument("-device_per_node", type=int, default=1, help ="device per node") 37 | parser.add_argument("-cpu_collect", type=int, default=0, help ="test for cpu collection") 38 | parser.add_argument("-cpu_collect_gpu_send", type=int, default=0, help ="send from gpu") 39 | parser.add_argument("-test_ib", type=int, default=1, help ="test IB") 40 | 41 | args = parser.parse_args() 42 | device_map = {} 43 | for idx in range(args.world_size): 44 | device_map[f"worker{idx}"] = {} 45 | for device_idx in range(args.device_per_node): 46 | device_map[f"worker{idx}"][device_idx] = device_idx 47 | 48 | print(f"Device Map: {device_map}") 49 | print(f"Rank {args.rank}: Test Mode Is {'CPU' if args.cpu_collect else 'GPU'}") 50 | """ 51 | All transports and channels we have: 52 | 53 | V0327 07:52:54.252611 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport ibv 54 | V0327 07:52:54.252761 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport uv 55 | V0327 07:52:54.261135 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport shm 56 | V0327 07:52:54.261295 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_basic 57 | V0327 07:52:54.262006 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_xth 58 | V0327 07:52:54.262173 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cma 59 | V0327 07:52:54.276424 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_ipc 60 | V0327 07:52:54.276447 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel basic 61 | V0327 07:52:54.278730 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel mpt_uv 62 | """ 63 | 64 | if args.cpu_collect and args.test_ib: 65 | # python3 test.py -cpu_collect 1 -test_ib 1 66 | print("Transports: IBV, Channel: BASIC") 67 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['basic']) 68 | elif args.cpu_collect: 69 | # python3 test.py -cpu_collect 1 -test_ib 0 70 | print("Transports: UV, Channel: MPT_UV") 71 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['uv'], _channels=['mpt_uv']) 72 | elif args.test_ib: 73 | # python3 test.py -cpu_collect 0 -test_ib 1 74 | print("Transports: IBV, Channel: CUDA_BASIC") 75 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['cuda_basic']) 76 | else: 77 | # python3 test.py -cpu_collect 0 -test_ib 0 78 | print("Transports: UV, Channel: CUDA_BASIC") 79 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, 
_transports=['uv'], _channels=['cuda_basic']) 80 | 81 | if args.cpu_collect and args.cpu_collect_gpu_send: 82 | 83 | # python3 test.py -cpu_collect 1 -test_ib 1 -cpu_collect_gpu_send 1 84 | print("CPU Collect and GPU Send, Update To: Transports: IBV, Channel: CUDA_BASIC") 85 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['cuda_basic']) 86 | 87 | debug_param = {"cpu_collect_gpu_send": args.cpu_collect_gpu_send} 88 | 89 | NUM_ELEMENT = 1000000 90 | FEATURE_DIM = 600 91 | SAMPLE_SIZE = 80000 92 | 93 | ######################### 94 | # Init With Numpy 95 | ######################## 96 | torch.cuda.set_device(args.local_rank) 97 | cached_ratio = 0.0 98 | cached_range = Range(0, int(cached_ratio * NUM_ELEMENT * args.world_size // args.device_per_node)) 99 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * args.world_size // args.device_per_node - cached_range.end) // (args.world_size // args.device_per_node) 100 | 101 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 102 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 103 | 104 | tensor = torch.from_numpy(host_tensor).type(torch.float32) 105 | 106 | 107 | shard_tensor_config = ShardTensorConfig({args.local_rank: "8G"}) 108 | shard_tensor = ShardTensor(args.local_rank, shard_tensor_config) 109 | shard_tensor.from_cpu_tensor(tensor) 110 | 111 | 112 | range_list = [] 113 | for idx in range(args.world_size // args.device_per_node): 114 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 115 | for _ in range(args.device_per_node): 116 | range_list.append(range_item) 117 | 118 | 119 | host_indice = np.random.randint(0, high= (args.world_size // args.device_per_node) * NUM_ELEMENT - 1, size=(SAMPLE_SIZE, )) 120 | indices = torch.from_numpy(host_indice).type(torch.long) 121 | 122 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * (args.world_size // args.device_per_node)) 123 | 124 | print(f"Whole Tensor Shape: {whole_tensor.shape}") 125 | print(f"Shard Tensor Shape: {shard_tensor.shape}") 126 | 127 | # TODO Just For Debugging 128 | if args.cpu_collect_gpu_send or not args.cpu_collect: 129 | indices = indices.to(args.local_rank) 130 | 131 | if args.cpu_collect or args.cpu_collect_gpu_send: 132 | print(f"Using CPU Collect") 133 | dist_tensor = DistTensorRPC(args.world_size, args.rank, args.device_per_node, args.local_rank, tensor, range_list, rpc_option, cached_range, **debug_param) 134 | else: 135 | dist_tensor = DistTensorRPC(args.world_size, args.rank, args.device_per_node, args.local_rank, shard_tensor, range_list, rpc_option, cached_range, **debug_param) 136 | 137 | warm_up = 4 138 | for idx in range(warm_up): 139 | data = dist_tensor[indices] 140 | 141 | test_count = 100 142 | consumed_time = 0 143 | data_times = [] 144 | for idx in range(test_count): 145 | start = time.time() 146 | data = dist_tensor[indices] 147 | data_times.append(time.time() - start) 148 | 149 | data_cpu = data.cpu() 150 | indices_cpu = indices.cpu() 151 | data_gt = whole_tensor[indices_cpu] 152 | 153 | assert torch.equal(data_gt, data_cpu) 154 | 155 | data_times = np.array(data_times) 156 | data_times = np.sort(data_times) 157 | data_times = data_times[int(0.1 * test_count): -int(0.1 * test_count)] 158 | consumed_time = np.sum(data_times) 159 | print(f"Bandwidth in Rank {args.rank} = {data_times.shape[0] * torch.numel(data) * 4 / 
1024 / 1024 / 1024 / consumed_time }GB/s") 160 | time.sleep(10) 161 | rpc.shutdown() 162 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorServer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | import config 4 | 5 | 6 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 7 | 8 | data = torch.empty((config.NODE_COUNT, config.FEATURE_DIM), dtype=torch.float) 9 | for row in range(config.NODE_COUNT): 10 | data[row] = row 11 | 12 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, 2, config.QP_NUM) 13 | dist_tensor_server.serve_tensor(data) 14 | dist_tensor_server.join() 15 | -------------------------------------------------------------------------------- /tests/python/test_LocalTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.datasets import Reddit 3 | import os.path as osp 4 | import time 5 | from ogb.nodeproppred import PygNodePropPredDataset 6 | import quiver 7 | from quiver_feature import LocalTensorPGAS 8 | import quiver_feature 9 | 10 | def load_products(): 11 | root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products') 12 | dataset = PygNodePropPredDataset('ogbn-products', root) 13 | data = dataset[0] 14 | return data.x 15 | 16 | 17 | def load_reddit(): 18 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 19 | dataset = Reddit(path) 20 | data = dataset[0] 21 | return data.x 22 | 23 | def load_mag240_partition(): 24 | tensor = quiver_feature.shared_load("/data/dalong/front_half.pt") 25 | return tensor 26 | 27 | 28 | TEST_COUNT = 100 29 | SAMPLE_NUM = 80000 30 | 31 | def test_normal_feature_collect(dataset="reddit"): 32 | if dataset == "reddit": 33 | tensor = load_reddit() 34 | elif dataset == "mag240m": 35 | tensor = load_mag240_partition() 36 | else: 37 | tensor = load_products() 38 | 39 | consumed = 0 40 | res = None 41 | 42 | for _ in range(TEST_COUNT): 43 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cpu") 44 | start = time.time() 45 | res = tensor[indices] 46 | consumed += time.time() - start 47 | 48 | print(f"Throughput = {TEST_COUNT * res.numel() * 4 / consumed / 1024 / 1024 / 1024 :.4f} GB/s") 49 | 50 | def test_LocalTensorPGAS(dataset="reddit", device_nums = 1, device_cache_size = 0, cache_policy = "device_replicate"): 51 | 52 | print(f"Dataset: {dataset}, Device Num: {device_nums}, Device Cache Size: {device_cache_size}, Cache Policy: {cache_policy}") 53 | if dataset == "reddit": 54 | tensor = load_reddit() 55 | elif dataset == "mag240m": 56 | 57 | tensor = load_mag240_partition() 58 | else: 59 | tensor = load_products() 60 | 61 | tensor.share_memory_() 62 | 63 | local_tensor_pgas = LocalTensorPGAS(device_list=list(range(device_nums)), device_cache_size=device_cache_size, cache_policy=cache_policy) 64 | local_tensor_pgas.from_cpu_tensor(tensor) 65 | 66 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cuda:0") 67 | res = local_tensor_pgas[indices] 68 | torch.cuda.synchronize() 69 | 70 | consumed = 0 71 | res = None 72 | 73 | for _ in range(TEST_COUNT): 74 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cuda:0") 75 | torch.cuda.synchronize() 76 | start = time.time() 77 | res = local_tensor_pgas[indices] 78 | torch.cuda.synchronize() 79 | consumed += time.time() - start 80 | 81 | 
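print(f"Throughput = {TEST_COUNT * res.numel() * tensor.element_size() / consumed / 1024 / 1024 / 1024 :.4f} GB/s")

# The synchronize/start/gather/synchronize pattern above is what makes these
# numbers meaningful: gathers issued on the GPU are asynchronous, so a bare
# time.time() pair would mostly measure launch latency. A sketch of the same
# measurement with CUDA events, which avoids the full device sync (illustrative
# only; the helper name is ours, not part of this repo):
def _time_gather_sketch(local_tensor_pgas, indices):
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    res = local_tensor_pgas[indices]
    end_evt.record()
    end_evt.synchronize()
    return res, start_evt.elapsed_time(end_evt) / 1000.0  # elapsed_time is in ms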
82 | 83 | 84 | if __name__ == "__main__": 85 | """ 86 | Set shm size as your whole memory size 87 | sudo mount -o remount,size=377G /dev/shm 88 | """ 89 | quiver.init_p2p([0, 1]) 90 | #test_normal_feature_collect() 91 | test_LocalTensorPGAS("mag240m", device_cache_size="30G", device_nums=2, cache_policy="p2p_clique_replicate") 92 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorClientServer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import numpy as np 5 | import time 6 | import threading 7 | from typing import List 8 | import qvf 9 | import config 10 | import torch.multiprocessing as mp 11 | from quiver_feature import TensorEndPoint, Range 12 | from quiver_feature import DistHelper 13 | from quiver_feature import DistTensorPGAS 14 | 15 | MASTER_IP = "155.198.152.17" 16 | HLPER_PORT = 5678 17 | 18 | NUM_ELEMENT = 10000000 * 3 * 2 * 2 * 2 * 2 19 | FEATURE_DIM = 128 20 | SAMPLE_SIZE = 250000 21 | 22 | 23 | parser = argparse.ArgumentParser(description='') 24 | parser.add_argument('-server_rank', type=int, help='server_rank') 25 | parser.add_argument('-device_per_node', type=int, help="how many process per server") 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | def feature_process(rank, server_rank, tensor_endpoints, cached_range, SAMPLE_SIZE, FEATURE_DIM): 31 | 32 | torch.cuda.set_device(rank) 33 | peer_tensor_endpoint = None 34 | for tensor_endpoint in tensor_endpoints: 35 | if tensor_endpoint.server_rank != server_rank: 36 | peer_tensor_endpoint = tensor_endpoint 37 | break 38 | host_indice = np.random.randint(peer_tensor_endpoint.range.start, high= peer_tensor_endpoint.range.end, size=(SAMPLE_SIZE, )) 39 | indices = torch.from_numpy(host_indice).type(torch.long) 40 | 41 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 42 | dist_tensor = DistTensorPGAS(server_rank, tensor_endpoints, pipe_param, [SAMPLE_SIZE, FEATURE_DIM], None, cached_range) 43 | 44 | 45 | TEST_COUNT = 1 46 | start = time.time() 47 | consumed = 0 48 | for i in range(TEST_COUNT): 49 | 50 | host_indice = np.random.randint(peer_tensor_endpoint.range.start, high= peer_tensor_endpoint.range.end, size=(SAMPLE_SIZE, )) 51 | indices = torch.from_numpy(host_indice).type(torch.long) 52 | if config.TEST_TLB_OPTIMIZATION: 53 | indices, _ = torch.sort(indices) 54 | 55 | local_offsets = torch.arange(0, SAMPLE_SIZE) * 4 * FEATURE_DIM 56 | remote_offsets = (indices - peer_tensor_endpoint.range.start) * 4 * FEATURE_DIM 57 | 58 | start = time.time() 59 | dist_tensor.dist_tensor_client.sync_read(peer_tensor_endpoint.server_rank, dist_tensor.registered_tensor, local_offsets, remote_offsets) 60 | consumed += time.time() - start 61 | 62 | print(f"
Throughput = {dist_tensor.registered_tensor.numel() * 4 * TEST_COUNT/ 1024 / 1024 / consumed} MB/s") 63 | 64 | 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | 70 | SERVER_WORLD_SIZE = 2 71 | START_SERVER = True 72 | CACHE_RATIO = 0 73 | LOCAL_SERVER_RANK = args.server_rank 74 | 75 | 76 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * SERVER_WORLD_SIZE)) 77 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * SERVER_WORLD_SIZE - cached_range.end) // SERVER_WORLD_SIZE 78 | 79 | 80 | 81 | tensor = torch.empty((UNCACHED_NUM_ELEMENT + cached_range.end, FEATURE_DIM)) 82 | 83 | print(f"Check Tensor Size: {tensor.numel() * 4 / 1024 / 1024 / 1024} GB") 84 | 85 | 86 | 87 | # Decide Range Information 88 | range_list = [] 89 | for idx in range(SERVER_WORLD_SIZE): 90 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 91 | range_list.append(range_item) 92 | 93 | # Exchange information with each other 94 | dist_helper = DistHelper(MASTER_IP, HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 95 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 96 | 97 | # Start server thread 98 | def server_thread(dist_helper): 99 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, SERVER_WORLD_SIZE * args.device_per_node, config.QP_NUM) 100 | dist_tensor_server.serve_tensor(tensor) 101 | dist_helper.sync_start() 102 | dist_tensor_server.join() 103 | x = threading.Thread(target=server_thread, args=(dist_helper, )) 104 | x.daemon = True 105 | x.start() 106 | 107 | # Wait all servers start 108 | dist_helper.sync_end() 109 | 110 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 111 | 112 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(LOCAL_SERVER_RANK, tensor_endpoints_list, cached_range, SAMPLE_SIZE, FEATURE_DIM), join=True) 113 | 114 | time.sleep(10) 115 | dist_helper.sync_all() 116 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import time 5 | from typing import List 6 | import config 7 | import torch.multiprocessing as mp 8 | from quiver_feature import TensorEndPoint, Range, PipeParam, DistTensorDeviceParam, DistTensorServerParam 9 | from quiver_feature import DistHelper 10 | from quiver_feature import DistTensorPGAS 11 | 12 | NUM_ELEMENT = 1000000 13 | FEATURE_DIM = 600 14 | SAMPLE_SIZE = 80000 15 | 16 | 17 | parser = argparse.ArgumentParser(description='') 18 | parser.add_argument('-server_rank', type=int, default=0, help='server_rank') 19 | parser.add_argument('-device_per_node', type=int, default=1, help="how many process per server") 20 | parser.add_argument('-server_world_size', type=int, default=1, help="world size") 21 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def feature_process(rank, dist_tensor, whole_tensor, SAMPLE_SIZE): 27 | 28 | torch.cuda.set_device(rank) 29 | host_indice = np.random.randint(0, high=dist_tensor.shape[0] - 1, size=(SAMPLE_SIZE, )) 30 | indices = torch.from_numpy(host_indice).type(torch.long) 31 | indices_device = indices.to(rank) 32 | 33 | # warm up 34 | data = dist_tensor[indices_device] 35 | torch.cuda.synchronize() 36 | TEST_COUNT = 1000 37 | start = time.time() 38 | 
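consumed = 0

# TEST_TLB_OPTIMIZATION (checked in the loop below) sorts the indices before the
# reads are posted, so the remote offsets form a near-monotone sweep over the
# registered region. Our reading, following docs/rdma_details.md, is that this
# keeps the NIC's address-translation structures (the MTT) hitting instead of
# thrashing; treat that as a hypothesis -- the loop measures the effect rather
# than assuming it.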
39 | for i in range(TEST_COUNT): 40 | host_indice = np.random.randint(0, high=dist_tensor.shape[0] - 1, size=(SAMPLE_SIZE, )) 41 | indices = torch.from_numpy(host_indice).type(torch.long) 42 | if config.TEST_TLB_OPTIMIZATION: 43 | indices, _ = torch.sort(indices) 44 | 45 | indices_device = indices.to(rank) 46 | torch.cuda.synchronize() 47 | 48 | start = time.time() 49 | data = dist_tensor[indices_device] 50 | torch.cuda.synchronize() 51 | consumed += time.time() - start 52 | assert torch.equal(data.cpu(), whole_tensor[indices]), "Result Check Failed!" 53 | 54 | 55 | print(f"Result Check Succeeded! Throughput = {data.numel() * data.element_size() * TEST_COUNT / 1024 / 1024 / consumed} MB/s") 56 | 57 | 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | 63 | SERVER_WORLD_SIZE = args.server_world_size 64 | START_SERVER = True 65 | CACHE_RATIO = args.cache_ratio 66 | LOCAL_SERVER_RANK = args.server_rank 67 | 68 | 69 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * SERVER_WORLD_SIZE)) 70 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * SERVER_WORLD_SIZE - cached_range.end) // SERVER_WORLD_SIZE 71 | 72 | 73 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 74 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 75 | host_tensor = host_tensor.astype(np.float16) 76 | tensor = torch.from_numpy(host_tensor) 77 | 78 | 79 | # Decide Range Information 80 | range_list = [] 81 | for idx in range(SERVER_WORLD_SIZE): 82 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 83 | range_list.append(range_item) 84 | 85 | 86 | # Exchange information with each other 87 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 88 | print("Exchange Tensor End Point Information With Other Ranks") 89 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 90 | 91 | 92 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 93 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * SERVER_WORLD_SIZE) 94 | 95 | 96 | device_param = DistTensorDeviceParam(device_list=list(range(args.device_per_node)), device_cache_size="4G", cache_policy="device_replicate") 97 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size=args.server_world_size) 98 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, tensor.shape[1]] 99 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 100 | 101 | dist_tensor = DistTensorPGAS(args.server_rank, tensor_endpoints_list, pipe_param, buffer_shape, cached_range, dtype=tensor.dtype) 102 | dist_tensor.from_cpu_tensor(tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 103 | 104 | 105 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(dist_tensor, whole_tensor, SAMPLE_SIZE), join=True) 106 | 107 | dist_helper.sync_all() 108 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorRPC.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | 5 | 6 | import multiprocessing as mp 7 | 8 | 9 | def run(command): 10 | os.system(command) 11 | 12 | """ 13 | 1. CPU & IB 14 | 2. Komodo1,2,3 15 | 3. 
can we do some GPU sampling when waiting for network 16 | """ 17 | os.environ['MASTER_ADDR'] = '155.198.152.17' 18 | os.environ['MASTER_PORT'] = '5678' 19 | 20 | os.environ["NCCL_SOCKET_IFNAME"] = "eth0" 21 | os.environ["TP_SOCKET_IFNAME"] = "eth0" 22 | os.environ["GLOO_SOCKET_IFNAME"] = "eth0" 23 | os.environ["TP_VERBOSE_LOGGING"] = "0" 24 | 25 | 26 | 27 | parser = argparse.ArgumentParser(description='python3 test.py -rank x -world_size x -cpu_collect True for test CPU') 28 | parser.add_argument('-world_size', type=int, help="world size") 29 | parser.add_argument("-device_per_node", type=int, default=1, help ="device per node") 30 | parser.add_argument("-cpu_collect", type=int, default=0, help ="test for cpu collection") 31 | parser.add_argument("-cpu_collect_gpu_send", type=int, default=0, help ="send from gpu") 32 | parser.add_argument("-test_ib", type=int, default=1, help ="test IB") 33 | parser.add_argument("-start_rank", type=int, default=0, help ="first global rank to assign on this node") 34 | 35 | args = parser.parse_args() 36 | 37 | command = f"python3 test_DistTensorRPC.py -device_per_node {args.device_per_node} -cpu_collect {args.cpu_collect} -cpu_collect_gpu_send {args.cpu_collect_gpu_send} -test_ib {args.test_ib} -world_size {args.world_size}" 38 | 39 | process_lst = [] 40 | for local_rank in range(args.device_per_node): 41 | run_command = command + f" -rank {args.start_rank + local_rank} -local_rank {local_rank}" 42 | print(f"Run Command: {run_command}") 43 | process = mp.Process(target=run, args=(run_command, )) 44 | process.start() 45 | process_lst.append(process) 46 | 47 | for process in process_lst: 48 | process.join() 49 | -------------------------------------------------------------------------------- /tests/python/test_PipeParam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | pipe_param = qvf.PipeParam(1, 1, 1, 1) 5 | print(f"ParamVec: {pipe_param.get_param_vec()}") 6 | 7 | pipe_param2 = qvf.PipeParam() 8 | pipe_param2.set_param_vec(pipe_param.get_param_vec()) 9 | print(f"ParamVec: {pipe_param2.get_param_vec()}") 10 | 11 | -------------------------------------------------------------------------------- /tests/python/test_RealDataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.datasets import Reddit 3 | import os.path as osp 4 | import time 5 | from ogb.nodeproppred import PygNodePropPredDataset 6 | 7 | import argparse 8 | 9 | import numpy as np 10 | 11 | from typing import List 12 | import config 13 | import torch.multiprocessing as mp 14 | from quiver_feature import TensorEndPoint, Range, PipeParam, DistTensorServerParam, DistTensorDeviceParam 15 | from quiver_feature import DistHelper 16 | from quiver_feature import DistTensorPGAS 17 | 18 | 19 | def load_products(): 20 | root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products') 21 | dataset = PygNodePropPredDataset('ogbn-products', root) 22 | data = dataset[0] 23 | return data.x 24 | 25 | 26 | def load_reddit(): 27 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 28 | dataset = Reddit(path) 29 | data = dataset[0] 30 | return data.x 31 | 32 | def load_paper100M(dataset="paper100m", cache_ratio = 0.0, method="certain"): 33 | return torch.load(f"/data/dalong/sorted_feature_{dataset}_{method}_{cache_ratio:.2f}.pt") 34 | 35 | SAMPLE_SIZE = 80000 36 | 37 | 38 | parser = 
argparse.ArgumentParser(description='') 39 | parser.add_argument('-server_rank', type=int, default=0, help='server_rank') 40 | parser.add_argument('-device_per_node', type=int, default=1, help="how many process per server") 41 | parser.add_argument('-server_world_size', type=int, default=1, help="world size") 42 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 43 | 44 | args = parser.parse_args() 45 | 46 | 47 | def feature_process(rank, dist_tensor, whole_tensor, SAMPLE_SIZE): 48 | 49 | torch.cuda.set_device(rank) 50 | host_indice = np.random.randint(0, high= dist_tensor.size(0) - 1, size=(SAMPLE_SIZE, )) 51 | indices = torch.from_numpy(host_indice).type(torch.long) 52 | indices_device = indices.to(rank) 53 | 54 | # warm up 55 | data = dist_tensor[indices_device] 56 | torch.cuda.synchronize() 57 | TEST_COUNT = 1000 58 | start = time.time() 59 | consumed = 0 60 | for i in range(TEST_COUNT): 61 | host_indice = np.random.randint(0, high= dist_tensor.size(0) - 1, size=(SAMPLE_SIZE, )) 62 | indices = torch.from_numpy(host_indice).type(torch.long) 63 | if config.TEST_TLB_OPTIMIZATION: 64 | indices, _ = torch.sort(indices) 65 | indices_device = indices.to(rank) 66 | torch.cuda.synchronize() 67 | 68 | start = time.time() 69 | data = dist_tensor[indices_device] 70 | torch.cuda.synchronize() 71 | consumed += time.time() - start 72 | 73 | data = data.cpu() 74 | data_gt = whole_tensor[indices] 75 | 76 | assert torch.equal(data, data_gt), "Result Check Failed!" 77 | 78 | print(f"Result Check Succeeded! Throughput = {data.numel() * 4 * TEST_COUNT / 1024 / 1024 / consumed} MB/s") 79 | 80 | 81 | 82 | 83 | if __name__ == "__main__": 84 | 85 | 86 | tensor = load_reddit() 87 | SERVER_WORLD_SIZE = args.server_world_size 88 | START_SERVER = True 89 | CACHE_RATIO = 0 90 | LOCAL_SERVER_RANK = args.server_rank 91 | TOTAL_NODE_SIZE = tensor.shape[0] 92 | 93 | 94 | cached_range = Range(0, int(CACHE_RATIO * TOTAL_NODE_SIZE)) 95 | UNCACHED_NUM_ELEMENT = (TOTAL_NODE_SIZE - cached_range.end) // SERVER_WORLD_SIZE 96 | 97 | 98 | # Decide Range Information 99 | range_list = [] 100 | for idx in range(SERVER_WORLD_SIZE): 101 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 102 | range_list.append(range_item) 103 | 104 | # Build local_tensor 105 | local_tensor = torch.cat([tensor[cached_range.start: cached_range.end], tensor[range_list[args.server_rank].start: range_list[args.server_rank].end]]).share_memory_() 106 | 107 | # Exchange information with each other 108 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 109 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 110 | 111 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 112 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * SERVER_WORLD_SIZE) 113 | 114 | device_param = DistTensorDeviceParam(device_list=list(range(args.device_per_node)), device_cache_size="4G", cache_policy="device_replicate") 115 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size=args.server_world_size) 116 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, local_tensor.shape[1]] 117 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 118 | 119 | dist_tensor = DistTensorPGAS(args.server_rank, 
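tensor_endpoints_list, pipe_param, buffer_shape, cached_range)

# from_cpu_tensor below appears to bundle what test_MultiMachineDistTensorClientServer.py
# does by hand: start a qvf.DistTensorServer over the shared local shard, synchronize
# the servers through dist_helper, and build the per-device caches described by
# device_param. Roughly (our reading of the call, not a documented contract):
#
#   dist_tensor_server = qvf.DistTensorServer(port_num, total_client_count, qp_num)
#   dist_tensor_server.serve_tensor(local_tensor)  # register with the NIC + serve over RDMA
#   dist_helper.sync_start()                       # wait until every server is up
#   # ...then device caches are filled and remote reads may begin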
120 | dist_tensor.from_cpu_tensor(local_tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 121 | 122 | 123 | 124 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(dist_tensor, whole_tensor, SAMPLE_SIZE), join=True) 125 | 126 | dist_helper.sync_all() 127 | -------------------------------------------------------------------------------- /tests/python/test_RegisteredTensorTransfer.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import qvf 4 | import threading 5 | import config 6 | 7 | import time 8 | 9 | 10 | def server_thread(): 11 | print("Start Server Thread") 12 | data = torch.empty((config.NODE_COUNT, config.FEATURE_DIM), dtype=torch.float) 13 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, 1, config.QP_NUM) 14 | dist_tensor_server.serve_tensor(data) 15 | time.sleep(10) 16 | 17 | x = threading.Thread(target=server_thread) 18 | x.daemon = True 19 | x.start() 20 | 21 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 22 | local_com_endpoint = qvf.ComEndPoint(0, config.MASTER_IP, config.PORT_NUMBER) 23 | remote_com_endpoint = qvf.ComEndPoint(1, config.MASTER_IP, config.PORT_NUMBER) 24 | dist_tensor_client = qvf.DistTensorClient(0, [local_com_endpoint, remote_com_endpoint], pipe_param) 25 | registered_tensor = torch.zeros((config.SAMPLE_NUM, config.FEATURE_DIM)) 26 | registered_tensor = registered_tensor.pin_memory() 27 | dist_tensor_client.register_float32_tensor(registered_tensor) 28 | 29 | data_cuda = registered_tensor.cuda() 30 | torch.cuda.synchronize() 31 | 32 | start = time.time() 33 | data_cuda = registered_tensor.cuda() 34 | torch.cuda.synchronize() 35 | consumed = time.time() - start 36 | 37 | print(f"Transfer Throughput is {data_cuda.numel() * 4 / 1024 / 1024 / 1024 / consumed} GB/s") 38 | -------------------------------------------------------------------------------- /tests/python/test_SharedLoader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import quiver_feature 3 | import psutil 4 | import time 5 | from multiprocessing import Process 6 | import os 7 | 8 | def measure_process(parent_process_id): 9 | consumed = 0 10 | start = time.time() 11 | mem_use_lst = [] 12 | 13 | while consumed < 20: 14 | mem_use = psutil.Process(parent_process_id).memory_info().rss / 1024 / 1024 15 | time.sleep(0.0001) 16 | consumed += time.time() - start 17 | start = time.time() 18 | mem_use_lst.append(mem_use) 19 | 20 | print(f"Max Memory Usage: {max(mem_use_lst)} MB") 21 | 22 | def check_shared(t: torch.Tensor): 23 | print('tensor.is_shared() = {}'.format(t.is_shared())) 24 | 25 | def save_huge_tensor(): 26 | a = torch.zeros((10, 1024, 1024, 256)) 27 | torch.save(a, 'huge.pt') 28 | 29 | def torch_load_huge_shared_tensor(): 30 | a = torch.load('huge.pt') 31 | print(f"Original Data Size = {a.numel() * 4 / 1024 / 1024} MB") 32 | 33 | print(f"Before Shared:", end="\t") 34 | check_shared(a) 35 | 36 | a.share_memory_() 37 | 38 | print(f"After Shared:", end="\t") 39 | check_shared(a) 40 | 41 | del a 42 | 43 | 44 | def qvf_load_huge_shared_tensor(): 45 | a = quiver_feature.shared_load('huge.pt') 46 | print(f"Original Data Size = {a.numel() * 4 / 1024 / 1024} MB") 47 | 48 | print(f"Before Shared:", end="\t") 49 | check_shared(a) 50 | 51 | a.share_memory_() 52 | 53 | print(f"After 
Shared:", end="\t") 54 | check_shared(a) 55 | 56 | del a 57 | 58 | if __name__ == '__main__': 59 | #save_huge_tensor() 60 | 61 | sub_process = Process(target=measure_process, args=(os.getpid(),)) 62 | sub_process.start() 63 | 64 | # Test Pytorch's Data Loading 65 | #torch_load_huge_shared_tensor() 66 | 67 | # Test Quiver-Feature's SharedLoader 68 | qvf_load_huge_shared_tensor() 69 | 70 | sub_process.join() -------------------------------------------------------------------------------- /tests/python/test_TensorEndPoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | tensor_endpoint = qvf.TensorEndPoint("localhost", 3341, 0, 0, 60000) 5 | --------------------------------------------------------------------------------