├── .clang-format
├── .gitignore
├── .pre-commit-config.yaml
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── build.sh
├── csrc
│   ├── include
│   │   ├── infinity
│   │   │   ├── core
│   │   │   │   ├── Configuration.h
│   │   │   │   ├── Context.cpp
│   │   │   │   └── Context.h
│   │   │   ├── infinity.h
│   │   │   ├── memory
│   │   │   │   ├── Atomic.cpp
│   │   │   │   ├── Atomic.h
│   │   │   │   ├── Buffer.cpp
│   │   │   │   ├── Buffer.h
│   │   │   │   ├── Region.cpp
│   │   │   │   ├── Region.h
│   │   │   │   ├── RegionToken.cpp
│   │   │   │   ├── RegionToken.h
│   │   │   │   ├── RegionType.h
│   │   │   │   ├── RegisteredMemory.cpp
│   │   │   │   └── RegisteredMemory.h
│   │   │   ├── queues
│   │   │   │   ├── QueuePair.cpp
│   │   │   │   ├── QueuePair.h
│   │   │   │   ├── QueuePairFactory.cpp
│   │   │   │   └── QueuePairFactory.h
│   │   │   ├── requests
│   │   │   │   ├── RequestToken.cpp
│   │   │   │   └── RequestToken.h
│   │   │   └── utils
│   │   │       ├── Address.cpp
│   │   │       ├── Address.h
│   │   │       └── Debug.h
│   │   ├── miniz
│   │   │   └── miniz.h
│   │   └── qvf
│   │       ├── com_endpoint.h
│   │       ├── common.h
│   │       ├── dist_tensor_client.h
│   │       ├── dist_tensor_server.h
│   │       ├── pipe.h
│   │       ├── qvf.h
│   │       ├── range.h
│   │       ├── shared_loader.h
│   │       └── tensor_endpoint.h
│   └── src
│       ├── module.cpp
│       ├── register.cpp
│       └── shared_loader.cpp
├── docs
│   ├── imgs
│   │   ├── Network Bandwidth Under 100Gbps IB.png
│   │   ├── consistent_memory_view.png
│   │   ├── e2e_feature_collection.png
│   │   ├── e2e_feature_collection_performance.png
│   │   ├── gpu0_centered_access_performance.png
│   │   ├── memory_usage.png
│   │   ├── multi_qp.png
│   │   ├── one_batch_feature_collection.png
│   │   ├── peak_memory_footprint.png
│   │   ├── pgas_tensor_access.png
│   │   ├── pgas_tensor_view.png
│   │   ├── range_partition.png
│   │   ├── rdma_mtt.png
│   │   ├── shared_load.png
│   │   ├── subset_signaled_requests.png
│   │   └── train_gnn_on_large_graphs.png
│   ├── memory.md
│   ├── partition_methods.md
│   └── rdma_details.md
├── examples
│   ├── mag240m
│   │   ├── README.md
│   │   ├── config.py
│   │   ├── distribute_training.py
│   │   ├── preprocess.py
│   │   └── preprocess_quiver.py
│   ├── ogb-products
│   │   ├── config.py
│   │   └── distribute_training.py
│   └── reddit
│       ├── config.py
│       └── distribute_training.py
├── quiver_feature
│   ├── __init__.py
│   ├── common.py
│   ├── dist_helper.py
│   ├── dist_tensor_pgas.py
│   ├── dist_tensor_rpc.py
│   ├── local_tensor_pgas.py
│   ├── multiprocessing
│   │   ├── __init__.py
│   │   └── reductions.py
│   ├── tensor_loader.py
│   └── utils.py
├── setup.py
└── tests
    ├── cpp
    │   ├── test_DistTensorClient.cpp
    │   ├── test_DistTensorServer.cpp
    │   ├── test_Pipe.cpp
    │   └── test_main.cpp
    ├── infinity
    │   ├── feature_server.cpp
    │   ├── read-write-send.cpp
    │   ├── send-performance.cpp
    │   ├── test_multiread.cpp
    │   ├── test_multiread_multiqp.cpp
    │   └── test_read.cpp
    └── python
        ├── config.py
        ├── preprocess_Dataset.py
        ├── test_DGLUnifiedTensor.py
        ├── test_DistHelper.py
        ├── test_DistTensorClient.py
        ├── test_DistTensorPGAS.py
        ├── test_DistTensorRPC.py
        ├── test_DistTensorServer.py
        ├── test_LocalTensorPGAS.py
        ├── test_MultiMachineDistTensorClientServer.py
        ├── test_MultiMachineDistTensorPGAS.py
        ├── test_MultiMachineDistTensorRPC.py
        ├── test_PipeParam.py
        ├── test_RealDataset.py
        ├── test_RegisteredTensorTransfer.py
        ├── test_SharedLoader.py
        ├── test_TensorEndPoint.py
        └── tmp.py
/.clang-format:
--------------------------------------------------------------------------------
1 | BasedOnStyle: Chromium
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | .vscode/
132 | build/
133 | infinity_realease/
134 |
135 | examples/reddit/processed/
136 | examples/reddit/raw/
137 | tests/data/
138 |
139 | .idea/*
140 | cmake-build-debug/
141 |
142 | # OSX
143 | .DS_Store
144 |
145 | *.pt
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.2.0
4 | hooks:
5 | - id: end-of-file-fixer
6 | - id: trailing-whitespace
7 | - id: check-yaml
8 | exclude: |
9 | (?x)^(
10 | conda/pytorch-geometric/meta.yaml|
11 | conda/pyg/meta.yaml
12 | )$
13 | # - repo: https://github.com/adrienverge/yamllint.git
14 | # rev: v1.26.3
15 | # hooks:
16 | # - id: yamllint
17 | # args: [-c=.yamllint.yml]
18 |
19 | # - repo: https://github.com/regebro/pyroma
20 | # rev: "4.0"
21 | # hooks:
22 | # - id: pyroma
23 | # name: Check packaging
24 | # args: [--min=10, .]
25 |
26 | # - repo: https://github.com/pre-commit/mirrors-yapf
27 | # rev: v0.32.0
28 | # hooks:
29 | # - id: yapf
30 | # name: Format code
31 |
32 | # - repo: https://github.com/pycqa/isort
33 | # rev: 5.10.1
34 | # hooks:
35 | # - id: isort
36 | # name: Sort imports
37 |
38 | # - repo: https://github.com/PyCQA/flake8
39 | # rev: 4.0.1
40 | # hooks:
41 | # - id: flake8
42 | # name: Check PEP8
43 |
44 | - repo: https://github.com/pre-commit/mirrors-clang-format
45 | rev: v14.0.1
46 | hooks:
47 | - id: clang-format
48 | name: Format C++ code
49 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.12)
2 | project(quiver_feature)
3 | set(CMAKE_CXX_STANDARD 14)
4 | set(CMAKE_CUDA_STANDARD 14)
5 |
6 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
7 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
8 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
9 |
10 | file(GLOB HEADERS csrc/include/qvf/*.h csrc/include/infinity/*.h csrc/include/miniz/*.h csrc/include/infinity/core/*.h csrc/include/infinity/memory/*.h csrc/include/infinity/queues/*.h csrc/include/infinity/requests/*.h csrc/include/infinity/utils/*.h)
11 | file(GLOB SOURCES csrc/src/*.cpp csrc/include/miniz/*.c csrc/include/infinity/requests/*.cpp csrc/include/infinity/core/*.cpp csrc/include/infinity/memory/*.cpp csrc/include/infinity/queues/*.cpp csrc/include/infinity/utils/*.cpp)
12 | file(GLOB TEST_SOURCES tests/cpp/*.cpp)
13 |
14 | set_source_files_properties(${SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs")
15 | set_source_files_properties(${TEST_SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs")
16 |
17 | find_package(Python3 COMPONENTS Interpreter Development)
18 | find_package(Torch REQUIRED)
19 | add_library(${PROJECT_NAME} SHARED ${SOURCES})
20 | find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
21 |
22 | target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY})
23 | target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
24 | target_link_libraries(${PROJECT_NAME} PRIVATE ibverbs)
25 |
26 | if (PROF)
27 | target_link_options(${PROJECT_NAME} PRIVATE "-pg")
28 | endif()
29 |
30 | target_include_directories(${PROJECT_NAME} PUBLIC csrc/include)
31 |
32 | include(GNUInstallDirs)
33 | include(CMakePackageConfigHelpers)
34 |
35 | install(TARGETS ${PROJECT_NAME}
36 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME})
37 | install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME})
38 |
39 | if(BUILD_TEST)
40 | add_executable(cpp_test ${TEST_SOURCES})
41 | target_link_libraries(cpp_test PRIVATE ${TORCH_LIBRARIES})
42 | target_link_libraries(cpp_test PRIVATE Python3::Python)
43 | target_link_libraries(cpp_test PRIVATE ${PROJECT_NAME})
44 | endif()
45 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | ##################################################
2 | #
3 | # (c) 2018 Claude Barthels, ETH Zurich
4 | #
5 | # Call 'make library' to build the library
6 | # Call 'make examples' to build the examples
7 | # Call 'make all' to build everything
8 | #
9 | ##################################################
10 |
11 | PROJECT_NAME = libinfinity
12 |
13 | ##################################################
14 |
15 | CC = g++
16 | CC_FLAGS = -O3 -std=c++14
17 | LD_FLAGS = -linfinity -libverbs
18 |
19 | ##################################################
20 |
21 | SOURCE_FOLDER = csrc/include/
22 | BUILD_FOLDER = build/infinity
23 | RELEASE_FOLDER = build/infinity_release
24 | INCLUDE_FOLDER = include
25 | EXAMPLES_FOLDER = infinity/
26 |
27 | ##################################################
28 |
29 | SOURCE_FILES = $(SOURCE_FOLDER)/infinity/core/Context.cpp \
30 | $(SOURCE_FOLDER)/infinity/memory/Atomic.cpp \
31 | $(SOURCE_FOLDER)/infinity/memory/Buffer.cpp \
32 | $(SOURCE_FOLDER)/infinity/memory/Region.cpp \
33 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.cpp \
34 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.cpp \
35 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.cpp \
36 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.cpp \
37 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.cpp \
38 | $(SOURCE_FOLDER)/infinity/utils/Address.cpp
39 |
40 | HEADER_FILES = $(SOURCE_FOLDER)/infinity/infinity.h \
41 | $(SOURCE_FOLDER)/infinity/core/Context.h \
42 | $(SOURCE_FOLDER)/infinity/core/Configuration.h \
43 | $(SOURCE_FOLDER)/infinity/memory/Atomic.h \
44 | $(SOURCE_FOLDER)/infinity/memory/Buffer.h \
45 | $(SOURCE_FOLDER)/infinity/memory/Region.h \
46 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.h \
47 | $(SOURCE_FOLDER)/infinity/memory/RegionType.h \
48 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.h \
49 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.h \
50 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.h \
51 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.h \
52 | $(SOURCE_FOLDER)/infinity/utils/Debug.h \
53 | $(SOURCE_FOLDER)/infinity/utils/Address.h
54 |
55 | ##################################################
56 |
57 | OBJECT_FILES = $(patsubst $(SOURCE_FOLDER)/%.cpp,$(BUILD_FOLDER)/%.o,$(SOURCE_FILES))
58 | SOURCE_DIRECTORIES = $(dir $(HEADER_FILES))
59 | BUILD_DIRECTORIES = $(patsubst $(SOURCE_FOLDER)/%,$(BUILD_FOLDER)/%,$(SOURCE_DIRECTORIES))
60 |
61 | ##################################################
62 |
63 | all: library examples
64 |
65 | ##################################################
66 |
67 | $(BUILD_FOLDER)/%.o: $(SOURCE_FILES) $(HEADER_FILES)
68 | mkdir -p $(BUILD_FOLDER)
69 | mkdir -p $(BUILD_DIRECTORIES)
70 | $(CC) $(CC_FLAGS) -c $(SOURCE_FOLDER)/$*.cpp -I $(SOURCE_FOLDER) -o $(BUILD_FOLDER)/$*.o
71 |
72 | ##################################################
73 |
74 | library: $(OBJECT_FILES)
75 | mkdir -p $(RELEASE_FOLDER)
76 | ar rvs $(RELEASE_FOLDER)/$(PROJECT_NAME).a $(OBJECT_FILES)
77 | rm -rf $(RELEASE_FOLDER)/$(INCLUDE_FOLDER)
78 | cp --parents $(HEADER_FILES) $(RELEASE_FOLDER)
79 | mv $(RELEASE_FOLDER)/$(SOURCE_FOLDER)/ $(RELEASE_FOLDER)/$(INCLUDE_FOLDER)
80 |
81 | ##################################################
82 |
83 | clean:
84 | rm -rf $(BUILD_FOLDER)
85 | rm -rf $(RELEASE_FOLDER)
86 |
87 | ##################################################
88 |
89 | examples:
90 | mkdir -p $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)
91 | # $(CC) tests/infinity/read-write-send.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/read-write-send
92 | # $(CC) tests/infinity/send-performance.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/send-performance
93 | # $(CC) tests/infinity/test_read.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_read
94 | # $(CC) tests/infinity/test_multiread.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread
95 | $(CC) tests/infinity/test_multiread_multiqp.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread_multiqp
96 | 	$(CC) tests/cpp/test_Pipe.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_pipe
97 |
98 | ##################################################
99 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [pypi-image]: https://badge.fury.io/py/torch-geometric.svg
2 | [pypi-url]: https://pypi.org/project/quiver-feature/
3 |
4 |
5 |
6 |
7 |
8 | --------------------------------------------------------------------------------
9 |
10 | Quiver-Feature is an RDMA-based, high-performance **distributed feature collection component** for **training GNN models on extremely large graphs**. It is built on [Quiver](https://github.com/quiver-team/torch-quiver) and has several novel features:
11 |
12 | 1. **High Performance**: Quiver-Feature achieves **5-10x higher throughput** than the feature collection solutions in existing GNN systems such as [DGL](https://github.com/dmlc/dgl) and [PyG](https://github.com/pyg-team/pytorch_geometric).
13 |
14 | 2. **Maximum Hardware Resource Utilization Efficiency**: Quiver-Feature has minimal CPU usage and minimal memory-bus traffic, leaving most CPU and memory resources to tasks like graph sampling and model training.
15 |
16 | 3. **Easy to use**: To use Quiver-Feature, developers only need to add a few lines of code to existing PyG/DGL programs, so it is easy for PyG/DGL users to adopt and to deploy in production clusters.
17 |
18 | 
19 |
20 | --------------------------------------------------------------------------------
21 |
22 | # GPU-centric Data Placement And Zero-Copy Data Access
23 |
24 | **`GPU-centric data placement`** and **`Zero-Copy data access`** are the two keys behind Quiver-Feature's high performance.
25 |
26 | **`GPU-Centric Data Placement`:** Quiver-Feature has a unified view of memory across heterogeneous devices and machines. It classifies this memory into 4 memory spaces under a GPU-centric view: **Local HBM** (current GPU's memory), **Neighbor HBM**, **Local DRAM** (current machine's CPU memory) and **Remote DRAM** (remote CPU's memory). These 4 memory spaces are connected to each other via PCIe, NVLink, RDMA, etc.
27 |
28 | 
29 |
30 | Accessing different memory spaces from a GPU has unbalanced performance. Considering that feature data access frequency during GNN training is also unbalanced, Quiver-Feature uses an **`application-aware, GPU-centric data placement algorithm`** to take full advantage of the GPU-centric multi-level memory hierarchy.
31 |
32 | **`Zero-Copy Data Access`:** Feature collection in GNN training involves massive data movement across the network, DRAM, PCIe and NVLink, and any extra memory copy hurts end-to-end performance. Quiver-Feature uses one-sided communication methods such as `UVA` for local memory space access (Local HBM, Local DRAM, Neighbor HBM) and `RDMA READ` for remote memory space access (Remote DRAM), achieving zero copy and minimal CPU intervention. ([You can refer to this document for more RDMA details](docs/rdma_details.md))
33 |
34 |
35 | **`DistTensorPGAS`:** On top of these memory spaces, Quiver-Feature adopts the **[`PGAS`](https://en.wikipedia.org/wiki/Partitioned_global_address_space) memory model** and implements a 2-dimensional distributed tensor abstraction called `DistTensorPGAS`. Users can use a `DistTensorPGAS` just like a local torch.Tensor, e.g. querying its `shape` or performing `slicing` operations.
36 |
37 | 
38 |
39 |
40 | # Performance Benchmark
41 |
42 | As far as we know, no public GNN system directly supports RDMA for feature collection. `DGL` uses [TensorPipe](https://github.com/pytorch/tensorpipe) as its rpc backend; [TensorPipe](https://github.com/pytorch/tensorpipe) itself supports RDMA, but `DGL` has not integrated this feature. Since [TensorPipe](https://github.com/pytorch/tensorpipe) is also the [official rpc backend](https://pytorch.org/docs/stable/rpc.html#torch.distributed.rpc.init_rpc) of PyTorch, we compare the feature collection performance of `Quiver-Feature` with a `Pytorch-RPC Based Solution`.
43 |
44 | We use 2 machines connected by a 100Gbps InfiniBand network. We partition the data uniformly and start M GPU training processes on each machine (referred to as `2 Machines 2M GPUs` in the result chart below). We benchmark the feature collection performance of `Quiver-Feature` and the `Pytorch-RPC Based Solution`, and `Quiver-Feature` is about 5x faster in all settings.
45 |
46 | 
47 |
48 | # Install
49 |
50 | ## Install From Source (Recommended For Now)
51 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver).
52 |
53 | 2. Install Quiver-Feature from source
54 |
55 | $ git clone git@github.com:quiver-team/quiver-feature
56 | $ cd quiver-feature/
57 | $ pip install .
58 |
59 | ## Pip Install
60 |
61 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver).
62 |
63 | 2. Install the `Quiver-Feature` pip package.
64 |
65 | $ pip install quiver-feature
66 |
67 | We have tested Quiver-Feature with the following setup:
68 |
69 | - OS: Ubuntu 18.04, Ubuntu 20.04
70 |
71 | - CUDA: 10.2, 11.1
72 |
73 | - GPU: Nvidia P100, V100, Titan X, A6000
74 |
75 | ## Test Install
76 |
77 | You can download Quiver-Feature's examples to test installation:
78 |
79 | $ git clone git@github.com:quiver-team/quiver-feature.git
80 | $ cd quiver-feature/examples/reddit
81 | $ python3 distribute_training.py
82 |
83 | A successful run should contain the following line:
84 |
85 | `Starting Server With: xxxx`
86 |
87 |
88 | # Quick Start
89 |
90 | To use Quiver-Feature, you need to replace PyG's feature tensors with `quiver_feature.DistTensorPGAS`. This usually requires only a few changes to an existing PyG program, with the following 4 steps on each machine:
91 |
92 | - Load feature partition and meta data which belongs to the current machine.
93 |
94 | - Exchange feature partition meta data with other processes using `quiver_feature.DistHelper`.
95 |
96 | - Create a `quiver_feature.DistTensorPGAS` from local feature partition and meta data.
97 |
98 | - Pass the `quiver_feature.DistTensorPGAS` built above as a parameter to each training process for feature collection.
99 |
100 | Here is a simple example of using Quiver-Feature in a PyG program. You can check the [original script](examples/reddit/distribute_training.py) for more details.
101 |
102 | ```python
103 |
104 | def train_process(rank, dist_tensor):
105 | ...
106 | for batch_size, n_id, adjs in train_loader:
107 | ...
108 | # Using DistTensorPGAS Just Like A torch.Tensor
109 | collected_feature = dist_tensor[n_id]
110 | ...
111 |
112 | if __name__ == "__main__":
113 |
114 | # Step 1: Load Local data partition
115 | local_tensor, cached_range, local_range = load_partitioned_data(...)
116 |
117 | # Step 2: Exchange TensorPoints Information
118 | dist_helper = DistHelper(...)
119 | tensor_endpoints = dist_helper.exchange_tensor_endpoints_info()
120 |
121 |
122 | # Step 3: Build DistTensorPGAS from local feature partition
123 | dist_tensor = DistTensorPGAS(...)
124 |
125 |
126 | # Step 4: Spawn Training Processes Using DistTensor as Parameter
127 | mp.spawn(
128 | train_process,
129 | args=(..., dist_tensor, ...),
130 | nprocs=args.device_per_node,
131 | join=True
132 | )
133 | ...
134 |
135 | ```
136 |
137 | # License
138 |
139 | Quiver-Feature is licensed under the Apache License, Version 2.0
140 |
141 | # Citation
142 | If you use Quiver-Feature in your publication, please cite it using the following BibTeX entry.
143 |
144 | @Misc{Quiver-Feature,
145 | institution = {Quiver Team},
146 | title = {Quiver-Feature: A High Performance Feature Collection Component For Training GNN On Extremely Large Graphs},
147 | howpublished = {\url{https://github.com/quiver-team/quiver-feature}},
148 | year = {2022}
149 | }
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | mkdir -p build
2 | cd build
3 | Torch_DIR=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \
4 | cmake -DBUILD_TEST=1 -DCMAKE_INSTALL_PREFIX=. ..
5 | make install
6 |
--------------------------------------------------------------------------------
/csrc/include/infinity/core/Configuration.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Core - Configuration
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef CORE_CONFIGURATION_H_
10 | #define CORE_CONFIGURATION_H_
11 |
12 | #include <stdint.h>
13 |
14 | namespace infinity {
15 | namespace core {
16 |
17 | class Configuration {
18 |
19 | public:
20 |
21 | /**
22 | * Queue length settings
23 | */
24 |
25 | static const uint32_t SEND_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE
26 |
27 | static const uint32_t RECV_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE
28 |
29 | static const uint32_t SHARED_RECV_QUEUE_LENGTH = 8191; // Must be less than MAX_SRQ_WR
30 |
31 | static const uint32_t MAX_NUMBER_OF_OUTSTANDING_REQUESTS = 8191; // Must be less than (MAX_QP_WR * MAX_QP)
32 | // Since we use one single shared receive queue,
33 | // this number should be less than MAX_SRQ_WR
34 |
35 | static const uint32_t MAX_NUMBER_OF_SGE_ELEMENTS = 1; // Must be less than MAX_SGE
36 |
37 | public:
38 |
39 | /**
40 | * System settings
41 | */
42 |
43 | static const uint32_t PAGE_SIZE = 4096; // Memory regions will be page aligned by the Infinity library
44 |
45 | static const uint32_t MAX_CONNECTION_USER_DATA_SIZE = 1024; // Size of the user data which can be transmitted when establishing a connection
46 |
47 | static constexpr const char* DEFAULT_IB_DEVICE = "ib0"; // Default name of IB device
48 |
49 | };
50 |
51 | } /* namespace core */
52 | } /* namespace infinity */
53 |
54 | #endif /* CORE_CONFIGURATION_H_ */
55 |
--------------------------------------------------------------------------------
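
The queue-length constants above cap how many work requests may be in flight before a completion queue overflows. A common way to stay under `MAX_NUMBER_OF_OUTSTANDING_REQUESTS` without paying for one completion per operation is to signal only a subset of requests. Below is a minimal, hypothetical sketch (the helper name and loop are ours, not part of this repo): it posts `n` reads on a reliable (RC) queue pair and signals only the last one; RC completions are delivered in order, so waiting on the final request implies the earlier unsignaled ones have finished too.

```cpp
#include <infinity/infinity.h>

// Hypothetical helper: read `n` equally sized blocks, signaling only the last
// work request. The number of posted-but-unpolled requests must stay below
// Configuration::MAX_NUMBER_OF_OUTSTANDING_REQUESTS.
void read_n_blocks(infinity::core::Context* context,
                   infinity::queues::QueuePair* qp,
                   infinity::memory::Buffer* local,
                   infinity::memory::RegionToken* remote,
                   uint32_t blockBytes,
                   uint32_t n) {
  infinity::requests::RequestToken token(context);
  for (uint32_t i = 0; i < n; ++i) {
    infinity::queues::OperationFlags flags;
    flags.signaled = (i + 1 == n);  // request a completion only for the last WR
    uint64_t offset = uint64_t(i) * blockBytes;
    qp->read(local, offset, remote, offset, blockBytes, flags,
             flags.signaled ? &token : NULL);
  }
  token.waitUntilCompleted();  // last WR done implies all earlier ones are done
}
```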
/csrc/include/infinity/core/Context.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Core - Context
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef CORE_CONTEXT_H_
10 | #define CORE_CONTEXT_H_
11 |
12 | #include <infiniband/verbs.h>
13 | #include <stdint.h>
14 | #include <memory>
15 | #include <unordered_map>
16 |
17 | namespace infinity {
18 | namespace memory {
19 | class Region;
20 | class Buffer;
21 | class Atomic;
22 | class RegisteredMemory;
23 | }
24 | }
25 |
26 | namespace infinity {
27 | namespace queues {
28 | class QueuePair;
29 | class QueuePairFactory;
30 | }
31 | }
32 |
33 | namespace infinity {
34 | namespace requests {
35 | class RequestToken;
36 | }
37 | }
38 |
39 | namespace infinity {
40 | namespace core {
41 |
42 | typedef struct {
43 | infinity::memory::Buffer *buffer;
44 | uint32_t bytesWritten;
45 | uint32_t immediateValue;
46 | bool immediateValueValid;
47 | infinity::queues::QueuePair *queuePair;
48 | } receive_element_t;
49 |
50 | class Context {
51 |
52 | friend class infinity::memory::Region;
53 | friend class infinity::memory::Buffer;
54 | friend class infinity::memory::Atomic;
55 | friend class infinity::memory::RegisteredMemory;
56 | friend class infinity::queues::QueuePair;
57 | friend class infinity::queues::QueuePairFactory;
58 | friend class infinity::requests::RequestToken;
59 |
60 | public:
61 |
62 | /**
63 | * Constructors
64 | */
65 | Context(uint16_t device = 0, uint16_t devicePort = 1);
66 |
67 | /**
68 | * Destructor
69 | */
70 | ~Context();
71 |
72 | public:
73 |
74 | /**
75 | * Check if receive operation completed
76 | */
77 | bool receive(receive_element_t *receiveElement);
78 | bool receive(infinity::memory::Buffer **buffer, uint32_t *bytesWritten, uint32_t *immediateValue, bool *immediateValueValid, infinity::queues::QueuePair **queuePair = NULL);
79 |
80 | /**
81 | * Post a new buffer for receiving messages
82 | */
83 | void postReceiveBuffer(infinity::memory::Buffer *buffer);
84 |
85 |   /**
86 |    * Poll expected signals from the send completion queue
87 |    */
88 | int batchPollSendCompletionQueue(int poll_batch, int expected_num, ibv_wc* wc, bool force_all);
89 |
90 | public:
91 |
92 | infinity::requests::RequestToken * defaultRequestToken;
93 | infinity::memory::Atomic * defaultAtomic;
94 |
95 | protected:
96 |
97 | /**
98 | * Returns ibVerbs context
99 | */
100 | ibv_context * getInfiniBandContext();
101 |
102 | /**
103 | * Returns local device id
104 | */
105 | uint16_t getLocalDeviceId();
106 |
107 | /**
108 | * Returns device port
109 | */
110 | uint16_t getDevicePort();
111 |
112 | /**
113 | * Returns ibVerbs protection domain
114 | */
115 | ibv_pd * getProtectionDomain();
116 |
117 | protected:
118 |
119 | /**
120 | * Check if send operation completed
121 | */
122 | bool pollSendCompletionQueue();
123 |
124 | /**
125 | * Returns ibVerbs completion queue for sending
126 | */
127 | ibv_cq * getSendCompletionQueue();
128 |
129 | /**
130 | * Returns ibVerbs completion queue for receiving
131 | */
132 | ibv_cq * getReceiveCompletionQueue();
133 |
134 | /**
135 | * Returns ibVerbs shared receive queue
136 | */
137 | ibv_srq * getSharedReceiveQueue();
138 |
139 | protected:
140 |
141 | /**
142 | * IB context and protection domain
143 | */
144 | ibv_context *ibvContext;
145 | ibv_pd *ibvProtectionDomain;
146 |
147 | /**
148 | * Local device id and port
149 | */
150 | ibv_device *ibvDevice;
151 | uint16_t ibvLocalDeviceId;
152 | uint16_t ibvDevicePort;
153 |
154 | /**
155 | * IB send and receive completion queues
156 | */
157 | ibv_cq *ibvSendCompletionQueue;
158 | ibv_cq *ibvReceiveCompletionQueue;
159 | ibv_srq *ibvSharedReceiveQueue;
160 |
161 | protected:
162 |
163 | void registerQueuePair(infinity::queues::QueuePair *queuePair);
164 |   std::unordered_map<uint32_t, infinity::queues::QueuePair*> queuePairMap;
165 |
166 | };
167 |
168 | } /* namespace core */
169 | } /* namespace infinity */
170 |
171 | #endif /* CORE_CONTEXT_H_ */
172 |
--------------------------------------------------------------------------------
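
For orientation, here is a minimal sketch of the two-sided receive path that `Context` exposes, following the upstream Infinity usage pattern: post a buffer to the shared receive queue, accept a connection, then poll `receive()` until a message lands. Port number and buffer size are placeholders.

```cpp
#include <infinity/infinity.h>

int main() {
  infinity::core::Context context;  // device 0, device port 1 by default
  infinity::queues::QueuePairFactory qpFactory(&context);
  qpFactory.bindToPort(8011);

  infinity::memory::Buffer receiveBuffer(&context, 4096);
  context.postReceiveBuffer(&receiveBuffer);  // must be posted before data arrives

  infinity::queues::QueuePair* qp = qpFactory.acceptIncomingConnection();

  infinity::core::receive_element_t element;
  while (!context.receive(&element)) {
    // busy-poll the receive completion queue
  }
  // element.buffer, element.bytesWritten, and element.immediateValue are now valid

  delete qp;
  return 0;
}
```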
/csrc/include/infinity/infinity.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Infinity - A C++ RDMA library for InfiniBand
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef INFINITY_H_
10 | #define INFINITY_H_
11 |
12 | #include <infinity/core/Configuration.h>
13 | #include <infinity/core/Context.h>
14 | #include <infinity/memory/Atomic.h>
15 | #include <infinity/memory/Buffer.h>
16 | #include <infinity/memory/Region.h>
17 | #include <infinity/memory/RegionToken.h>
18 | #include <infinity/memory/RegionType.h>
19 | #include <infinity/memory/RegisteredMemory.h>
20 | #include <infinity/queues/QueuePair.h>
21 | #include <infinity/queues/QueuePairFactory.h>
22 | #include <infinity/requests/RequestToken.h>
23 | #include <infinity/utils/Address.h>
24 | #include <infinity/utils/Debug.h>
25 |
26 | #endif /* INFINITY_H_ */
27 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/Atomic.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Atomic
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "Atomic.h"
10 |
11 | #include <infinity/core/Context.h>
12 |
13 | namespace infinity {
14 | namespace memory {
15 |
16 | Atomic::Atomic(infinity::core::Context* context) {
17 |
18 | this->context = context;
19 | this->sizeInBytes = sizeof(uint64_t);
20 | this->memoryRegionType = RegionType::ATOMIC;
21 |
22 | this->value = 0;
23 | this->data = &value;
24 |
25 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), &(this->value), this->sizeInBytes,
26 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE);
27 |
28 |
29 | }
30 |
31 | uint64_t infinity::memory::Atomic::getValue() {
32 |
33 | return this->value;
34 |
35 | }
36 |
37 | void infinity::memory::Atomic::setValueNonAtomic(uint64_t value) {
38 |
39 | this->value = value;
40 |
41 | }
42 |
43 |
44 | Atomic::~Atomic() {
45 |
46 | ibv_dereg_mr(this->ibvMemoryRegion);
47 |
48 | }
49 |
50 | } /* namespace memory */
51 | } /* namespace infinity */
52 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/Atomic.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Atomic
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef MEMORY_ATOMIC_H_
10 | #define MEMORY_ATOMIC_H_
11 |
12 | #include <stdint.h>
13 |
14 | #include <infinity/core/Context.h>
15 | #include <infinity/memory/Region.h>
16 |
17 |
18 | namespace infinity {
19 | namespace memory {
20 |
21 | class Atomic : public Region {
22 |
23 | public:
24 |
25 | Atomic(infinity::core::Context *context);
26 | virtual ~Atomic();
27 |
28 | public:
29 |
30 | uint64_t getValue();
31 |
32 | void setValueNonAtomic(uint64_t value);
33 |
34 | protected:
35 |
36 | uint64_t value;
37 |
38 |
39 | };
40 |
41 | } /* namespace memory */
42 | } /* namespace infinity */
43 |
44 | #endif /* MEMORY_ATOMIC_H_ */
45 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/Buffer.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Buffer
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "Buffer.h"
10 |
11 | #include <cuda_runtime.h>
12 | #include <stdlib.h>
13 |
14 | #include <infinity/core/Configuration.h>
15 | #include <infinity/utils/Debug.h>
16 |
17 | #define MIN(a, b) (((a) < (b)) ? (a) : (b))
18 |
19 | namespace infinity {
20 | namespace memory {
21 |
22 | Buffer::Buffer(infinity::core::Context* context, uint64_t sizeInBytes) {
23 | this->context = context;
24 | this->sizeInBytes = sizeInBytes;
25 | this->memoryRegionType = RegionType::BUFFER;
26 |
27 | int res = posix_memalign(
28 | &(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes);
29 | INFINITY_ASSERT(
30 | res == 0,
31 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n");
32 |
33 | memset(this->data, 0, sizeInBytes);
34 |
35 | this->ibvMemoryRegion = ibv_reg_mr(
36 | this->context->getProtectionDomain(), this->data, this->sizeInBytes,
37 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
38 | IBV_ACCESS_REMOTE_READ);
39 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL,
40 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n");
41 |
42 | this->memoryAllocated = true;
43 | this->memoryRegistered = true;
44 | }
45 |
46 | Buffer::Buffer(infinity::core::Context* context,
47 | infinity::memory::RegisteredMemory* memory,
48 | uint64_t offset,
49 | uint64_t sizeInBytes) {
50 | this->context = context;
51 | this->sizeInBytes = sizeInBytes;
52 | this->memoryRegionType = RegionType::BUFFER;
53 |
54 |   this->data = reinterpret_cast<char*>(memory->getData()) + offset;
55 | this->ibvMemoryRegion = memory->getRegion();
56 |
57 | this->memoryAllocated = false;
58 | this->memoryRegistered = false;
59 | }
60 |
61 | Buffer::Buffer(infinity::core::Context* context,
62 | void* memory,
63 | uint64_t sizeInBytes) {
64 | this->context = context;
65 | this->sizeInBytes = sizeInBytes;
66 | this->memoryRegionType = RegionType::BUFFER;
67 |
68 | this->data = memory;
69 | this->ibvMemoryRegion = ibv_reg_mr(
70 | this->context->getProtectionDomain(), this->data, this->sizeInBytes,
71 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
72 | IBV_ACCESS_REMOTE_READ);
73 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL,
74 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n");
75 |
76 | this->memoryAllocated = false;
77 | this->memoryRegistered = true;
78 | }
79 |
80 | Buffer::Buffer(infinity::core::Context* context,
81 | uint64_t sizeInBytes,
82 | int device) {
83 | this->context = context;
84 | this->sizeInBytes = sizeInBytes;
85 | this->memoryRegionType = RegionType::BUFFER;
86 |
87 | cudaSetDevice(device);
88 |   size_t cap = sizeInBytes + infinity::core::Configuration::PAGE_SIZE;
89 | int res = cudaMalloc(&this->data, cap);
90 | INFINITY_ASSERT(
91 | res == 0,
92 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n");
93 |
94 |   void* temp = this->data;
95 |   if (uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE) {
96 |     uint64_t head =
97 |         infinity::core::Configuration::PAGE_SIZE -
98 |         uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE;
99 |     temp = static_cast<char*>(temp) + head;  // advance to the page boundary
100 |   }
101 | cudaMemset(this->data, 0, cap);
102 |
103 | this->ibvMemoryRegion =
104 | ibv_reg_mr(this->context->getProtectionDomain(), temp, this->sizeInBytes,
105 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
106 | IBV_ACCESS_REMOTE_READ);
107 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL,
108 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n");
109 |
110 | this->memoryAllocated = true;
111 | this->memoryRegistered = true;
112 | this->cuda = true;
113 | }
114 |
115 | Buffer::~Buffer() {
116 | if (this->memoryRegistered) {
117 | ibv_dereg_mr(this->ibvMemoryRegion);
118 | }
119 | if (this->memoryAllocated) {
120 | if (!this->cuda) {
121 | free(this->data);
122 | } else {
123 | cudaFree(this->data);
124 | }
125 | }
126 | }
127 |
128 | void* Buffer::getData() {
129 |   return reinterpret_cast<void*>(this->getAddress());
130 | }
131 |
132 | void Buffer::resize(uint64_t newSize, void* newData) {
133 | void* oldData = this->data;
134 |   uint64_t oldSize = this->sizeInBytes;
135 |
136 | if (newData == NULL) {
137 | newData = this->data;
138 | }
139 |
140 | if (oldData != newData) {
141 | uint64_t copySize = MIN(newSize, oldSize);
142 | memcpy(newData, oldData, copySize);
143 | }
144 |
145 | if (memoryRegistered) {
146 | ibv_dereg_mr(this->ibvMemoryRegion);
147 | this->ibvMemoryRegion =
148 | ibv_reg_mr(this->context->getProtectionDomain(), newData, newSize,
149 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE |
150 | IBV_ACCESS_REMOTE_READ);
151 | this->data = newData;
152 | this->sizeInBytes = newSize;
153 | } else {
154 |     INFINITY_ASSERT(false,
155 |                     "[INFINITY][MEMORY][BUFFER] You can only resize memory "
156 |                     "which was registered by this buffer.\n");
157 | }
158 | }
159 |
160 | } /* namespace memory */
161 | } /* namespace infinity */
162 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/Buffer.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Buffer
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef MEMORY_BUFFER_H_
10 | #define MEMORY_BUFFER_H_
11 |
12 | #include <infinity/core/Context.h>
13 | #include <infinity/memory/Region.h>
14 | #include <infinity/memory/RegisteredMemory.h>
15 |
16 | #include <stdint.h>
17 |
18 | namespace infinity {
19 | namespace memory {
20 |
21 | class Buffer : public Region {
22 | public:
23 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes);
24 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes, int device);
25 | Buffer(infinity::core::Context* context,
26 | infinity::memory::RegisteredMemory* memory,
27 | uint64_t offset,
28 | uint64_t sizeInBytes);
29 | Buffer(infinity::core::Context* context, void* memory, uint64_t sizeInBytes);
30 | ~Buffer();
31 |
32 | public:
33 | void* getData();
34 | void resize(uint64_t newSize, void* newData = NULL);
35 |
36 | protected:
37 | bool memoryRegistered;
38 | bool memoryAllocated;
39 |   bool cuda = false;  // set only by the CUDA-device constructor
40 | };
41 |
42 | } /* namespace memory */
43 | } /* namespace infinity */
44 |
45 | #endif /* MEMORY_BUFFER_H_ */
46 |
--------------------------------------------------------------------------------
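
The four constructors above cover the allocation patterns the library supports. A short sketch of the three most common ones (the function and variable names are ours, not part of this repo):

```cpp
#include <infinity/infinity.h>

void buffer_variants(infinity::core::Context* context,
                     void* existing, uint64_t bytes) {
  // Allocate, page-align, and register fresh host memory.
  infinity::memory::Buffer* fresh =
      new infinity::memory::Buffer(context, bytes);

  // Register caller-owned memory in place (no allocation, no copy); the
  // destructor deregisters it but does not free it.
  infinity::memory::Buffer* wrapped =
      new infinity::memory::Buffer(context, existing, bytes);

  // Fork-specific constructor: allocate and register CUDA device memory so
  // that peers can RDMA-read GPU-resident features directly.
  infinity::memory::Buffer* onGpu =
      new infinity::memory::Buffer(context, bytes, /*device=*/0);

  delete onGpu;
  delete wrapped;
  delete fresh;
}
```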
/csrc/include/infinity/memory/Region.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Region
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "Buffer.h"
10 |
11 | #include <infinity/memory/Region.h>
12 | #include <infinity/memory/RegionToken.h>
13 |
14 | namespace infinity {
15 | namespace memory {
16 |
17 | Region::~Region() {
18 | // To be overwritten in sub class
19 | }
20 |
21 | RegionToken* Region::createRegionToken() {
22 | return new RegionToken(this, getMemoryRegionType(), getSizeInBytes(), getAddress(), getLocalKey(), getRemoteKey());
23 | }
24 |
25 | RegionToken * Region::createRegionToken(uint64_t offset) {
26 | return new RegionToken(this, getMemoryRegionType(), getRemainingSizeInBytes(offset), getAddressWithOffset(offset), getLocalKey(), getRemoteKey());
27 | }
28 |
29 | RegionToken * Region::createRegionToken(uint64_t offset, uint64_t size) {
30 | return new RegionToken(this, getMemoryRegionType(), size, getAddressWithOffset(offset), getLocalKey(), getRemoteKey());
31 | }
32 |
33 | RegionType Region::getMemoryRegionType() {
34 | return this->memoryRegionType;
35 | }
36 |
37 | uint64_t Region::getSizeInBytes() {
38 | return this->sizeInBytes;
39 | }
40 |
41 | uint64_t Region::getRemainingSizeInBytes(uint64_t offset) {
42 | return this->sizeInBytes - offset;
43 | }
44 |
45 | uint64_t Region::getAddress() {
46 |   return reinterpret_cast<uint64_t>(this->data);
47 | }
48 |
49 | uint64_t Region::getAddressWithOffset(uint64_t offset) {
50 |   return reinterpret_cast<uint64_t>(this->data) + offset;
51 | }
52 |
53 | uint32_t Region::getLocalKey() {
54 | return this->ibvMemoryRegion->lkey;
55 | }
56 |
57 | uint32_t Region::getRemoteKey() {
58 | return this->ibvMemoryRegion->rkey;
59 | }
60 |
61 | } /* namespace memory */
62 | } /* namespace infinity */
63 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/Region.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Region
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef MEMORY_REGION_H_
10 | #define MEMORY_REGION_H_
11 |
12 | #include <stdint.h>
13 | #include <infiniband/verbs.h>
14 |
15 | #include <infinity/core/Context.h>
16 | #include <infinity/memory/RegionType.h>
17 |
18 | namespace infinity {
19 | namespace memory {
20 |
21 | class RegionToken;
22 |
23 | class Region {
24 |
25 | public:
26 |
27 | virtual ~Region();
28 |
29 | RegionToken * createRegionToken();
30 | RegionToken * createRegionToken(uint64_t offset);
31 | RegionToken * createRegionToken(uint64_t offset, uint64_t size);
32 |
33 | public:
34 |
35 | RegionType getMemoryRegionType();
36 | uint64_t getSizeInBytes();
37 | uint64_t getRemainingSizeInBytes(uint64_t offset);
38 | uint64_t getAddress();
39 | uint64_t getAddressWithOffset(uint64_t offset);
40 | uint32_t getLocalKey();
41 | uint32_t getRemoteKey();
42 |
43 | protected:
44 |
45 | infinity::core::Context* context;
46 | RegionType memoryRegionType;
47 | ibv_mr *ibvMemoryRegion;
48 |
49 | protected:
50 |
51 | void * data;
52 | uint64_t sizeInBytes;
53 |
54 | };
55 |
56 | } /* namespace memory */
57 | } /* namespace infinity */
58 |
59 | #endif /* MEMORY_REGION_H_ */
60 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/RegionToken.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Region Token
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include <infinity/memory/RegionToken.h>
10 |
11 | namespace infinity {
12 | namespace memory {
13 |
14 | RegionToken::RegionToken() :
15 | memoryRegion (NULL),
16 | memoryRegionType (UNKNOWN),
17 | sizeInBytes(0),
18 | address(0),
19 | localKey(0),
20 | remoteKey(0) {
21 |
22 | // Nothing to do here
23 |
24 | }
25 |
26 | RegionToken::RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey) :
27 | memoryRegion (memoryRegion),
28 | memoryRegionType (memoryRegionType),
29 | sizeInBytes(sizeInBytes),
30 | address(address),
31 | localKey(localKey),
32 | remoteKey(remoteKey) {
33 |
34 | // Nothing to do here
35 |
36 | }
37 |
38 | Region* RegionToken::getMemoryRegion() {
39 | return memoryRegion;
40 | }
41 |
42 | RegionType RegionToken::getMemoryRegionType() {
43 | return this->memoryRegionType;
44 | }
45 |
46 | uint64_t RegionToken::getSizeInBytes() {
47 | return this->sizeInBytes;
48 | }
49 |
50 | uint64_t RegionToken::getRemainingSizeInBytes(uint64_t offset) {
51 | return this->sizeInBytes-offset;
52 | }
53 |
54 | uint64_t RegionToken::getAddress() {
55 | return address;
56 | }
57 |
58 | uint64_t RegionToken::getAddressWithOffset(uint64_t offset) {
59 | return address + offset;
60 | }
61 |
62 | uint32_t RegionToken::getLocalKey() {
63 | return this->localKey;
64 | }
65 |
66 | uint32_t RegionToken::getRemoteKey() {
67 | return this->remoteKey;
68 | }
69 |
70 |
71 | } /* namespace memory */
72 | } /* namespace infinity */
73 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/RegionToken.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Region Token
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef MEMORY_REGIONTOKEN_H_
10 | #define MEMORY_REGIONTOKEN_H_
11 |
12 | #include <infinity/memory/Region.h>
13 | #include <infinity/memory/RegionType.h>
14 | #include <stdint.h>
15 |
16 | namespace infinity {
17 | namespace memory {
18 |
19 | class RegionToken {
20 |
21 | public:
22 |
23 | RegionToken();
24 | RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey);
25 |
26 | public:
27 |
28 | Region * getMemoryRegion();
29 | RegionType getMemoryRegionType();
30 | uint64_t getSizeInBytes();
31 | uint64_t getRemainingSizeInBytes(uint64_t offset);
32 | uint64_t getAddress();
33 | uint64_t getAddressWithOffset(uint64_t offset);
34 | uint32_t getLocalKey();
35 | uint32_t getRemoteKey();
36 |
37 | protected:
38 |
39 | Region *memoryRegion;
40 | const RegionType memoryRegionType;
41 | const uint64_t sizeInBytes;
42 | const uint64_t address;
43 | const uint32_t localKey;
44 | const uint32_t remoteKey;
45 |
46 | };
47 |
48 | } /* namespace memory */
49 | } /* namespace infinity */
50 |
51 | #endif /* MEMORY_REGIONTOKEN_H_ */
52 |
--------------------------------------------------------------------------------
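
A `RegionToken` is the serializable handle a peer needs to address a `Region` remotely. A minimal server-side sketch, following the upstream Infinity pattern of shipping the token as connection user data (buffer size and port are placeholders):

```cpp
#include <infinity/infinity.h>

// Sketch: register a buffer and hand its RegionToken to the peer during the
// connection handshake, so the client can issue one-sided reads immediately.
void serve(infinity::core::Context* context) {
  infinity::queues::QueuePairFactory qpFactory(context);
  qpFactory.bindToPort(8011);

  infinity::memory::Buffer data(context, 128 * 1024);
  infinity::memory::RegionToken* token = data.createRegionToken();

  // The token travels as connection user data
  // (must fit in Configuration::MAX_CONNECTION_USER_DATA_SIZE).
  infinity::queues::QueuePair* qp = qpFactory.acceptIncomingConnection(
      token, sizeof(infinity::memory::RegionToken));
  // ... the server now stays passive; clients read `data` via RDMA READ ...
  delete qp;
}
```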
/csrc/include/infinity/memory/RegionType.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Region Type
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef MEMORY_REGIONTYPE_H_
10 | #define MEMORY_REGIONTYPE_H_
11 |
12 | namespace infinity {
13 | namespace memory {
14 |
15 | enum RegionType {BUFFER, ATOMIC, UNKNOWN};
16 |
17 | } /* namespace memory */
18 | } /* namespace infinity */
19 |
20 | #endif /* MEMORY_REGIONTYPE_H_ */
21 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/RegisteredMemory.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Registered Memory
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "RegisteredMemory.h"
10 |
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | #include <infinity/core/Configuration.h>
15 | #include <infinity/utils/Debug.h>
16 |
17 | namespace infinity {
18 | namespace memory {
19 |
20 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, uint64_t sizeInBytes) {
21 |
22 | this->context = context;
23 | this->sizeInBytes = sizeInBytes;
24 | this->memoryAllocated = true;
25 |
26 | int res = posix_memalign(&(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes);
27 | INFINITY_ASSERT(res == 0, "[INFINITY][MEMORY][REGISTERED] Cannot allocate and align buffer.\n");
28 |
29 | memset(this->data, 0, sizeInBytes);
30 |
31 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes,
32 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
33 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n");
34 | }
35 |
36 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, void *data, uint64_t sizeInBytes) {
37 |
38 | this->context = context;
39 | this->sizeInBytes = sizeInBytes;
40 | this->memoryAllocated = false;
41 |
42 | this->data = data;
43 |
44 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes,
45 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ);
46 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n");
47 | }
48 |
49 |
50 | RegisteredMemory::~RegisteredMemory() {
51 |
52 | ibv_dereg_mr(this->ibvMemoryRegion);
53 |
54 | if(this->memoryAllocated) {
55 | free(this->data);
56 | }
57 |
58 | }
59 |
60 | void* RegisteredMemory::getData() {
61 |
62 | return this->data;
63 |
64 | }
65 |
66 | uint64_t RegisteredMemory::getSizeInBytes() {
67 |
68 | return this->sizeInBytes;
69 |
70 | }
71 |
72 | ibv_mr* RegisteredMemory::getRegion() {
73 |
74 | return this->ibvMemoryRegion;
75 |
76 | }
77 |
78 | } /* namespace memory */
79 | } /* namespace infinity */
80 |
--------------------------------------------------------------------------------
/csrc/include/infinity/memory/RegisteredMemory.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Memory - Registered Memory
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef INFINITY_MEMORY_REGISTEREDMEMORY_H_
10 | #define INFINITY_MEMORY_REGISTEREDMEMORY_H_
11 |
12 | #include <infinity/core/Context.h>
13 |
14 | namespace infinity {
15 | namespace memory {
16 |
17 | class RegisteredMemory {
18 |
19 | public:
20 |
21 | RegisteredMemory(infinity::core::Context *context, uint64_t sizeInBytes);
22 | RegisteredMemory(infinity::core::Context *context, void *data, uint64_t sizeInBytes);
23 | ~RegisteredMemory();
24 |
25 | void * getData();
26 |
27 | uint64_t getSizeInBytes();
28 |
29 | ibv_mr * getRegion();
30 |
31 |
32 | protected:
33 |
34 | infinity::core::Context* context;
35 |
36 | void *data;
37 | uint64_t sizeInBytes;
38 |
39 | ibv_mr *ibvMemoryRegion;
40 |
41 | protected:
42 |
43 | bool memoryAllocated;
44 |
45 | };
46 |
47 | } /* namespace memory */
48 | } /* namespace infinity */
49 |
50 | #endif /* INFINITY_MEMORY_REGISTEREDMEMORY_H_ */
51 |
--------------------------------------------------------------------------------
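
`RegisteredMemory` lets you pay the `ibv_reg_mr` registration cost once and then carve multiple `Buffer` views out of the same registration. A small sketch (names and sizes are ours):

```cpp
#include <infinity/infinity.h>

// Sketch: register one large region up front, then expose windows of it as
// Buffers. Buffers created this way share the underlying ibv_mr, so no
// per-buffer registration cost is paid.
void carve_views(infinity::core::Context* context) {
  infinity::memory::RegisteredMemory pool(context, 1 << 20);  // 1 MiB, registered once

  infinity::memory::Buffer first(context, &pool, /*offset=*/0, /*sizeInBytes=*/4096);
  infinity::memory::Buffer second(context, &pool, /*offset=*/4096, /*sizeInBytes=*/4096);
  // `first` and `second` alias disjoint pages of `pool`; destroying them does
  // not deregister or free the pooled memory.
}
```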
/csrc/include/infinity/queues/QueuePair.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Queues - Queue Pair
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef QUEUES_QUEUEPAIR_H_
10 | #define QUEUES_QUEUEPAIR_H_
11 |
12 | #include <infiniband/verbs.h>
13 |
14 | #include <infinity/core/Context.h>
15 | #include <infinity/memory/Atomic.h>
16 | #include <infinity/memory/Buffer.h>
17 | #include <infinity/memory/RegionToken.h>
18 | #include <infinity/requests/RequestToken.h>
19 | #include <vector>
20 |
21 | namespace infinity {
22 | namespace queues {
23 | class QueuePairFactory;
24 | }
25 | } // namespace infinity
26 |
27 | namespace infinity {
28 | namespace queues {
29 | struct SendRequestBuffer {
30 |   std::vector<ibv_sge> sges;
31 |   std::vector<ibv_send_wr> requests;
32 | SendRequestBuffer() {}
33 | SendRequestBuffer(int num) {
34 | sges.resize(num);
35 | requests.resize(num);
36 | }
37 | void resize(int num) {
38 | sges.resize(num);
39 | requests.resize(num);
40 | }
41 |   void reset() {
42 |     memset(sges.data(), 0, sizeof(ibv_sge) * sges.size());
43 |     memset(requests.data(), 0, sizeof(ibv_send_wr) * requests.size());
44 |   }
45 | };
46 | } // namespace queues
47 | } // namespace infinity
48 |
49 | namespace infinity {
50 | namespace queues {
51 |
52 | struct IbvWcBuffer {
53 |   ibv_wc* wc = nullptr;
54 |   int size_ = 0;
55 |   IbvWcBuffer() {}
56 |   IbvWcBuffer(int size) {
57 |     wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size);
58 |     size_ = size;
59 |   }
60 |   void resize(int size) {
61 |     free(wc);  // avoid leaking the previous allocation
62 |     wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size); size_ = size;
63 |   }
64 |
65 | ibv_wc* ptr() { return wc; }
66 | int size() { return size_; }
67 | };
68 | } // namespace queues
69 | } // namespace infinity
70 |
71 | namespace infinity {
72 | namespace queues {
73 |
74 | class OperationFlags {
75 | public:
76 | bool fenced;
77 | bool signaled;
78 | bool inlined;
79 |
80 |   OperationFlags() : fenced(false), signaled(false), inlined(false) {}
81 |
82 | /**
83 | * Turn the bools into a bit field.
84 | */
85 | int ibvFlags();
86 | };
87 |
88 | class QueuePair {
89 | friend class infinity::queues::QueuePairFactory;
90 |
91 | public:
92 | /**
93 | * Constructor
94 | */
95 | QueuePair(infinity::core::Context* context);
96 |
97 | /**
98 | * Destructor
99 | */
100 | ~QueuePair();
101 |
102 | protected:
103 | /**
104 | * Activation methods
105 | */
106 |
107 | void activate(uint16_t remoteDeviceId,
108 | uint32_t remoteQueuePairNumber,
109 | uint32_t remoteSequenceNumber);
110 | void setRemoteUserData(void* userData, uint32_t userDataSize);
111 |
112 | public:
113 | /**
114 | * User data received during connection setup
115 | */
116 |
117 | bool hasUserData();
118 | uint32_t getUserDataSize();
119 | void* getUserData();
120 |
121 | public:
122 | /**
123 | * Queue pair information
124 | */
125 |
126 | uint16_t getLocalDeviceId();
127 | uint32_t getQueuePairNumber();
128 | uint32_t getSequenceNumber();
129 |
130 | public:
131 | /**
132 | * Buffer operations
133 | */
134 |
135 | void send(infinity::memory::Buffer* buffer,
136 | infinity::requests::RequestToken* requestToken = NULL);
137 | void send(infinity::memory::Buffer* buffer,
138 | uint32_t sizeInBytes,
139 | infinity::requests::RequestToken* requestToken = NULL);
140 | void send(infinity::memory::Buffer* buffer,
141 | uint64_t localOffset,
142 | uint32_t sizeInBytes,
143 | OperationFlags flags,
144 | infinity::requests::RequestToken* requestToken = NULL);
145 |
146 | void write(infinity::memory::Buffer* buffer,
147 | infinity::memory::RegionToken* destination,
148 | infinity::requests::RequestToken* requestToken = NULL);
149 | void write(infinity::memory::Buffer* buffer,
150 | infinity::memory::RegionToken* destination,
151 | uint32_t sizeInBytes,
152 | infinity::requests::RequestToken* requestToken = NULL);
153 | void write(infinity::memory::Buffer* buffer,
154 | uint64_t localOffset,
155 | infinity::memory::RegionToken* destination,
156 | uint64_t remoteOffset,
157 | uint32_t sizeInBytes,
158 | OperationFlags flags,
159 | infinity::requests::RequestToken* requestToken = NULL);
160 |
161 | void read(infinity::memory::Buffer* buffer,
162 | infinity::memory::RegionToken* source,
163 | infinity::requests::RequestToken* requestToken = NULL);
164 | void read(infinity::memory::Buffer* buffer,
165 | infinity::memory::RegionToken* source,
166 | uint32_t sizeInBytes,
167 | infinity::requests::RequestToken* requestToken = NULL);
168 | void read(infinity::memory::Buffer* buffer,
169 | uint64_t localOffset,
170 | infinity::memory::RegionToken* source,
171 | uint64_t remoteOffset,
172 | uint32_t sizeInBytes,
173 | OperationFlags flags,
174 | infinity::requests::RequestToken* requestToken = NULL);
175 |
176 | public:
177 | /**
178 | * Complex buffer operations
179 | */
180 |
181 | void multiWrite(infinity::memory::Buffer** buffers,
182 | uint32_t* sizesInBytes,
183 | uint64_t* localOffsets,
184 | uint32_t numberOfElements,
185 | infinity::memory::RegionToken* destination,
186 | uint64_t remoteOffset,
187 | OperationFlags flags,
188 | infinity::requests::RequestToken* requestToken = NULL);
189 |
190 | void multiRead(uint32_t batch_size,
191 | infinity::memory::Buffer* buffer,
192 | int64_t* localOffset,
193 | infinity::memory::RegionToken* source,
194 | int64_t* remoteOffset,
195 | uint32_t sizeInBytes,
196 | OperationFlags send_flags,
197 | infinity::requests::RequestToken* requestToken,
198 | infinity::queues::SendRequestBuffer& send_buffer);
199 |
200 | void sendWithImmediate(infinity::memory::Buffer* buffer,
201 | uint64_t localOffset,
202 | uint32_t sizeInBytes,
203 | uint32_t immediateValue,
204 | OperationFlags flags,
205 | infinity::requests::RequestToken* requestToken = NULL);
206 |
207 | void writeWithImmediate(
208 | infinity::memory::Buffer* buffer,
209 | uint64_t localOffset,
210 | infinity::memory::RegionToken* destination,
211 | uint64_t remoteOffset,
212 | uint32_t sizeInBytes,
213 | uint32_t immediateValue,
214 | OperationFlags flags,
215 | infinity::requests::RequestToken* requestToken = NULL);
216 |
217 | void multiWriteWithImmediate(
218 | infinity::memory::Buffer** buffers,
219 | uint32_t* sizesInBytes,
220 | uint64_t* localOffsets,
221 | uint32_t numberOfElements,
222 | infinity::memory::RegionToken* destination,
223 | uint64_t remoteOffset,
224 | uint32_t immediateValue,
225 | OperationFlags flags,
226 | infinity::requests::RequestToken* requestToken = NULL);
227 |
228 | public:
229 | /**
230 | * Atomic value operations
231 | */
232 |
233 | void compareAndSwap(infinity::memory::RegionToken* destination,
234 | uint64_t compare,
235 | uint64_t swap,
236 | infinity::requests::RequestToken* requestToken = NULL);
237 | void compareAndSwap(infinity::memory::RegionToken* destination,
238 | infinity::memory::Atomic* previousValue,
239 | uint64_t compare,
240 | uint64_t swap,
241 | OperationFlags flags,
242 | infinity::requests::RequestToken* requestToken = NULL);
243 | void fetchAndAdd(infinity::memory::RegionToken* destination,
244 | uint64_t add,
245 | infinity::requests::RequestToken* requestToken = NULL);
246 | void fetchAndAdd(infinity::memory::RegionToken* destination,
247 | infinity::memory::Atomic* previousValue,
248 | uint64_t add,
249 | OperationFlags flags,
250 | infinity::requests::RequestToken* requestToken = NULL);
251 |
252 | protected:
253 | infinity::core::Context* const context;
254 |
255 | ibv_qp* ibvQueuePair;
256 | uint32_t sequenceNumber;
257 |
258 | void* userData;
259 | uint32_t userDataSize;
260 | };
261 |
262 | } /* namespace queues */
263 | } /* namespace infinity */
264 |
265 | #endif /* QUEUES_QUEUEPAIR_H_ */
266 |
--------------------------------------------------------------------------------
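
A minimal client-side sketch of the one-sided operations above, in the upstream Infinity style: READ pulls remote memory into a local buffer, WRITE pushes it back, and a `RequestToken` provides completion synchronization. The batched `multiRead` used by the feature-collection client follows the same pattern but posts many scatter reads in a single verbs call.

```cpp
#include <infinity/infinity.h>

void read_then_write(infinity::core::Context* context,
                     infinity::queues::QueuePair* qp,
                     infinity::memory::RegionToken* remoteToken) {
  infinity::memory::Buffer local(context, 4096);
  infinity::requests::RequestToken requestToken(context);

  qp->read(&local, remoteToken, &requestToken);   // one-sided READ
  requestToken.waitUntilCompleted();

  requestToken.reset();
  qp->write(&local, remoteToken, &requestToken);  // one-sided WRITE
  requestToken.waitUntilCompleted();
}
```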
/csrc/include/infinity/queues/QueuePairFactory.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Queues - Queue Pair Factory
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef QUEUES_QUEUEPAIRFACTORY_H_
10 | #define QUEUES_QUEUEPAIRFACTORY_H_
11 |
12 | #include <stdint.h>
13 | #include <string>
14 |
15 | #include <infinity/core/Context.h>
16 | #include <infinity/queues/QueuePair.h>
17 |
18 | namespace infinity {
19 | namespace queues {
20 |
21 | class QueuePairFactory {
22 | public:
23 |
24 | QueuePairFactory(infinity::core::Context *context);
25 | ~QueuePairFactory();
26 |
27 | /**
28 | * Bind to port for listening to incoming connections
29 | */
30 | void bindToPort(uint16_t port);
31 |
32 | /**
33 | * Accept incoming connection request (passive side)
34 | */
35 | QueuePair * acceptIncomingConnection(void *userData = NULL, uint32_t userDataSizeInBytes = 0);
36 |
37 | /**
38 | * Connect to remote machine (active side)
39 | */
40 | QueuePair * connectToRemoteHost(const char* hostAddress, uint16_t port, void *userData = NULL, uint32_t userDataSizeInBytes = 0);
41 |
42 | /**
43 | * Create loopback queue pair
44 | */
45 | QueuePair * createLoopback(void *userData = NULL, uint32_t userDataSizeInBytes = 0);
46 |
47 | protected:
48 |
49 | infinity::core::Context * context;
50 |
51 | int32_t serverSocket;
52 |
53 | };
54 |
55 | } /* namespace queues */
56 | } /* namespace infinity */
57 |
58 | #endif /* QUEUES_QUEUEPAIRFACTORY_H_ */
59 |
--------------------------------------------------------------------------------
/csrc/include/infinity/requests/RequestToken.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Requests - Request Token
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "RequestToken.h"
10 |
11 | namespace infinity {
12 | namespace requests {
13 |
14 | RequestToken::RequestToken(infinity::core::Context *context) :
15 | context(context) {
16 | this->success.store(false);
17 | this->completed.store(false);
18 | this->region = NULL;
19 | this->userData = NULL;
20 | this->userDataValid = false;
21 | this->userDataSize = 0;
22 | this->immediateValue = 0;
23 | this->immediateValueValid = false;
24 | }
25 |
26 | void RequestToken::setCompleted(bool success) {
27 | this->success.store(success);
28 | this->completed.store(true);
29 | }
30 |
31 | bool RequestToken::checkIfCompleted() {
32 | if (this->completed.load()) {
33 | return true;
34 | } else {
35 | this->context->pollSendCompletionQueue();
36 | return this->completed.load();
37 | }
38 | }
39 |
40 | void RequestToken::waitUntilCompleted() {
41 | while (!this->completed.load()) {
42 | this->context->pollSendCompletionQueue();
43 | }
44 | }
45 |
46 | bool RequestToken::wasSuccessful() {
47 | return this->success.load();
48 | }
49 |
50 | void RequestToken::reset() {
51 | this->success.store(false);
52 | this->completed.store(false);
53 | this->region = NULL;
54 | this->userData = NULL;
55 | this->userDataValid = false;
56 | this->userDataSize = 0;
57 | this->immediateValue = 0;
58 | this->immediateValueValid = false;
59 | }
60 |
61 | void RequestToken::setRegion(infinity::memory::Region* region) {
62 | this->region = region;
63 | }
64 |
65 | infinity::memory::Region* RequestToken::getRegion() {
66 | return this->region;
67 | }
68 |
69 | void RequestToken::setUserData(void* userData, uint32_t userDataSize) {
70 | this->userData = userData;
71 | this->userDataSize = userDataSize;
72 | this->userDataValid = true;
73 | }
74 |
75 | void* RequestToken::getUserData() {
76 | return this->userData;
77 | }
78 |
79 | bool RequestToken::hasUserData() {
80 | return this->userDataValid;
81 | }
82 |
83 | uint32_t RequestToken::getUserDataSize() {
84 | return this->userDataSize;
85 | }
86 |
87 | void RequestToken::setImmediateValue(uint32_t immediateValue) {
88 | this->immediateValue = immediateValue;
89 | this->immediateValueValid = true;
90 | }
91 |
92 | uint32_t RequestToken::getImmediateValue() {
93 | return this->immediateValue;
94 | }
95 |
96 | bool RequestToken::hasImmediateValue() {
97 | return this->immediateValueValid;
98 | }
99 |
100 | } /* namespace requests */
101 | } /* namespace infinity */
102 |
--------------------------------------------------------------------------------
/csrc/include/infinity/requests/RequestToken.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Requests - Request Token
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef REQUESTS_REQUESTTOKEN_H_
10 | #define REQUESTS_REQUESTTOKEN_H_
11 |
12 | #include <stdint.h>
13 | #include <atomic>
14 |
15 | #include <infinity/core/Context.h>
16 | #include <infinity/memory/Region.h>
17 |
18 | namespace infinity {
19 | namespace requests {
20 |
21 | class RequestToken {
22 |
23 | public:
24 |
25 | RequestToken(infinity::core::Context *context);
26 |
27 | void reset();
28 |
29 | void setRegion(infinity::memory::Region * region);
30 | infinity::memory::Region * getRegion();
31 |
32 | void setCompleted(bool success);
33 | bool wasSuccessful();
34 |
35 | bool checkIfCompleted();
36 | void waitUntilCompleted();
37 |
38 | void setImmediateValue(uint32_t immediateValue);
39 | bool hasImmediateValue();
40 | uint32_t getImmediateValue();
41 |
42 | void setUserData(void* userData, uint32_t userDataSize);
43 | bool hasUserData();
44 | void* getUserData();
45 | uint32_t getUserDataSize();
46 |
47 | protected:
48 |
49 | infinity::core::Context * const context;
50 | infinity::memory::Region * region;
51 |
52 |   std::atomic<bool> completed;
53 |   std::atomic<bool> success;
54 |
55 | void *userData;
56 | uint32_t userDataSize;
57 | bool userDataValid;
58 |
59 | uint32_t immediateValue;
60 | bool immediateValueValid;
61 |
62 | };
63 |
64 | } /* namespace requests */
65 | } /* namespace infinity */
66 |
67 | #endif /* REQUESTS_REQUESTTOKEN_H_ */
68 |
--------------------------------------------------------------------------------
/csrc/include/infinity/utils/Address.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Utils - Address
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include "Address.h"
10 |
11 | #include <arpa/inet.h>
12 | #include <ifaddrs.h>
13 | #include <stdio.h>
14 | #include <stdlib.h>
15 | #include <string.h>
16 | #include <sys/socket.h>
17 |
18 | #include <infinity/utils/Debug.h>
19 |
20 | namespace infinity {
21 | namespace utils {
22 |
23 | char* Address::getIpAddressOfInterface(const char* interfaceName) {
24 |
25 | struct ifaddrs *ifAddr;
26 | struct ifaddrs *ifa;
27 | char *ipAddress = (char*) calloc(16, sizeof(char));
28 |
29 | int returnValue = getifaddrs(&ifAddr);
30 | INFINITY_ASSERT(returnValue != -1, "[INFINITY][UTILS][ADDRESS] Cannot read interface list.\n");
31 |
32 | for (ifa = ifAddr; ifa != NULL; ifa = ifa->ifa_next) {
33 | if (ifa->ifa_addr == NULL) {
34 | continue;
35 | }
36 | if ((ifa->ifa_addr->sa_family == AF_INET) && (strcasecmp(interfaceName, ifa->ifa_name) == 0)) {
37 | sprintf(ipAddress, "%s", inet_ntoa(((struct sockaddr_in *) ifa->ifa_addr)->sin_addr));
38 | break;
39 | }
40 | }
41 | INFINITY_ASSERT(ifa != NULL, "[INFINITY][UTILS][ADDRESS] Cannot find interface named %s.\n", interfaceName);
42 |
43 | freeifaddrs(ifAddr);
44 |
45 | return ipAddress;
46 |
47 | }
48 |
49 | uint32_t Address::getIpAddressAsUint32(const char* ipAddress) {
50 |
51 | uint32_t ipAddressNumbers[4];
52 | 	sscanf(ipAddress, "%u.%u.%u.%u", &ipAddressNumbers[3], &ipAddressNumbers[2], &ipAddressNumbers[1], &ipAddressNumbers[0]);
53 | uint32_t ipAddressNumber(ipAddressNumbers[0] | ipAddressNumbers[1] << 8 | ipAddressNumbers[2] << 16 | ipAddressNumbers[3] << 24);
54 | return ipAddressNumber;
55 | }
56 |
57 | } /* namespace utils */
58 | } /* namespace infinity */
59 |
--------------------------------------------------------------------------------
/csrc/include/infinity/utils/Address.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Utils - Address
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef UTILS_ADDRESS_H_
10 | #define UTILS_ADDRESS_H_
11 |
12 | #include <stdint.h>
13 |
14 | namespace infinity {
15 | namespace utils {
16 |
17 | class Address {
18 |
19 | public:
20 |
21 | static char * getIpAddressOfInterface(const char *interfaceName);
22 | static uint32_t getIpAddressAsUint32(const char *ipAddress);
23 |
24 | };
25 |
26 | } /* namespace utils */
27 | } /* namespace infinity */
28 |
29 | #endif /* UTILS_ADDRESS_H_ */
30 |
--------------------------------------------------------------------------------
/csrc/include/infinity/utils/Debug.h:
--------------------------------------------------------------------------------
1 | /**
2 | * Utils - Debug
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #ifndef UTILS_DEBUG_H_
10 | #define UTILS_DEBUG_H_
11 |
12 | #include <stdio.h>
13 | #include <stdlib.h>
14 |
15 | #ifdef INFINITY_DEBUG_ON
16 | #define INFINITY_DEBUG(X, ...) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout);}
17 | #else
18 | #define INFINITY_DEBUG(X, ...) {}
19 | #endif
20 |
21 | #ifdef INFINITY_ASSERT_ON
22 | #define INFINITY_ASSERT(B, X, ...) {if(!(B)) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout); exit(-1);}}
23 | #else
24 | #define INFINITY_ASSERT(B, X, ...) {}
25 | #endif
26 |
27 | #endif /* UTILS_DEBUG_H_ */
28 |
--------------------------------------------------------------------------------
/csrc/include/qvf/com_endpoint.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <string>
3 | namespace qvf {
4 | class ComEndPoint {
5 | private:
6 | std::string ip_address;
7 | int port;
8 | int rank;
9 |
10 | public:
11 | ComEndPoint() {}
12 |
13 | ComEndPoint(int rank, std::string ip_address, int port)
14 | : rank(rank), ip_address(ip_address), port(port) {}
15 |
16 | ComEndPoint& operator=(const ComEndPoint& other) {
17 | this->rank = other.rank;
18 | this->ip_address = other.ip_address;
19 | this->port = other.port;
20 | return *this;
21 | }
22 |
23 | void set_data(int rank, std::string ip_address, int port) {
24 | this->rank = rank;
25 | this->ip_address = ip_address;
26 | this->port = port;
27 | }
28 |
29 | std::string get_address(void) { return ip_address; }
30 | int get_port(void) { return port; }
31 | int get_rank(void) { return rank; }
32 | };
33 | } // namespace qvf
34 |
--------------------------------------------------------------------------------
/csrc/include/qvf/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | #define QUIVER_FEATURE_ASSERT(B, X, ...) \
6 | { \
7 | if (!(B)) { \
8 | fprintf(stdout, X, ##__VA_ARGS__); \
9 | fflush(stdout); \
10 | exit(-1); \
11 | } \
12 | }
13 |
--------------------------------------------------------------------------------
/csrc/include/qvf/dist_tensor_client.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <deque>
4 | #include <string>
5 | #include <vector>
6 |
7 | #include <infinity/core/Context.h>
8 | #include <infinity/memory/Atomic.h>
9 | #include <infinity/memory/Buffer.h>
10 | #include <infinity/memory/Region.h>
11 | #include <infinity/memory/RegionToken.h>
12 | #include <infinity/queues/QueuePair.h>
13 | #include <infinity/queues/QueuePairFactory.h>
14 | #include <infinity/requests/RequestToken.h>
15 | #include <qvf/com_endpoint.h>
16 | #include <qvf/common.h>
17 | #include <qvf/pipe.h>
18 | #include <torch/extension.h>
19 |
20 | namespace qvf {
21 | struct CollectionTask {
22 | public:
23 | void* base_address;
24 | int collect_from;
25 | int64_t* local_offsets;
26 | int64_t* remote_offsets;
27 | int64_t size;
28 |
29 | public:
30 | CollectionTask() {}
31 | CollectionTask(void* base_address,
32 | int64_t* local_offsets,
33 | int64_t* remote_offsets,
34 | int64_t size,
35 | int collect_from)
36 | : base_address(base_address),
37 | local_offsets(local_offsets),
38 | remote_offsets(remote_offsets),
39 | size(size),
40 | collect_from(collect_from) {}
41 | };
42 | class DistTensorClient {
43 | public:
44 |   std::vector<Pipe*> pipes;
45 |   std::vector<ComEndPoint> com_endpoints;
46 |
47 | // About communication
48 | PipeParam pipe_param;
49 | int server_size;
50 | int server_rank;
51 |
52 | // About IB
53 | infinity::core::Context* context;
54 | infinity::queues::QueuePairFactory* qpFactory;
55 |
56 | infinity::memory::Buffer* tensor_buffer;
57 | infinity::memory::RegionToken* tensor_token;
58 |
59 | // about feature client
60 |   std::deque<CollectionTask> task_queue;
61 |
62 | public:
63 | DistTensorClient(int server_rank,
64 |                    std::vector<ComEndPoint> com_endpoints,
65 | PipeParam pipe_param) {
66 | this->server_rank = server_rank;
67 | this->com_endpoints = com_endpoints;
68 | this->pipe_param = pipe_param;
69 | server_size = com_endpoints.size();
70 | init_connection();
71 | }
72 |
73 | void init_connection() {
74 | context = new infinity::core::Context();
75 | qpFactory = new infinity::queues::QueuePairFactory(context);
76 | pipes.resize(server_size);
77 | for (int idx = 0; idx < server_size; idx++) {
78 | if (com_endpoints[idx].get_rank() == server_rank) {
79 | continue;
80 | }
81 | pipes[com_endpoints[idx].get_rank()] =
82 | new Pipe(context, qpFactory, com_endpoints[idx], pipe_param);
83 | pipes[com_endpoints[idx].get_rank()]->connect();
84 | }
85 | }
86 |
87 | torch::Tensor create_registered_float32_tensor(
88 |       std::vector<int64_t> tensor_shape) {
89 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2,
90 | "Only support 2-dimensional tensor");
91 | auto tensor_option = torch::TensorOptions().dtype(torch::kFloat32);
92 | uint64_t size_in_bytes = 4;
93 | for (int index = 0; index < tensor_shape.size(); index++) {
94 | size_in_bytes *= tensor_shape[index];
95 | }
96 | tensor_buffer = new infinity::memory::Buffer(context, size_in_bytes);
97 | tensor_token = tensor_buffer->createRegionToken();
98 | return torch::from_blob(tensor_buffer->getData(),
99 | {tensor_shape[0], tensor_shape[1]}, tensor_option);
100 | }
101 |
102 | void register_float_tensor(torch::Tensor& float_tensor) {
103 | QUIVER_FEATURE_ASSERT(
104 | float_tensor.dim() == 2,
105 | "Only support 2-dimensional tensor, But got %d-dimensional tensor\n",
106 | float_tensor.dim());
107 |
108 | uint64_t size_in_bytes = float_tensor.element_size() * float_tensor.numel();
109 |
110 | tensor_buffer = new infinity::memory::Buffer(
111 | context, float_tensor.data_ptr(), size_in_bytes);
112 |
113 | tensor_token = tensor_buffer->createRegionToken();
114 | }
115 |
116 | torch::Tensor create_registered_float32_tensor_cuda(
117 |       std::vector<int64_t> tensor_shape,
118 | int device) {
119 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2,
120 | "Only support 2-dimensional tensor");
121 | uint64_t size_in_bytes = 4;
122 | for (int index = 0; index < tensor_shape.size(); index++) {
123 | size_in_bytes *= tensor_shape[index];
124 | }
125 | tensor_buffer =
126 | new infinity::memory::Buffer(context, size_in_bytes, device);
127 | tensor_token = tensor_buffer->createRegionToken();
128 | auto tensor_option = torch::TensorOptions()
129 | .dtype(torch::kFloat32)
130 | .device(torch::kCUDA, device);
131 | return torch::from_blob(tensor_buffer->getData(),
132 | {tensor_shape[0], tensor_shape[1]}, tensor_option);
133 | }
134 |
135 | void sync_read(int server_rank,
136 | torch::Tensor& res_tensor,
137 | torch::Tensor& local_offsets,
138 | torch::Tensor& remote_offsets) {
139 | QUIVER_FEATURE_ASSERT(
140 |         reinterpret_cast<uint64_t>(res_tensor.data_ptr()) ==
141 | tensor_buffer->getAddress(),
142 | "Result Tensor is not created from registered buffer");
143 |
144 | pipes[server_rank]->read(tensor_buffer, local_offsets, remote_offsets,
145 | res_tensor.size(1) * res_tensor.element_size());
146 | }
147 |
148 | void collect_inner(CollectionTask collection_task) {
149 | task_queue.push_back(collection_task);
150 | }
151 |
152 | void start_feature_client() {}
153 | };
154 | } // namespace qvf
155 |
--------------------------------------------------------------------------------
/csrc/include/qvf/dist_tensor_server.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include <torch/extension.h>
5 |
6 | #include <infinity/core/Context.h>
7 | #include <infinity/memory/Buffer.h>
8 | #include <infinity/memory/RegionToken.h>
9 | #include <infinity/queues/QueuePair.h>
10 | #include <infinity/queues/QueuePairFactory.h>
11 | #include <infinity/requests/RequestToken.h>
12 |
13 | #include <chrono>
14 | #include <iostream>
15 | #include <string>
16 | #include <thread>
17 |
18 | #include <qvf/com_endpoint.h>
19 | #include <qvf/common.h>
20 |
21 | namespace qvf {
22 | class DistTensorServer {
23 | private:
24 | int port;
25 | int world_size;
26 | int qp_per_pipe;
27 |
28 | infinity::core::Context* context;
29 | infinity::queues::QueuePairFactory* qpFactory;
30 | infinity::memory::Buffer* feature_buffer;
31 | infinity::memory::RegionToken* bufferToken;
32 |
33 | std::thread server_thread;
34 |
35 | public:
36 | DistTensorServer(int port, int world_size, int qp_per_pipe)
37 | : port(port), world_size(world_size), qp_per_pipe(qp_per_pipe) {
38 | context = new infinity::core::Context();
39 | qpFactory = new infinity::queues::QueuePairFactory(context);
40 | qpFactory->bindToPort(port);
41 | }
42 |
43 | void join() { server_thread.join(); }
44 |
45 | void serve(void* data, int64_t size_in_bytes) {
46 | feature_buffer =
47 | new infinity::memory::Buffer(context, data, (uint64_t)size_in_bytes);
48 | bufferToken = feature_buffer->createRegionToken();
49 | server_thread =
50 | std::thread(run, qpFactory, bufferToken, qp_per_pipe * world_size);
51 | }
52 |
53 | void serve_tensor(torch::Tensor& data) {
54 | std::cout << "Registering Buffer, Please Wait..." << std::endl;
55 | uint64_t size_in_bytes = data.numel() * data.element_size();
56 |
57 | feature_buffer = new infinity::memory::Buffer(
58 | context, data.data_ptr(), size_in_bytes);
59 | bufferToken = feature_buffer->createRegionToken();
60 | server_thread = std::thread(run, qpFactory, bufferToken,
61 | qp_per_pipe * (world_size - 1));
62 | }
63 |
64 | static void run(infinity::queues::QueuePairFactory* qpFactory,
65 | infinity::memory::RegionToken* bufferToken,
66 | int total_qp_num) {
67 | std::cout << "Buffer Registeration Done! Ready To Receive Connections, "
68 | "Start Your Clients Now"
69 | << std::endl;
70 | for (int qp_index = 0; qp_index < total_qp_num; qp_index++) {
71 | qpFactory->acceptIncomingConnection(
72 | bufferToken, sizeof(infinity::memory::RegionToken));
73 | }
74 |
75 | while (1) {
76 | std::this_thread::sleep_for(std::chrono::seconds(10)); // 10s
77 | }
78 | }
79 | };
80 |
81 | } // namespace qvf
82 |
--------------------------------------------------------------------------------
/csrc/include/qvf/pipe.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <stdio.h>
4 | #include <iostream>
5 | #include <vector>
6 |
7 | #include <infinity/core/Context.h>
8 | #include <infinity/memory/Buffer.h>
9 | #include <infinity/memory/RegionToken.h>
10 | #include <infinity/queues/QueuePair.h>
11 | #include <infinity/queues/QueuePairFactory.h>
12 | #include <infinity/requests/RequestToken.h>
13 |
14 | #include <qvf/com_endpoint.h>
15 | #include <qvf/common.h>
16 | #include <torch/extension.h>
17 |
18 | namespace qvf {
19 |
20 | // Pipe are used for single side RDMA read to remote data servers
21 | struct PipeParam {
22 | int qp_num;
23 | int ctx_poll_batch;
24 | int tx_depth;
25 | int post_list_size;
26 | PipeParam() {}
27 | PipeParam(int qp_num,
28 | int ctx_poll_batch,
29 | int tx_depth,
30 | int post_list_size) {
31 | this->qp_num = qp_num;
32 | this->ctx_poll_batch = ctx_poll_batch;
33 | this->tx_depth = tx_depth;
34 | this->post_list_size = post_list_size;
35 | }
36 | void set_params(int qp_num,
37 | int ctx_poll_batch,
38 | int tx_depth,
39 | int post_list_size) {
40 | this->qp_num = qp_num;
41 | this->ctx_poll_batch = ctx_poll_batch;
42 | this->tx_depth = tx_depth;
43 | this->post_list_size = post_list_size;
44 | }
45 |   void set_param_vec(std::vector<int> param_vec) {
46 | qp_num = param_vec[0];
47 | ctx_poll_batch = param_vec[1];
48 | tx_depth = param_vec[2];
49 | post_list_size = param_vec[3];
50 | }
51 |
52 |   std::vector<int> get_param_vec() {
53 |     std::vector<int> params;
54 | params.push_back(qp_num);
55 | params.push_back(ctx_poll_batch);
56 | params.push_back(tx_depth);
57 | params.push_back(post_list_size);
58 | return params;
59 | }
60 |
61 | PipeParam& operator=(const PipeParam& pipe_param) {
62 | set_params(pipe_param.qp_num, pipe_param.ctx_poll_batch,
63 | pipe_param.tx_depth, pipe_param.post_list_size);
64 | return *this;
65 | }
66 | };
67 |
68 | class Pipe {
69 | private:
70 | ComEndPoint remote_end;
71 | PipeParam pipe_param;
72 |   std::vector<infinity::memory::RegionToken*> remote_buffer_tokens;
73 |   std::vector<infinity::queues::QueuePair*> qps;
74 |   std::vector<infinity::requests::RequestToken*> requests;
75 | infinity::queues::SendRequestBuffer send_buffer;
76 | infinity::core::Context* context;
77 | infinity::queues::QueuePairFactory* qpFactory;
78 | infinity::queues::IbvWcBuffer wc_buffer;
79 | int requests_size;
80 | bool connected;
81 |
82 | public:
83 | Pipe() : connected(false) {}
84 | Pipe(infinity::core::Context* context,
85 | infinity::queues::QueuePairFactory* qpFactory,
86 | ComEndPoint com_endpoint,
87 | PipeParam pipe_param) {
88 | this->context = context;
89 | this->qpFactory = qpFactory;
90 | this->remote_end = com_endpoint;
91 | this->pipe_param = pipe_param;
92 | connected = false;
93 | }
94 |
95 | Pipe& operator=(const Pipe& pipe) {
96 | if (pipe.connected) {
97 | fprintf(stderr, "Pipe can only be assigned before connect");
98 | }
99 | this->remote_end = pipe.remote_end;
100 | this->pipe_param = pipe.pipe_param;
101 | this->context = pipe.context;
102 | this->qpFactory = pipe.qpFactory;
103 | connected = false;
104 | return *this;
105 | }
106 |
107 | void connect() {
108 | qps.resize(pipe_param.qp_num);
109 | remote_buffer_tokens.resize(pipe_param.qp_num);
110 | requests_size =
111 | pipe_param.tx_depth / pipe_param.post_list_size;
112 | requests.resize(requests_size);
113 | send_buffer.resize(pipe_param.post_list_size);
114 | wc_buffer.resize(pipe_param.ctx_poll_batch);
115 | for (int qp_index = 0; qp_index < pipe_param.qp_num; qp_index++) {
116 | qps[qp_index] = qpFactory->connectToRemoteHost(
117 | remote_end.get_address().c_str(), remote_end.get_port());
118 | remote_buffer_tokens[qp_index] =
119 | (infinity::memory::RegionToken*)qps[qp_index]->getUserData();
120 | }
121 |
122 | for (int request_index = 0; request_index < requests.size();
123 | request_index++) {
124 | requests[request_index] = new infinity::requests::RequestToken(context);
125 | }
126 | connected = true;
127 | }
128 |
129 | void read(infinity::memory::Buffer* local_buffer,
130 |             std::vector<int64_t> local_offsets,
131 |             std::vector<int64_t> remote_offsets,
132 | uint64_t stride) {
133 | uint64_t post_list_cnt =
134 | (local_offsets.size() + pipe_param.post_list_size - 1) /
135 | pipe_param.post_list_size;
136 |
137 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check
138 | // Local_Offset_Size "<< remote_offsets.size()<multiRead(
150 | batch_read_size, local_buffer,
151 | &local_offsets[post_index * pipe_param.post_list_size],
152 | remote_buffer_tokens[post_index % pipe_param.qp_num],
153 | &remote_offsets[post_index * pipe_param.post_list_size], stride,
154 | infinity::queues::OperationFlags(), requests[epoch_scnt],
155 | send_buffer);
156 | epoch_scnt += 1;
157 |
158 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) {
159 | context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch,
160 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1);
161 | epoch_scnt = 0;
162 | }
163 | }
164 | }
165 |
166 | void read(infinity::memory::Buffer* local_buffer,
167 | torch::Tensor& local_offsets_tensor,
168 | torch::Tensor& remote_offsets_tensor,
169 | uint64_t stride) {
170 | QUIVER_FEATURE_ASSERT(local_offsets_tensor.dim() == 1,
171 | "local_offsets should be 1-dimensional tensor");
172 | QUIVER_FEATURE_ASSERT(remote_offsets_tensor.dim() == 1,
173 | "local_offsets should be 1-dimensional tensor");
174 | QUIVER_FEATURE_ASSERT(
175 | remote_offsets_tensor.size(0) == local_offsets_tensor.size(0),
176 | "local_offsets and remote_offsets should have the same length");
177 |
178 |     int64_t* local_offsets = local_offsets_tensor.data_ptr<int64_t>();
179 |     int64_t* remote_offsets = remote_offsets_tensor.data_ptr<int64_t>();
180 |
181 | uint64_t post_list_cnt =
182 | (local_offsets_tensor.size(0) + pipe_param.post_list_size - 1) /
183 | pipe_param.post_list_size;
184 |
185 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check
186 | // Local_Offset_Size "<< remote_offsets.size()<multiRead(
203 | batch_read_size, local_buffer,
204 | &local_offsets[post_index * pipe_param.post_list_size],
205 | remote_buffer_tokens[post_index % pipe_param.qp_num],
206 | &remote_offsets[post_index * pipe_param.post_list_size], stride,
207 | infinity::queues::OperationFlags(), requests[epoch_scnt],
208 | send_buffer);
209 | epoch_scnt += 1;
210 |
211 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) {
212 | int cq_num = context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch,
213 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1);
214 | epoch_scnt -= cq_num;
215 | }
216 | }
217 | }
218 | };
219 | } // namespace qvf
220 |
--------------------------------------------------------------------------------
/csrc/include/qvf/qvf.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <qvf/com_endpoint.h>
3 | #include <qvf/common.h>
4 | #include <qvf/dist_tensor_client.h>
5 | #include <qvf/dist_tensor_server.h>
6 | #include <qvf/pipe.h>
7 | #include <qvf/range.h>
8 | #include <qvf/tensor_endpoint.h>
9 |
--------------------------------------------------------------------------------
/csrc/include/qvf/range.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <stdint.h>
4 | namespace qvf {
5 | class Range {
6 | private:
7 | int64_t start;
8 | int64_t end;
9 |
10 | public:
11 | Range() {}
12 | Range(int64_t start, int64_t end) : start(start), end(end) {}
13 | void set_params(int64_t start, int64_t end) {
14 | this->start = start;
15 | this->end = end;
16 | }
17 | Range& operator=(const Range& other) {
18 | this->start = other.start;
19 | this->end = other.end;
20 | return *this;
21 | }
22 | int64_t range_start() { return start; }
23 | int64_t range_end() { return end; }
24 | };
25 | } // namespace qvf
26 |
--------------------------------------------------------------------------------
/csrc/include/qvf/shared_loader.h:
--------------------------------------------------------------------------------
1 | //
2 | // Created by joker on 2022/5/15.
3 | //
4 |
5 | #ifndef QUIVER_FEATURE_SHAREDLOADER_H
6 | #define QUIVER_FEATURE_SHAREDLOADER_H
7 |
8 | #include <mutex>
9 | #include <string>
10 | #include <caffe2/serialize/inline_container.h>
11 | #include <torch/extension.h>
12 |
13 | namespace qvf {
14 |
15 | using caffe2::serialize::PyTorchStreamReader;
16 | using caffe2::serialize::ReadAdapterInterface;
17 |
18 | template <typename Tag, typename Tag::type M>
19 | struct Rob {
20 | friend typename Tag::type get(Tag) { return M; }
21 | };
22 |
23 | #define ROB_FIELD_FROM_READER(FieldType, FieldName) \
24 | struct PyTorchStreamReader_##FieldName { \
25 | typedef FieldType PyTorchStreamReader::*type; \
26 | friend type get(PyTorchStreamReader_##FieldName); \
27 | }; \
28 |   template struct Rob<PyTorchStreamReader_##FieldName,  \
29 |                       &PyTorchStreamReader::FieldName>
30 |
31 | ROB_FIELD_FROM_READER(std::string, archive_name_plus_slash_);
32 | ROB_FIELD_FROM_READER(std::unique_ptr<mz_zip_archive>, ar_);
33 | ROB_FIELD_FROM_READER(std::mutex, reader_lock_);
34 |
35 | struct TORCH_API SharedLoader {
36 | PyTorchStreamReader reader;
37 | explicit SharedLoader(const std::string& file_name) : reader(file_name) {}
38 | explicit SharedLoader(std::istream* in) : reader(in) {}
39 |   explicit SharedLoader(std::shared_ptr<ReadAdapterInterface> in)
40 | : reader(in) {}
41 | void valid(const char* what, const char* info = "");
42 |   std::tuple<at::DataPtr, size_t> getRecord(const std::string& name);
43 | size_t getRecordID(const std::string& name);
44 | size_t getRecordOffset(const std::string& name) {
45 | return reader.getRecordOffset(name);
46 | }
47 | bool hasRecord(const std::string& name) { return reader.hasRecord(name); }
48 |   std::vector<std::string> getAllRecords() { return reader.getAllRecords(); }
49 | };
50 |
51 | } // namespace qvf
52 | #endif // QUIVER_FEATURE_SHAREDLOADER_H
53 |
--------------------------------------------------------------------------------
/csrc/include/qvf/tensor_endpoint.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <qvf/com_endpoint.h>
3 | #include <qvf/range.h>
4 | namespace qvf {
5 | class TensorEndPoint {
6 | public:
7 | ComEndPoint com_endpoint;
8 | Range range;
9 |
10 | public:
11 | TensorEndPoint(ComEndPoint com_endpoint, Range range) {
12 | this->com_endpoint = com_endpoint;
13 | this->range = range;
14 | }
15 |
16 | TensorEndPoint(int rank,
17 | std::string ip,
18 | int port,
19 | int64_t range_start,
20 | int64_t range_end) {
21 | this->com_endpoint = ComEndPoint(rank, ip, port);
22 | this->range = Range(range_start, range_end);
23 | }
24 |
25 | TensorEndPoint(std::string ip, int port, int rank, Range range) {
26 | this->com_endpoint = ComEndPoint(rank, ip, port);
27 | this->range = range;
28 | }
29 |
30 | TensorEndPoint& operator=(const TensorEndPoint& other) {
31 | this->com_endpoint = other.com_endpoint;
32 | this->range = other.range;
33 | return *this;
34 | }
35 | };
36 | } // namespace qvf
37 |
--------------------------------------------------------------------------------
/csrc/src/module.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include <pybind11/pybind11.h>
4 | #include <pybind11/stl.h>
5 | #include <torch/extension.h>
6 |
7 | void register_TensorEndPoint(pybind11::module& m);
8 | void register_DistTensorServer(pybind11::module& m);
9 | void register_PipeParam(pybind11::module& m);
10 | void register_DistTensorClient(pybind11::module& m);
11 | void register_ComEndPoint(pybind11::module& m);
12 | void register_SharedStorageReader(pybind11::module& m);
13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
14 | register_TensorEndPoint(m);
15 | register_DistTensorServer(m);
16 | register_PipeParam(m);
17 | register_DistTensorClient(m);
18 | register_ComEndPoint(m);
19 | register_SharedStorageReader(m);
20 | }
21 |
--------------------------------------------------------------------------------
/csrc/src/register.cpp:
--------------------------------------------------------------------------------
1 | #include <pybind11/pybind11.h>
2 | #include <pybind11/stl.h>
3 | #include <torch/csrc/Dtype.h>
4 | #include <torch/csrc/Exceptions.h>
5 | #include <torch/csrc/utils/object_ptr.h>
6 | #include <torch/extension.h>
7 | #include <qvf/qvf.h>
8 |
9 | void register_TensorEndPoint(pybind11::module& m) {
10 | // define TensorEndPoint
11 |   py::class_<qvf::TensorEndPoint>(m, "TensorEndPoint")
12 |       .def(py::init<int, std::string, int, int64_t, int64_t>());
13 | }
14 |
15 | void register_ComEndPoint(pybind11::module& m) {
16 | // define ComEndPoint
17 |   py::class_<qvf::ComEndPoint>(m, "ComEndPoint")
18 |       .def(py::init<int, std::string, int>())
19 |       .def(py::init<>())
20 |       .def("rank", &qvf::ComEndPoint::get_rank, py::call_guard<py::gil_scoped_release>())
21 |       .def("address", &qvf::ComEndPoint::get_address, py::call_guard<py::gil_scoped_release>())
22 |       .def("port", &qvf::ComEndPoint::get_port, py::call_guard<py::gil_scoped_release>());
23 | }
24 |
25 | void register_DistTensorServer(pybind11::module& m) {
26 | // define TensorEndPoint
27 |   py::class_<qvf::DistTensorServer>(m, "DistTensorServer")
28 |       .def(py::init<int, int, int>())
29 |       .def("serve_tensor", &qvf::DistTensorServer::serve_tensor,
30 |            py::call_guard<py::gil_scoped_release>())
31 |       .def("join", &qvf::DistTensorServer::join,
32 |            py::call_guard<py::gil_scoped_release>());
33 | }
34 |
35 | void register_PipeParam(pybind11::module& m) {
36 |   py::class_<qvf::PipeParam>(m, "PipeParam")
37 |       .def(py::init<int, int, int, int>())
38 |       .def(py::init<>())
39 |       .def("get_param_vec", &qvf::PipeParam::get_param_vec, py::call_guard<py::gil_scoped_release>())
40 |       .def("set_param_vec", &qvf::PipeParam::set_param_vec, py::call_guard<py::gil_scoped_release>())
41 | ;
42 | }
43 |
44 | void register_DistTensorClient(pybind11::module& m) {
45 |   py::class_<qvf::DistTensorClient>(m, "DistTensorClient")
46 |       .def(py::init<int, std::vector<qvf::ComEndPoint>, qvf::PipeParam>())
47 |       .def("create_registered_float32_tensor",
48 |            &qvf::DistTensorClient::create_registered_float32_tensor,
49 |            py::call_guard<py::gil_scoped_release>())
50 |       .def("register_float_tensor",
51 |            &qvf::DistTensorClient::register_float_tensor,
52 |            py::call_guard<py::gil_scoped_release>())
53 |       .def("create_registered_float32_tensor_cuda",
54 |            &qvf::DistTensorClient::create_registered_float32_tensor_cuda,
55 |            py::call_guard<py::gil_scoped_release>())
56 |
57 |       .def("sync_read", &qvf::DistTensorClient::sync_read,
58 |            py::call_guard<py::gil_scoped_release>());
59 | }
60 |
61 | void register_SharedStorageReader(pybind11::module& m) {
62 | class BufferAdapter : public caffe2::serialize::ReadAdapterInterface {
63 | public:
64 | BufferAdapter(const py::object& buffer) : buffer_(buffer) {
65 | // Jump to the end of the buffer to get its size
66 | auto current = buffer.attr("tell")();
67 |       start_offset_ = py::cast<size_t>(current);
68 |       buffer.attr("seek")(current, py::module::import("os").attr("SEEK_END"));
69 |       size_ = py::cast<size_t>(buffer.attr("tell")()) - start_offset_;
70 | buffer.attr("seek")(current);
71 |
72 | // If we can read directly into a buffer, do that instead of an extra copy
73 | use_readinto_ = py::hasattr(buffer, "readinto");
74 | }
75 |
76 | size_t size() const override { return size_; }
77 |
78 | THPObjectPtr getMemview(void* buf, size_t n) const {
79 |       THPObjectPtr memview(PyMemoryView_FromMemory(
80 |           reinterpret_cast<char*>(buf), n, PyBUF_WRITE));
81 | if (!memview) {
82 | throw python_error();
83 | }
84 | return memview;
85 | }
86 |
87 | size_t read(uint64_t pos,
88 | void* buf,
89 | size_t n,
90 | const char* what) const override {
91 | // Seek to desired position (NB: this has to be a Py_ssize_t or Python
92 | // throws a weird error)
93 | Py_ssize_t absolute_pos = start_offset_ + pos;
94 | buffer_.attr("seek")(absolute_pos);
95 |
96 | if (use_readinto_) {
97 | auto memview = getMemview(buf, n);
98 | auto res =
99 | PyObject_CallMethod(buffer_.ptr(), "readinto", "O", memview.get());
100 | if (res) {
101 |           int64_t i = static_cast<int64_t>(PyLong_AsLongLong(res));
102 | if (i > 0) {
103 | return i;
104 | }
105 | }
106 | }
107 |
108 | // Read bytes into `buf` from the buffer
109 | std::string bytes = py::cast(buffer_.attr("read")(n));
110 | std::copy(bytes.data(), bytes.data() + bytes.size(),
111 | reinterpret_cast(buf));
112 | return bytes.size();
113 | }
114 |
115 | py::object buffer_;
116 | size_t size_;
117 | size_t start_offset_;
118 | bool use_readinto_;
119 | };
120 |   py::class_<qvf::SharedLoader, std::shared_ptr<qvf::SharedLoader>>(
121 |       m, "SharedTensorLoader")
122 |       .def(py::init<const std::string&>())
123 | .def(py::init([](const py::object& buffer) {
124 | auto adapter = std::make_unique(buffer);
125 | return std::make_shared(std::move(adapter));
126 | }))
127 | .def("get_record",
128 | [](qvf::SharedLoader& self, const std::string& key) {
129 | at::DataPtr data;
130 | size_t size = 0;
131 | std::tie(data, size) = self.getRecord(key);
132 |              return py::bytes(reinterpret_cast<const char*>(data.get()), size);
133 | })
134 | .def("has_record",
135 | [](qvf::SharedLoader& self, const std::string& key) {
136 | return self.hasRecord(key);
137 | })
138 | .def("get_storage_from_record",
139 | [](qvf::SharedLoader& self, const std::string& key, size_t numel,
140 | py::object data_type_obj) {
141 | at::DataPtr data(std::get<0>(self.getRecord(key)));
142 | auto scalar_type =
143 |                reinterpret_cast<THPDtype*>(data_type_obj.ptr())->scalar_type;
144 |
145 | c10::Storage storage(c10::Storage::use_byte_size_t(),
146 | numel * elementSize(scalar_type),
147 | std::move(data),
148 | /*allocator=*/nullptr,
149 | /*resizable=*/false);
150 | auto ptr =
151 |                c10::make_intrusive<at::TensorImpl>(
152 | std::move(storage), at::DispatchKeySet(),
153 | at::CPU(scalar_type).typeMeta());
154 | return at::Tensor(std::move(ptr));
155 | })
156 | .def("get_all_records",
157 | [](qvf::SharedLoader& self) { return self.getAllRecords(); });
158 | }
--------------------------------------------------------------------------------
/csrc/src/shared_loader.cpp:
--------------------------------------------------------------------------------
1 | #include <qvf/shared_loader.h>
2 | #include <ATen/MapAllocator.h>
3 | #include <torch/extension.h>
4 | extern "C" {
5 | #include <miniz.h>
6 | }
7 |
8 | #define RB(x) get(PyTorchStreamReader_##x())
9 |
10 | at::DataPtr new_fd_storage(ptrdiff_t size) {
11 | int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE |
12 | at::ALLOCATOR_MAPPED_KEEPFD | at::ALLOCATOR_MAPPED_UNLINK;
13 | std::string handle = at::NewProcessWideShmHandle();
14 | auto sptr = at::MapAllocator::makeDataPtr(handle.c_str(), flags,
15 | size * sizeof(uint8_t), nullptr);
16 |
17 | return sptr;
18 | }
19 |
20 | size_t qvf::SharedLoader::getRecordID(const std::string& name) {
21 | std::string ss = reader.*RB(archive_name_plus_slash_) + name;
22 | size_t result = mz_zip_reader_locate_file((reader.*RB(ar_)).get(), ss.c_str(),
23 | nullptr, 0);
24 | valid("locating file ", name.c_str());
25 | return result;
26 | }
27 |
28 | std::tuple<at::DataPtr, size_t> qvf::SharedLoader::getRecord(
29 | const std::string& name) {
30 |   std::lock_guard<std::mutex> guard(reader.*RB(reader_lock_));
31 | size_t key = getRecordID(name);
32 | mz_zip_archive_file_stat stat;
33 | mz_zip_reader_file_stat((reader.*RB(ar_)).get(), key, &stat);
34 | valid("retrieving file meta-data for ", name.c_str());
35 | at::DataPtr retval = new_fd_storage(stat.m_uncomp_size);
36 | mz_zip_reader_extract_to_mem((reader.*RB(ar_)).get(), key, retval.get(),
37 | stat.m_uncomp_size, 0);
38 | valid("reading file ", name.c_str());
39 |
40 | return std::make_tuple(std::move(retval), stat.m_uncomp_size);
41 | }
42 |
43 | void qvf::SharedLoader::valid(const char* what, const char* info) {
44 | const auto err = mz_zip_get_last_error((reader.*RB(ar_)).get());
45 | TORCH_CHECK(err == MZ_ZIP_NO_ERROR, "PytorchStreamReader failed ", what, info,
46 | ": ", mz_zip_get_error_string(err));
47 | }
--------------------------------------------------------------------------------
/docs/imgs/Network Bandwidth Under 100Gbps IB.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/Network Bandwidth Under 100Gbps IB.png
--------------------------------------------------------------------------------
/docs/imgs/consistent_memory_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/consistent_memory_view.png
--------------------------------------------------------------------------------
/docs/imgs/e2e_feature_collection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection.png
--------------------------------------------------------------------------------
/docs/imgs/e2e_feature_collection_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection_performance.png
--------------------------------------------------------------------------------
/docs/imgs/gpu0_centered_access_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/gpu0_centered_access_performance.png
--------------------------------------------------------------------------------
/docs/imgs/memory_usage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/memory_usage.png
--------------------------------------------------------------------------------
/docs/imgs/multi_qp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/multi_qp.png
--------------------------------------------------------------------------------
/docs/imgs/one_batch_feature_collection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/one_batch_feature_collection.png
--------------------------------------------------------------------------------
/docs/imgs/peak_memory_footprint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/peak_memory_footprint.png
--------------------------------------------------------------------------------
/docs/imgs/pgas_tensor_access.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_access.png
--------------------------------------------------------------------------------
/docs/imgs/pgas_tensor_view.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_view.png
--------------------------------------------------------------------------------
/docs/imgs/range_partition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/range_partition.png
--------------------------------------------------------------------------------
/docs/imgs/rdma_mtt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/rdma_mtt.png
--------------------------------------------------------------------------------
/docs/imgs/shared_load.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/shared_load.png
--------------------------------------------------------------------------------
/docs/imgs/subset_signaled_requests.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/subset_signaled_requests.png
--------------------------------------------------------------------------------
/docs/imgs/train_gnn_on_large_graphs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/train_gnn_on_large_graphs.png
--------------------------------------------------------------------------------
/docs/memory.md:
--------------------------------------------------------------------------------
1 | # Peak Memory Footprint Optimization In Quiver-Feature
2 |
3 | By default, Quiver-Feature uses the `range partition` method to split the original giant feature array across machines. The partition scheme itself is easy to understand, so let's look more closely at memory usage on each machine.
4 |
5 | 
6 |
7 | On each machine:
8 | 1. The feature tensor needs to be pinned so that the RNIC and the GPU can access its memory directly.
9 |
10 | 2. The feature tensor should live in SHM because multiple processes need to access its data.
11 |
12 | 
13 |
14 | Pinning memory doesn't consume extra memory, but moving a torch.Tensor into SHM
15 | doubles the peak memory relative to the original data size.
16 |
17 | To solve this problem, we implement `quiver_feature.shared_load` as a replacement for the original `torch.load`. **`quiver_feature.shared_load` behaves almost the same as `torch.load` except that it loads data directly into SHM**. So the peak memory when creating a `DistTensorPGAS` with `quiver_feature.shared_load` stays around the original data size, **half of what `torch.load` requires**.
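As a minimal sketch of the swap (assuming the partition file was produced by `torch.save` and that `shared_load` mirrors `torch.load`'s single-path call; the file name is illustrative):

```python
from quiver_feature import shared_load

# torch.load would deserialize into private memory first; the later move
# into SHM then doubles the peak footprint.
# feature = torch.load("feature_part.pt")

# shared_load places the tensor's storage directly in SHM, so peak memory
# stays around the size of the data itself.
feature = shared_load("feature_part.pt")
```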
18 |
19 | 
20 |
21 | 
22 |
23 | You can check our [test script](../tests/python/test_SharedLoader.py) for more details.
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/docs/partition_methods.md:
--------------------------------------------------------------------------------
1 | # Partition Methods
2 |
3 | This doc describes the feature partition methods used in `Quiver-Feature`.
4 |
5 | # Metadata Of Each Partition
6 |
7 | The default metadata for each partition is a `TensorEndPoint`, which records the `Range` information of each server.
8 |
9 | ```python
10 | Range = namedtuple("Range", ["start", "end"])
11 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])
12 |
13 | ```
14 | For example, in the following partition setting, we have a list of `TensorEndPoint` as shown below. With this list, we can easily compute the `server_rank` and local offset of a certain node idx (see the sketch after the listing).
15 |
16 | ```python
17 | [
18 | TensorEndPoint(server_rank=0, ip=ip0, port=port0, range=Range(start=0, end=M)),
19 | TensorEndPoint(server_rank=1, ip=ip1, port=port1, range=Range(start=M, end=N))
20 | ]
21 | ```
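As a concrete sketch, `locate` below is a hypothetical helper (the real routing lives inside `DistTensorPGAS`) that maps a global node idx to `(server_rank, local_offset)` under range partition:

```python
from collections import namedtuple

Range = namedtuple("Range", ["start", "end"])
TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])

def locate(node_idx, tensor_endpoints):
    # Find the server whose range contains node_idx, then translate the
    # global idx into a server-local row offset.
    for ep in tensor_endpoints:
        if ep.range.start <= node_idx < ep.range.end:
            return ep.server_rank, node_idx - ep.range.start
    raise IndexError(f"node {node_idx} is outside every partition")

endpoints = [
    TensorEndPoint(server_rank=0, ip="ip0", port=3344, range=Range(0, 100)),
    TensorEndPoint(server_rank=1, ip="ip1", port=3344, range=Range(100, 200)),
]
assert locate(150, endpoints) == (1, 50)  # row 150 lives on server 1 at offset 50
```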
22 |
23 | 
24 |
25 | # Range Partition
26 | Range partition is the default partition method supported for now. Taking the following partition setting as an example, we simply assign [0, M) to Machine0 and [M, N) to Machine1.
27 |
28 | 
--------------------------------------------------------------------------------
/docs/rdma_details.md:
--------------------------------------------------------------------------------
1 | # RDMA Details
2 |
3 | This doc describes how we use RDMA for remote data access and summarizes the techniques we use to get the best RDMA performance.
4 |
5 | Before we start, we would like to show our appreciation to [@claudebarthels](https://github.com/claudebarthels) for developing [infinity](https://github.com/claudebarthels/infinity), a lightweight C++ RDMA library for IB that is also the code base for our RDMA implementation.
6 |
7 |
8 | ## Use RDMA READ for Feature Collection
9 |
10 | As we mentioned in the [README](../README.md), `quiver_feature.DistTensorPGAS` is a 2-dimensional distributed tensor abstraction over different memory spaces using the `PGAS` (Partitioned Global Address Space) model, and **`quiver_feature.DistTensorPGAS` is partitioned by row onto different machines**.
11 | 
12 |
13 | By default, we use `range partition`: when we want to access a certain row of `quiver_feature.DistTensorPGAS`, **we can compute the target machine's index and the memory offset of this row on that machine directly from the row index**.
14 |
15 | 
16 |
17 |
18 | Since each row's data size is known in advance, **we can use one single `RDMA READ` to fetch the wanted row's data (which corresponds to a single node's feature)**.
19 |
20 | 
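The byte offset of a single `READ` is plain arithmetic. A sketch, borrowing `FEATURE_DIM` and `FEATURE_TYPE_SIZE` from `examples/*/config.py`:

```python
FEATURE_DIM = 128       # float32 values per row, as in examples/*/config.py
FEATURE_TYPE_SIZE = 4   # bytes per float32

row_bytes = FEATURE_DIM * FEATURE_TYPE_SIZE  # 512 bytes fetched per READ
local_row = 42                               # row index on the target machine
remote_offset = local_row * row_bytes        # byte offset passed to the RDMA READ
```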
21 |
22 | So **each batch's feature collection involves millions of `RDMA READ` operations**, each `READ` fetching one node's feature.
23 |
24 | 
25 |
26 | ## 4 Techniques We Use
27 | Feature collection involves millions of small `RDMA READs` (each `READ` may fetch just 2KB of data), and we use the following 4 techniques to get the best performance.
28 |
29 | ### Rule 1: Use Multiple QPs Per Client
30 |
31 | RDMA hosts use Queue Pairs (QPs) to communicate with each other. Modern RNICs contain a pool of processing units (PUs), and we believe that requests in the same QP are always processed by the same PU to avoid cross-PU synchronization. A CPU is much more powerful than a single PU, so with only one QP per RDMA client the performance is easily bottlenecked by that PU's throughput. We therefore use multiple QPs per RDMA client and dispatch READ requests evenly across them to take full advantage of the RNIC's parallel processing ability.
32 |
33 | 
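A sketch of the dispatch policy (mirroring the `post_index % qp_num` round-robin in `csrc/include/qvf/pipe.h`; `QP_NUM` matches the example configs):

```python
QP_NUM = 8  # as in examples/*/config.py

def dispatch(read_batches):
    # Spread read batches evenly over the QPs so no single PU serializes them.
    per_qp = [[] for _ in range(QP_NUM)]
    for batch_index, batch in enumerate(read_batches):
        per_qp[batch_index % QP_NUM].append(batch)
    return per_qp
```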
34 |
35 |
36 | ### Rule 2: Only Set A Subset Of All Requests as Signaled
37 |
38 | Each RDMA read request can be posted as signaled or unsignaled. Signaled requests need CPU intervention, but users can check their result status by polling CQs (Completion Queues). Unsignaled requests don't involve the CPU, so users must find their own way to verify that these requests completed successfully.
39 |
40 | As described above, each batch's feature collection involves millions of `RDMA READ` requests. For each QP, we post these requests sequentially but mark only one request out of every `CQ_MOD` (which we usually set to 128) as signaled, i.e. only 1/128 of all requests are signaled and have their result status checked. We also mark the last request as signaled and wait for its completion to make sure all requests in this QP have finished. If all signaled requests report success, we assume all requests completed successfully.
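Concretely, the selection rule looks like this toy sketch (with `CQ_MOD = 128` as above):

```python
CQ_MOD = 128

def is_signaled(i, total):
    # Every CQ_MOD-th request is signaled, plus the final one so we can
    # wait on it for full completion of the QP's work.
    return (i + 1) % CQ_MOD == 0 or i == total - 1

total = 1_000_000
signaled = sum(is_signaled(i, total) for i in range(total))
print(signaled / total)  # ~1/128 of all requests carry a completion
```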
41 |
42 | In the future we may add failure-handling mechanisms: if a signaled request fails, we will retry its group of `CQ_MOD` requests. Even so, we could not guarantee that every request completes successfully.
43 |
44 | 
45 |
46 |
47 | ### Rule 3: Set QP's max_rd_atomic to the RNIC's max_qp_rd_atom
48 |
49 | `max_rd_atomic` is a crucial QP attribute for performance: it is the number of outstanding RDMA Reads & atomic operations that an RC QP can handle as an initiator at any time. We suggest setting it to the RNIC's `max_qp_rd_atom`, which you can query with `ibv_query_device()`. You can refer to [our code](https://github.com/quiver-team/quiver-feature/blob/main/csrc/include/infinity/queues/QueuePair.cpp#L38) to see how to set this attribute.
50 |
51 | ### Rule 4: Reduce Address Translation Overhead
52 |
53 | The RNIC uses DMA to access system memory. Since DMA can only handle physical addresses, any memory region exposed to the RNIC must be registered so that the RNIC stores the virtual-to-physical mapping of this region in its MTT (Memory Translation Table). The MTT lives in system memory, but the RNIC's SRAM caches part of it. Every time the RNIC receives an RDMA read/write request, it first translates the user's virtual address to a physical address by looking it up in its MTT cache; on a cache miss, it must fetch the mapping from system memory over PCIe, which can bring severe overhead and degrade RDMA performance.
54 |
55 | 
56 |
57 | To reduce this address translation overhead, we sort the requested node ids before sending RDMA requests. This increases memory access locality, so the RNIC's MTT cache achieves a higher hit rate.
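With PyTorch tensors this is a sort plus an inverse permutation that restores the original row order once the reads return (a sketch, not the library's exact code path; `remote_read` is a hypothetical stand-in for the RDMA fetch):

```python
import torch

node_ids = torch.tensor([5321, 17, 5320, 18, 16])
sorted_ids, perm = torch.sort(node_ids)

# Issue the RDMA reads in sorted order: neighboring rows hit the same
# MTT entries, so the RNIC's translation cache misses less often.
# features_sorted = remote_read(sorted_ids)

# Undo the permutation so row i of `features` matches node_ids[i].
inverse = torch.empty_like(perm)
inverse[perm] = torch.arange(perm.numel())
# features = features_sorted[inverse]
```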
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/examples/mag240m/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | The distributed training setup on the MAG240M dataset is almost the same as the [official example in DGL](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M), except that we use `Quiver-Feature` for distributed feature collection.
4 |
5 | Our implementation is much faster than DGL's official example while achieving similar accuracy.
6 |
7 | # Data Preprocess & Partition
8 |
9 | First, please run [preprocess.py](./preprocess.py) to generate `graph.dgl` and `full.npy`; you can check [DGL's official guide](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) for more details.
10 |
11 | Then we use [Range Partition](../../docs/partition_methods.md) to partition the feature data; you can check [preprocess_quiver.py](./preprocess_quiver.py) for more details.
12 |
13 | 
14 |
15 |
16 | # Running Training Script
17 |
18 | On each machine, please run:
19 |
20 |         python3 distribute_training.py \
21 |             --rootdir . \
22 |             --graph-path ./graph.dgl \
23 |             --feature-partition-path ./feature_part.pt \
24 |             --server_world_size 2 \
25 |             --server_rank 0
26 |
27 | Remember to:
28 |
29 | - Set the SHM size limit as large as your physical memory. You can do this with:
30 |
31 | sudo mount -o remount,size=300G /dev/shm
32 |
33 | - Set `MASTER_IP` to your master node's IP
34 |
35 |
36 | The validation accuracy is 0.680. We do not have ground truth test labels so we do not report test accuracy.
37 |
38 | # Performance
39 |
40 | With 2 machines and 1 GPU per machine, we need 2 minutes 10 seconds to train and 15 seconds to validate for each epoch. This is 3x faster than [DGL's performance result](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M).
41 |
42 | From the logs we can see that most of each iteration's training time is spent on model computation.
43 |
44 | Avg_Sample: 0.0051s, Avg_Feature: 0.0176s, Avg_Model: 0.1801s, Avg_Feature_BandWidth = 14588.4937 MB/s
45 |
46 | # Hardware configurations
47 |
48 | We have 2 machines, each with 377GB of memory, connected by 100Gbps IB. Running the training script consumes around 256GB of memory.
49 |
--------------------------------------------------------------------------------
/examples/mag240m/config.py:
--------------------------------------------------------------------------------
1 | PORT_NUMBER = 3344
2 | MASTER_IP = "155.198.152.17"
3 | #MASTER_IP = "127.0.0.1"
4 | HLPER_PORT = 5678
5 | NODE_COUNT = 1200000
6 | FEATURE_DIM = 128
7 | FEATURE_TYPE_SIZE = 4
8 | SAMPLE_NUM = 80000
9 | ITER_NUM = 10
10 | POST_LIST_SIZE = 128
11 | QP_NUM = 8
12 | TX_DEPTH = 2048
13 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE
14 | TEST_TLB_OPTIMIZATION = True
15 |
16 | # For MAG240M Training
17 | SAMPLE_PARAM = [15, 25]
18 | BATCH_SIZE = 1024
19 |
--------------------------------------------------------------------------------
/examples/mag240m/preprocess.py:
--------------------------------------------------------------------------------
1 | import ogb
2 | from ogb.lsc import MAG240MDataset
3 | import tqdm
4 | import numpy as np
5 | import torch
6 | import dgl
7 | import dgl.function as fn
8 | import argparse
9 | import os
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--rootdir', type=str, default='.', help='Directory to download the OGB dataset.')
13 | parser.add_argument('--author-output-path', type=str, help='Path to store the author features.')
14 | parser.add_argument('--inst-output-path', type=str,
15 | help='Path to store the institution features.')
16 | parser.add_argument('--graph-output-path', type=str, help='Path to store the graph.')
17 | parser.add_argument('--graph-format', type=str, default='csc', help='Graph format (coo, csr or csc).')
18 | parser.add_argument('--graph-as-homogeneous', action='store_true', help='Store the graph as DGL homogeneous graph.')
19 | parser.add_argument('--full-output-path', type=str,
20 | help='Path to store features of all nodes. Effective only when graph is homogeneous.')
21 | args = parser.parse_args()
22 |
23 | print('Building graph')
24 | dataset = MAG240MDataset(root=args.rootdir)
25 | ei_writes = dataset.edge_index('author', 'writes', 'paper')
26 | ei_cites = dataset.edge_index('paper', 'paper')
27 | ei_affiliated = dataset.edge_index('author', 'institution')
28 |
29 | # We sort the nodes starting with the papers, then the authors, then the institutions.
30 | author_offset = 0
31 | inst_offset = author_offset + dataset.num_authors
32 | paper_offset = inst_offset + dataset.num_institutions
33 |
34 | g = dgl.heterograph({
35 | ('author', 'write', 'paper'): (ei_writes[0], ei_writes[1]),
36 | ('paper', 'write-by', 'author'): (ei_writes[1], ei_writes[0]),
37 | ('author', 'affiliate-with', 'institution'): (ei_affiliated[0], ei_affiliated[1]),
38 | ('institution', 'affiliate', 'author'): (ei_affiliated[1], ei_affiliated[0]),
39 | ('paper', 'cite', 'paper'): (np.concatenate([ei_cites[0], ei_cites[1]]), np.concatenate([ei_cites[1], ei_cites[0]]))
40 | })
41 |
42 | paper_feat = dataset.paper_feat
43 | author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features))
44 | inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features))
45 |
46 | # Iteratively process author features along the feature dimension.
47 | BLOCK_COLS = 16
48 | with tqdm.trange(0, dataset.num_paper_features, BLOCK_COLS) as tq:
49 | for start in tq:
50 | tq.set_postfix_str('Reading paper features...')
51 | g.nodes['paper'].data['x'] = torch.FloatTensor(paper_feat[:, start:start + BLOCK_COLS].astype('float32'))
52 | # Compute author features...
53 | tq.set_postfix_str('Computing author features...')
54 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='write-by')
55 | # Then institution features...
56 | tq.set_postfix_str('Computing institution features...')
57 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='affiliate-with')
58 | tq.set_postfix_str('Writing author features...')
59 | author_feat[:, start:start + BLOCK_COLS] = g.nodes['author'].data['x'].numpy().astype('float16')
60 | tq.set_postfix_str('Writing institution features...')
61 | inst_feat[:, start:start + BLOCK_COLS] = g.nodes['institution'].data['x'].numpy().astype('float16')
62 | del g.nodes['paper'].data['x']
63 | del g.nodes['author'].data['x']
64 | del g.nodes['institution'].data['x']
65 | author_feat.flush()
66 | inst_feat.flush()
67 |
68 | # Convert to homogeneous if needed. (The RGAT baseline needs homogeneous graph)
69 | if args.graph_as_homogeneous:
70 | # Process graph
71 | g = dgl.to_homogeneous(g)
72 | # DGL ensures that nodes with the same type are put together with the order preserved.
73 | # DGL also ensures that the node types are sorted in ascending order.
74 | assert torch.equal(
75 | g.ndata[dgl.NTYPE],
76 | torch.cat([torch.full((dataset.num_authors,), 0),
77 | torch.full((dataset.num_institutions,), 1),
78 | torch.full((dataset.num_papers,), 2)]))
79 | assert torch.equal(
80 | g.ndata[dgl.NID],
81 | torch.cat([torch.arange(dataset.num_authors),
82 | torch.arange(dataset.num_institutions),
83 | torch.arange(dataset.num_papers)]))
84 | g.edata['etype'] = g.edata[dgl.ETYPE].byte()
85 | del g.edata[dgl.ETYPE]
86 | del g.ndata[dgl.NTYPE]
87 | del g.ndata[dgl.NID]
88 |
89 | # Process feature
90 | full_feat = np.memmap(
91 | args.full_output_path, mode='w+', dtype='float16',
92 | shape=(dataset.num_authors + dataset.num_institutions + dataset.num_papers, dataset.num_paper_features))
93 | BLOCK_ROWS = 100000
94 | for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS):
95 | end = min(dataset.num_authors, start + BLOCK_ROWS)
96 | full_feat[author_offset + start:author_offset + end] = author_feat[start:end]
97 | for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS):
98 | end = min(dataset.num_institutions, start + BLOCK_ROWS)
99 | full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end]
100 | for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS):
101 | end = min(dataset.num_papers, start + BLOCK_ROWS)
102 | full_feat[paper_offset + start:paper_offset + end] = paper_feat[start:end]
103 |
104 | # Convert the graph to the given format and save. (The RGAT baseline needs a CSC graph.)
105 | g = g.formats(args.graph_format)
106 | dgl.save_graphs(args.graph_output_path, g)
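107 |
108 | # Note: iterating BLOCK_COLS (16) feature columns at a time bounds peak memory
109 | # to roughly num_nodes x 16 float32 values per pass, instead of materializing
110 | # the full num_nodes x num_paper_features matrix at once.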
--------------------------------------------------------------------------------
/examples/mag240m/preprocess_quiver.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | meta = torch.load("/data/mag/mag240m_kddcup2021/meta.pt")
6 |
7 | print("Dataset Loading Finished")
8 |
9 | paper_offset = meta["author"] + meta["institution"]
10 | num_nodes = paper_offset + meta["paper"]
11 | num_features = 768
12 |
13 | feats = np.memmap("/data/dalong/full.npy", mode='r', dtype='float16', shape=(num_nodes, num_features))
14 |
15 | print("Feature Memmap Loading Finished")
16 |
17 | print("Creating Float16 Tensor For The Second Half")
18 | tensor_feature = torch.HalfTensor(feats[num_nodes//2: ])
19 |
20 | torch.save(tensor_feature, "/data/dalong/second_half.pt")
21 |
22 |
23 |
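24 | # The first half can be produced the same way; the output path below is
25 | # hypothetical:
26 | #
27 | # first_half = torch.HalfTensor(feats[:num_nodes//2])
28 | # torch.save(first_half, "/data/dalong/first_half.pt")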
--------------------------------------------------------------------------------
/examples/ogb-products/config.py:
--------------------------------------------------------------------------------
1 | PORT_NUMBER = 3344
2 | MASTER_IP = "127.0.0.1"
3 | HLPER_PORT = 5678
4 | NODE_COUNT = 1200000
5 | FEATURE_DIM = 128
6 | FEATURE_TYPE_SIZE = 4
7 | SAMPLE_NUM = 80000
8 | ITER_NUM = 10
9 | POST_LIST_SIZE = 128
10 | QP_NUM = 8
11 | TX_DEPTH = 2048
12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE
13 | TEST_TLB_OPTIMIZATION = True
14 |
15 | # For ogb-products Training
16 | SAMPLE_PARAM = [15, 10, 5]
17 | BATCH_SIZE = 1024
18 |
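19 | # CTX_POLL_BATCH is derived from the two settings above:
20 | # TX_DEPTH // POST_LIST_SIZE = 2048 // 128 = 16 posted batches per
21 | # completion-queue poll.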
--------------------------------------------------------------------------------
/examples/reddit/config.py:
--------------------------------------------------------------------------------
1 | PORT_NUMBER = 3344
2 | MASTER_IP = "127.0.0.1"
3 | HLPER_PORT = 5678
4 | NODE_COUNT = 1200000
5 | FEATURE_DIM = 128
6 | FEATURE_TYPE_SIZE = 4
7 | SAMPLE_NUM = 80000
8 | ITER_NUM = 10
9 | POST_LIST_SIZE = 128
10 | QP_NUM = 8
11 | TX_DEPTH = 2048
12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE
13 | TEST_TLB_OPTIMIZATION = True
14 |
15 | # For Reddit Training
16 | SAMPLE_PARAM = [25, 10]
17 | BATCH_SIZE = 256
18 |
--------------------------------------------------------------------------------
/quiver_feature/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from . import multiprocessing
3 | from .dist_tensor_rpc import DistTensorRPC
4 | from .common import Range, TensorEndPoint, DistTensorDeviceParam, DistTensorServerParam
5 | from .dist_tensor_pgas import DistTensor as DistTensorPGAS
6 | from .dist_helper import DistHelper
7 | from .local_tensor_pgas import LocalTensorPGAS
8 | from .tensor_loader import shared_load
9 | from .utils import serve_tensor_for_remote_access
10 | from qvf import PipeParam, DistTensorServer
11 |
12 | __all__ = ["DistTensorRPC", "DistTensorPGAS", "LocalTensorPGAS", "Range", "TensorEndPoint", "DistHelper",
13 |            "shared_load", "PipeParam", "DistTensorServer", "serve_tensor_for_remote_access", "DistTensorServerParam", "DistTensorDeviceParam"]
14 |
--------------------------------------------------------------------------------
/quiver_feature/common.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | Range = namedtuple("Range", ["start", "end"])
3 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])
4 | DistTensorServerParam = namedtuple("DistTensorServerParam", ["port_num", "server_world_size", "device_per_server"])
5 | DistTensorServerParam.__new__.__defaults__ = (3344, 1, 1)
6 | DistTensorDeviceParam = namedtuple("DistTensorDeviceParam", ["device_list", "device_cache_size", "cache_policy"])
7 | DistTensorDeviceParam.__new__.__defaults__ = ([], 0, "device_replicate")
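8 |
9 | # Minimal usage sketch (illustrative values): a two-server deployment where
10 | # server 0 owns rows [0, 100) and server 1 owns rows [100, 200). The IPs and
11 | # ports are hypothetical.
12 | if __name__ == "__main__":
13 |     ep0 = TensorEndPoint(server_rank=0, ip="10.0.0.1", port=3344, range=Range(start=0, end=100))
14 |     ep1 = TensorEndPoint(server_rank=1, ip="10.0.0.2", port=3344, range=Range(start=100, end=200))
15 |     assert DistTensorServerParam().port_num == 3344  # omitted fields fall back to the defaults above
16 |     print(ep0, ep1)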
--------------------------------------------------------------------------------
/quiver_feature/dist_helper.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as torch_dist
2 | import socket
3 | import pickle
4 | from datetime import timedelta
5 | from .common import TensorEndPoint, Range
6 |
7 | def resolve_my_ip():
8 |     # Route a UDP socket toward a public address to learn this host's outbound IP.
9 |     with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
10 |         s.connect(("8.8.8.8", 80))
11 |         return s.getsockname()[0]
12 |
13 | class DistHelper:
14 | def __init__(self, master_ip: str, master_port: int, world_size: int, my_rank: int):
15 | self.tcp_store = torch_dist.TCPStore(master_ip, master_port, world_size, my_rank == 0, wait_for_workers = True, multi_tenant=True)
16 | self.my_server_rank = my_rank
17 | self.server_world_size = world_size
18 | self.sync_point = 0
19 |
20 | def exchange_tensor_endpoints_info(self, local_tensor_range: Range, dist_tensor_server_port=3344):
21 | my_ip = resolve_my_ip()
22 |
23 | local_tensor_endpoint = TensorEndPoint(server_rank=self.my_server_rank, ip=my_ip, port=dist_tensor_server_port, range=local_tensor_range)
24 | pickled_data = pickle.dumps(local_tensor_endpoint)
25 | self.tcp_store.set(f"worker{self.my_server_rank}_data", pickled_data)
26 |
27 |
28 |         tensor_endpoints = [None] * self.server_world_size
29 | tensor_endpoints[self.my_server_rank] = local_tensor_endpoint
30 | for rank in range(self.server_world_size):
31 | if rank != self.my_server_rank:
32 | tensor_endpoints[rank] = pickle.loads(self.tcp_store.get(f"worker{rank}_data"))
33 |
34 | self.tcp_store.set(f"worker{self.my_server_rank}_status", "DONE")
35 |
36 | keys = [f"worker{rank}_status" for rank in range(self.server_world_size)]
37 | if self.my_server_rank == 0:
38 | while True:
39 | try:
40 | self.tcp_store.wait(keys, timedelta(seconds=1))
41 | break
42 |             except Exception:
43 |                 pass  # wait() timed out; keep polling until all workers report
44 |
45 |
46 | return tensor_endpoints
47 |
48 | def sync_all(self):
49 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", f"SYNC1")
50 |
51 | keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)]
52 | while True:
53 | try:
54 | self.tcp_store.wait(keys, timedelta(seconds=1))
55 | break
56 |             except Exception:
57 |                 pass  # wait() timed out; keep polling
58 |
59 |
60 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1")
61 |
62 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)]
63 | if self.my_server_rank == 0:
64 | while True:
65 | try:
66 | self.tcp_store.wait(keys, timedelta(seconds=1))
67 | break
68 |                 except Exception:
69 |                     pass  # wait() timed out; keep polling
70 |
71 |
72 | # TODO Delete Keys
73 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}")
74 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}")
75 | self.sync_point += 1
76 |
77 | def sync_start(self):
78 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", f"SYNC")
79 |
80 |     def sync_end(self):
81 |         # Wait until every worker has called sync_start() for this sync point.
82 |
83 | keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)]
84 | while True:
85 | try:
86 | self.tcp_store.wait(keys, timedelta(seconds=1))
87 | break
88 |             except Exception:
89 |                 pass  # wait() timed out; keep polling
90 |
91 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1")
92 |
93 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)]
94 | if self.my_server_rank == 0:
95 | while True:
96 | try:
97 | self.tcp_store.wait(keys, timedelta(seconds=1))
98 | break
99 |                 except Exception:
100 |                     pass  # wait() timed out; keep polling
101 | # TODO Delete Keys
102 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}")
103 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}")
104 | self.sync_point += 1
105 |
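106 | # Usage sketch (illustrative): two local worker processes exchange endpoint
107 | # metadata through the TCPStore. The localhost address and store port 29600
108 | # are hypothetical; in a real deployment each process runs on its own server.
109 | def _demo_worker(rank):
110 |     helper = DistHelper("127.0.0.1", 29600, world_size=2, my_rank=rank)
111 |     endpoints = helper.exchange_tensor_endpoints_info(Range(rank * 100, (rank + 1) * 100))
112 |     print(f"rank {rank} sees {endpoints}")
113 |     helper.sync_all()
114 |
115 | if __name__ == "__main__":
116 |     import torch.multiprocessing as mp
117 |     mp.spawn(_demo_worker, nprocs=2)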
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_pgas.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import qvf
3 | from typing import List
4 | from .common import Range, TensorEndPoint, DistTensorServerParam, DistTensorDeviceParam
5 | from .dist_helper import DistHelper
6 | from .local_tensor_pgas import LocalTensorPGAS
7 | from .utils import serve_tensor_for_remote_access
8 |
9 | FloatType = [torch.float32, torch.float64, torch.float16, torch.bfloat16]
10 | IntType = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
11 |
12 | class DistTensor:
13 |     def __init__(self, server_rank, tensor_endpoints: List[TensorEndPoint], pipe_param: qvf.PipeParam, buffer_tensor_shape, cached_range: Range = Range(start=0, end=0), order_transform: torch.Tensor = None, dtype=torch.float32) -> None:
14 |
15 | # About DistTensorClient
16 | self.server_rank = server_rank
17 | self.world_size = len(tensor_endpoints)
18 | self.tensor_endpoints = sorted(tensor_endpoints, key= lambda x: x.server_rank)
19 | self.buffer_tensor_shape = buffer_tensor_shape
20 | self.pipe_param = pipe_param
21 | self.com_endpoints = [qvf.ComEndPoint(item.server_rank, item.ip, item.port) for item in tensor_endpoints]
22 |
23 | self.data_type = dtype
24 |
25 | # About Lazy Init
26 | self.inited = False
27 |
28 | # About ShardTensor
29 | self.local_tensor_pgas = None
30 | self.cached_range = cached_range
31 | self.device_rank = -1
32 | self.order_transform = order_transform
33 |
34 | @property
35 | def dtype(self):
36 | return self.data_type
37 |
38 | def lazy_init(self):
39 | if self.inited:
40 | return
41 | self.inited = True
42 |
43 | self.device_rank = torch.cuda.current_device()
44 |
45 | # Create DistTensorClient
46 | self.dist_tensor_client = qvf.DistTensorClient(self.server_rank, self.com_endpoints, self.pipe_param)
47 | self.registered_tensor = torch.zeros(self.buffer_tensor_shape, dtype=self.dtype).pin_memory()
48 | self.dist_tensor_client.register_float_tensor(self.registered_tensor)
49 |
50 | if self.order_transform is not None:
51 | self.order_transform = self.order_transform.to(self.device_rank)
52 |
53 |
54 |     def from_cpu_tensor(self, cpu_tensor, dist_helper: DistHelper, server_param: DistTensorServerParam = None, device_param: DistTensorDeviceParam = None):
55 |
56 | self.data_type = cpu_tensor.dtype
57 |
58 | server_param: DistTensorServerParam = server_param or DistTensorServerParam()
59 | device_param: DistTensorDeviceParam = device_param or DistTensorDeviceParam()
60 |
61 | cpu_tensor.share_memory_()
62 |
63 | # Start Server
64 | serve_tensor_for_remote_access(server_param.port_num, self.pipe_param.get_param_vec()[0], server_param.server_world_size, server_param.device_per_server, cpu_tensor, dist_helper)
65 |
66 | # Build Local Tensor
67 | self.local_tensor_pgas = LocalTensorPGAS(device_param.device_list, device_param.device_cache_size, device_param.cache_policy)
68 | self.local_tensor_pgas.from_cpu_tensor(cpu_tensor)
69 |
70 |
71 | def to(self, device_rank):
72 | self.device_rank = device_rank
73 | if self.order_transform is not None:
74 | self.order_transform = self.order_transform.to(device_rank)
75 |
76 | return self
77 |
78 | def size(self, dim):
79 | assert dim < 2, "DistTensorPGAS is 2-dimensional"
80 | if dim == 1:
81 | return self.buffer_tensor_shape[1]
82 | if dim == 0:
83 | all_ends = [item.range.end for item in self.tensor_endpoints]
84 | all_ends.sort()
85 | return all_ends[-1]
86 |
87 | @property
88 | def shape(self):
89 | return [self.size(0), self.size(1)]
90 |
91 | def collect(self, nodes):
92 | nodes -= self.tensor_endpoints[self.server_rank].range.start
93 | nodes += self.cached_range.end
94 | data = self.local_tensor_pgas[nodes]
95 | return data
96 |
97 | def collect_cached_data(self, nodes):
98 | data = self.local_tensor_pgas[nodes]
99 | return data
100 |
101 | def cal_remote_offsets(self, nodes, server_rank):
102 | remote_offsets = (nodes - self.tensor_endpoints[server_rank].range.start + self.cached_range.end) * self.buffer_tensor_shape[1] * self.registered_tensor.element_size()
103 | return remote_offsets
104 |
105 | def __getitem__(self, nodes):
106 |
107 | self.lazy_init()
108 | nodes = nodes.cuda()
109 | if self.order_transform is not None:
110 | nodes = self.order_transform[nodes]
111 |
112 | input_orders = torch.arange(nodes.size(0), dtype=torch.long, device = nodes.device)
113 |
114 | feature = torch.empty(nodes.shape[0], self.shape[1], device = nodes.device, dtype=self.dtype)
115 |
116 | cache_nodes_mask = None
117 | local_nodes_mask = None
118 |
119 |
120 | # Load cache data
121 | if self.cached_range.end > 0:
122 | cache_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
123 | cache_request_nodes = torch.masked_select(nodes, cache_nodes_mask)
124 | cache_part_orders = torch.masked_select(input_orders, cache_nodes_mask)
125 | if cache_request_nodes.shape[0] > 0:
126 | feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes)
127 |
128 |
129 |
130 |
131 | # Load local data
132 | range_item = self.tensor_endpoints[self.server_rank].range
133 | local_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
134 | local_request_nodes = torch.masked_select(nodes, local_nodes_mask)
135 | local_part_orders = torch.masked_select(input_orders, local_nodes_mask)
136 | if local_request_nodes.shape[0] > 0:
137 | feature[local_part_orders] = self.collect(local_request_nodes)
138 |
139 |
140 | # Collect Remote Data
141 | if cache_nodes_mask is None:
142 | all_remote_nodes_mask = torch.logical_not(local_nodes_mask)
143 | else:
144 | all_remote_nodes_mask = torch.logical_not(torch.logical_or(local_nodes_mask, cache_nodes_mask))
145 |
146 | all_remote_nodes = torch.masked_select(nodes, all_remote_nodes_mask)
147 | all_remote_orders = torch.masked_select(input_orders, all_remote_nodes_mask)
148 |
149 | assert all_remote_nodes.shape[0] <= self.registered_tensor.shape[0], "Collected Data Exceeds Buffer Size"
150 |
151 | for server_rank in range(self.world_size):
152 |
153 | range_item = self.tensor_endpoints[server_rank].range
154 | if server_rank != self.server_rank:
155 | request_nodes_mask = (all_remote_nodes >= range_item.start) & (all_remote_nodes < range_item.end)
156 | request_nodes = torch.masked_select(all_remote_nodes, request_nodes_mask)
157 | if request_nodes.shape[0] > 0:
158 | local_orders = torch.masked_select(input_orders[:all_remote_nodes.shape[0]], request_nodes_mask)
159 | local_offsets = local_orders * self.registered_tensor.shape[1] * self.registered_tensor.element_size()
160 | remote_offsets = self.cal_remote_offsets(request_nodes, server_rank)
161 | self.dist_tensor_client.sync_read(server_rank, self.registered_tensor, local_offsets.cpu(), remote_offsets.cpu())
162 |
163 | feature[all_remote_orders] = self.registered_tensor[:all_remote_nodes.shape[0]].to(self.device_rank)
164 | return feature
165 |
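166 | # End-to-end usage sketch (one process per server; all values illustrative).
167 | # PipeParam's positional arguments follow the C++ tests:
168 | # (qp_num, ctx_poll_batch, tx_depth, post_list_size).
169 | #
170 | # pipe_param = qvf.PipeParam(8, 16, 2048, 128)
171 | # dist_helper = DistHelper(master_ip, master_port, server_world_size, server_rank)
172 | # tensor_endpoints = dist_helper.exchange_tensor_endpoints_info(local_range)
173 | # dist_tensor = DistTensor(server_rank, tensor_endpoints, pipe_param,
174 | #                          buffer_tensor_shape=(80000, feature_dim))
175 | # dist_tensor.from_cpu_tensor(local_cpu_features, dist_helper)
176 | # features = dist_tensor[node_ids]  # gathers cached, local and remote rows in order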
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_rpc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed.rpc as rpc
3 | from typing import List
4 | from .common import Range
5 |
6 | class Task:
7 | def __init__(self, prev_order, fut):
8 | self.prev_order_ = prev_order
9 | self.fut_ = fut
10 | self.data_ = None
11 |
12 | def wait(self):
13 | self.data_ = self.fut_.wait()
14 |
15 | @property
16 | def data(self):
17 | return self.data_
18 |
19 | @property
20 | def prev_order(self):
21 | return self.prev_order_
22 |
23 | class Singleton(object):
24 | def __init__(self, cls):
25 | self._cls = cls
26 | self._instance = {}
27 | def __call__(self, *args, **kwargs):
28 | if self._cls not in self._instance:
29 | self._instance[self._cls] = self._cls()
30 | self._instance[self._cls].init(*args, **kwargs)
31 | return self._instance[self._cls]
32 |
33 |
34 | def collect(nodes):
35 | dist_tensor = DistTensorRPC()
36 | return dist_tensor.collect(nodes)
37 |
38 |
39 | @Singleton
40 | class DistTensorRPC(object):
41 |
42 | def __init__(self):
43 | pass
44 |
45 | def init(self, world_size, rank, local_size, local_rank, shard_tensor, range_list: List[Range], rpc_option, cached_range = Range(start=0, end=0), order_transform=None, **debug_params) -> None:
46 | self.shard_tensor = shard_tensor
47 | self.range_list = range_list
48 | self.cached_range = cached_range
49 | self.order_transform = None
50 | if order_transform is not None:
51 | self.order_transform = order_transform.to(local_rank)
52 | self.rank = rank
53 | self.local_rank = local_rank
54 | self.world_size = world_size
55 | self.local_size = local_size
56 | self.debug_params = debug_params
57 |
58 | rpc.init_rpc(f"worker{rank}", rank=self.rank, world_size= world_size, rpc_backend_options=rpc_option)
59 |
60 | def collect(self, nodes):
61 |
62 | # TODO Just For Debugging
63 | if nodes.is_cuda:
64 | torch.cuda.set_device(self.local_rank)
65 | nodes -= self.range_list[self.rank].start
66 | nodes += self.cached_range.end
67 | data = self.shard_tensor[nodes]
68 |
69 | return data
70 |
71 | def collect_cached_data(self, nodes):
72 | # TODO Just For Debugging
73 | if nodes.is_cuda:
74 | torch.cuda.set_device(self.local_rank)
75 | data = self.shard_tensor[nodes]
76 |
77 | return data
78 |
79 | def __getitem__(self, nodes):
80 |
81 | task_list: List[Task] = []
82 | if self.order_transform is not None:
83 | nodes = self.order_transform[nodes]
84 | input_orders = torch.arange(nodes.size(0), dtype=torch.long, device = nodes.device)
85 |
86 | remote_collect = 0
87 | for worker_id in range(self.local_rank, self.world_size, self.local_size):
88 | range_item = self.range_list[worker_id]
89 | if worker_id != self.rank:
90 | request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
91 | request_nodes = torch.masked_select(nodes, request_nodes_mask)
92 | if request_nodes.shape[0] > 0:
93 | remote_collect += request_nodes.shape[0]
94 | part_orders = torch.masked_select(input_orders, request_nodes_mask)
95 | fut = rpc.rpc_async(f"worker{worker_id}", collect, args=(request_nodes, ))
96 | task_list.append(Task(part_orders, fut))
97 |
98 | feature = torch.zeros(nodes.shape[0], self.shard_tensor.shape[1], device = f"cuda:{self.local_rank}")
99 |
100 | # Load Cached Data
101 | if self.cached_range.end > 0:
102 | request_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
103 | cache_request_nodes = torch.masked_select(nodes, request_nodes_mask)
104 | cache_part_orders = torch.masked_select(input_orders, request_nodes_mask)
105 | if cache_request_nodes.shape[0] > 0:
106 | feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes).to(self.local_rank)
107 |
108 |
109 | # Load local data
110 | range_item = self.range_list[self.rank]
111 | request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
112 | local_request_nodes = torch.masked_select(nodes, request_nodes_mask)
113 | local_part_orders = torch.masked_select(input_orders, request_nodes_mask)
114 | if local_request_nodes.shape[0] > 0:
115 | feature[local_part_orders] = self.collect(local_request_nodes).to(self.local_rank)
116 |
117 | for task in task_list:
118 | task.wait()
119 | feature[task.prev_order] = task.data.to(self.local_rank)
120 | return feature
121 |
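122 | # Usage sketch (illustrative): every worker constructs the singleton with the
123 | # same partitioning metadata and then indexes it like a tensor; rows owned by
124 | # other workers are fetched asynchronously through their `collect` function.
125 | #
126 | # rpc_option = rpc.TensorPipeRpcBackendOptions()
127 | # dist_tensor = DistTensorRPC(world_size, rank, local_size, local_rank,
128 | #                             shard_tensor, range_list, rpc_option)
129 | # features = dist_tensor[node_ids]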
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .reductions import init_reductions
2 |
3 | init_reductions()
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/reductions.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.reduction import ForkingPickler
2 | import qvf
3 | from ..local_tensor_pgas import LocalTensorPGAS
4 |
5 | def rebuild_qvf_pipeparam(ipc_handle):
6 |
7 | pipe_param = qvf.PipeParam()
8 | pipe_param.set_param_vec(ipc_handle)
9 | return pipe_param
10 |
11 | def reduce_qvf_pipeparam(pipe_param):
12 | param_vec = pipe_param.get_param_vec()
13 | return(rebuild_qvf_pipeparam, (param_vec, ))
14 |
15 |
16 | def rebuild_qvf_comendpoint(ipc_handle):
17 |
18 | com_endpoint = qvf.ComEndPoint(ipc_handle[0], ipc_handle[1], ipc_handle[2])
19 | return com_endpoint
20 |
21 | def reduce_qvf_comendpoint(com_endpoint):
22 | param_vec = (com_endpoint.rank(), com_endpoint.address(), com_endpoint.port())
23 | return (rebuild_qvf_comendpoint, (param_vec, ))
24 |
25 | def rebuild_localtensorpgas(ipc_handle):
26 |
27 | feature = LocalTensorPGAS.lazy_from_ipc_handle(ipc_handle)
28 | return feature
29 |
30 |
31 | def reduce_localtensorpgas(feature):
32 |
33 | ipc_handle = feature.share_ipc()
34 | return (rebuild_localtensorpgas, (ipc_handle, ))
35 |
36 | def init_reductions():
37 | ForkingPickler.register(qvf.PipeParam, reduce_qvf_pipeparam)
38 | ForkingPickler.register(qvf.ComEndPoint, reduce_qvf_comendpoint)
39 | ForkingPickler.register(LocalTensorPGAS, reduce_localtensorpgas)
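40 |
41 | # Registering these reducers with ForkingPickler lets qvf.PipeParam,
42 | # qvf.ComEndPoint and LocalTensorPGAS instances be passed directly to
43 | # torch.multiprocessing workers: the parent serializes them with the
44 | # reduce_* functions above and the child rebuilds them with the matching
45 | # rebuild_* functions.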
--------------------------------------------------------------------------------
/quiver_feature/tensor_loader.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import warnings
3 | import torch
4 | import qvf
5 | import torch.serialization as se
6 |
7 | class _open_zipfile_reader(torch.serialization._opener):
8 | def __init__(self, name_or_buffer) -> None:
9 | super(_open_zipfile_reader, self).__init__(qvf.SharedTensorLoader(name_or_buffer))
10 |
11 |
12 | def shared_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
13 | se._check_dill_version(pickle_module)
14 |
15 | if 'encoding' not in pickle_load_args.keys():
16 | pickle_load_args['encoding'] = 'utf-8'
17 |
18 | with se._open_file_like(f, 'rb') as opened_file:
19 | if se._is_zipfile(opened_file):
20 | # The zipfile reader is going to advance the current file position.
21 | # If we want to actually tail call to torch.jit.load, we need to
22 | # reset back to the original position.
23 | orig_position = opened_file.tell()
24 | with _open_zipfile_reader(opened_file) as opened_zipfile:
25 | if se._is_torchscript_zip(opened_zipfile):
26 | warnings.warn("'torch.load' received a zip file that looks like a TorchScript archive"
27 | " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to"
28 | " silence this warning)", UserWarning)
29 | opened_file.seek(orig_position)
30 | return torch.jit.load(opened_file)
31 | return se._load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
32 | return se._legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
33 |
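34 | # Usage sketch: `shared_load` mirrors `torch.load`, but reads the checkpoint
35 | # through qvf.SharedTensorLoader so that the resulting storages can be shared
36 | # across worker processes. The path below is hypothetical.
37 | #
38 | # feature = shared_load("/data/features.pt")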
--------------------------------------------------------------------------------
/quiver_feature/utils.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from qvf import DistTensorServer
3 |
4 | def server_thread(port_number, qp_num, world_size, tensor, dist_helper):
5 | dist_tensor_server = DistTensorServer(port_number, world_size, qp_num)
6 | dist_tensor_server.serve_tensor(tensor)
7 | dist_helper.sync_start()
8 | dist_tensor_server.join()
9 |
10 | def serve_tensor_for_remote_access(port_number, qp_num, server_world_size, device_per_server, cpu_tensor, dist_helper):
11 | server = threading.Thread(target=server_thread, args=(port_number, qp_num, server_world_size * device_per_server, cpu_tensor, dist_helper))
12 | server.daemon = True
13 | server.start()
14 | dist_helper.sync_end()
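15 |
16 | # Usage sketch (illustrative values): publish a shared CPU tensor on port 3344
17 | # for 2 servers with 1 device each. The server thread calls sync_start() once
18 | # it is serving, and the sync_end() above blocks until every server has done so.
19 | #
20 | # cpu_tensor.share_memory_()
21 | # serve_tensor_for_remote_access(3344, 8, 2, 1, cpu_tensor, dist_helper)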
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import os.path as osp
5 | from itertools import product
6 | from setuptools import setup, find_packages
7 | import platform
8 |
9 | import torch
10 | from torch.__config__ import parallel_info
11 | from torch.utils.cpp_extension import BuildExtension
12 | from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
13 | import torch.utils.cpp_extension as cpp_extension
14 |
15 | WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
16 | suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
17 | if os.getenv('FORCE_CUDA', '0') == '1':
18 | suffices = ['cuda', 'cpu']
19 | if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
20 | suffices = ['cuda']
21 | if os.getenv('FORCE_ONLY_CPU', '0') == '1':
22 | suffices = ['cpu']
23 |
24 | BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
25 |
26 | WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1'
27 |
28 |
29 | def get_torch_includes():
30 | lib_include = os.path.join(cpp_extension._TORCH_PATH, 'include')
31 | paths = [
32 | osp.join(lib_include, 'ATen'),
33 | osp.join(lib_include, 'c10'),
34 | osp.join(lib_include, 'caffe2'),
35 | ]
36 |
37 | return paths
38 |
39 |
40 | def get_extensions():
41 | extensions = []
42 | libraries = ['ibverbs']
43 |
44 | extensions_dir = osp.join('csrc')
45 |
46 | srcs = glob.glob(osp.join(extensions_dir, 'src', '*.cpp'))
47 | srcs += glob.glob(osp.join(extensions_dir, 'src', '*.cu'))
48 | srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/core", '*.cpp'))
49 | srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/memory", '*.cpp'))
50 | srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/queues", '*.cpp'))
51 | srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/requests", '*.cpp'))
52 | srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/utils", '*.cpp'))
53 | srcs += glob.glob(osp.join(extensions_dir, 'include', "miniz", '*.c'))
54 | includes = osp.join(extensions_dir, 'include/')
55 |
56 | define_macros = [('WITH_PYTHON', None)]
57 |     extra_compile_args = {
58 |         'cxx': ['-O3', '-std=c++17'],
59 |         'nvcc': ['-O3', '--expt-extended-lambda', '-std=c++17']}
60 | extra_link_args = [] if WITH_SYMBOLS else ['-s']
61 |
62 | Extension = CUDAExtension
63 | extension = Extension(
64 | 'qvf',
65 | srcs,
66 | include_dirs=[includes] + get_torch_includes(),
67 | define_macros=define_macros,
68 | extra_compile_args=extra_compile_args,
69 | extra_link_args=extra_link_args,
70 | libraries=libraries,
71 | )
72 | extensions += [extension]
73 | return extensions
74 |
75 |
76 | install_requires = []
77 | setup_requires = []
78 | tests_require = ['pytest', 'pytest-runner', 'pytest-cov']
79 |
80 | setup(
81 | name='quiver_feature',
82 | version='0.0.1',
83 | author='quiver-team',
84 | author_email='',
85 | url='https://github.com/quiver-team/quiver_feature',
86 | description=('PyTorch Library for graph learning sampling'),
87 | keywords=['pytorch', 'sparse', 'graph'],
88 | license='Apache',
89 | python_requires='>=3.6',
90 | install_requires=install_requires,
91 | setup_requires=setup_requires,
92 | tests_require=tests_require,
93 | extras_require={'test': tests_require},
94 | ext_modules=get_extensions() if not BUILD_DOCS else [],
95 | cmdclass={
96 | 'build_ext':
97 | BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
98 | },
99 | packages=find_packages(),
100 | )
101 |
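102 | # Typical source install from the repository root (the qvf extension is built
103 | # with CUDAExtension and links against libibverbs, so CUDA and the ibverbs
104 | # development headers must be available):
105 | #
106 | #   pip install -v .
107 | #
108 | # WITH_SYMBOLS=1 keeps debug symbols; BUILD_DOCS=1 skips building the extension.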
--------------------------------------------------------------------------------
/tests/cpp/test_DistTensorClient.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #define PORT_NUMBER 3344
12 | #define SERVER_IP "155.198.152.17"
13 |
14 | #define NODE_COUNT 120000LL
15 | #define FEATURE_DIM 256LL
16 | #define FEATURE_TYPE_SIZE 4LL
17 | #define SAMPLE_NUM 80960LL
18 | #define TEST_COUNT 8192LL
19 | #define ITER_NUM 10LL
20 | #define POST_LIST_SIZE 16LL
21 | #define CQ_MOD 16LL
22 | #define QP_NUM 2LL
23 | #define TX_DEPTH 2048LL
24 | #define CTX_POLL_BATCH 16LL
25 |
26 | int min(int a, int b);
27 |
28 | void print_tensor_res(torch::Tensor& res_tensor) {
29 |   float* res = res_tensor.data_ptr<float>();
30 | for (int col = 0; col < res_tensor.size(1); col++) {
31 | std::cout << res[0 * res_tensor.size(1) + col] << " ";
32 | }
33 | std::cout << std::endl;
34 | }
35 | void check_tensor_res(torch::Tensor& res_tensor,
36 | torch::Tensor& remote_offsets) {
37 |   float* res = res_tensor.data_ptr<float>();
38 |   int stride = res_tensor.size(1);
39 |   int64_t* offsets = remote_offsets.data_ptr<int64_t>();
40 | for (int row = 0; row < remote_offsets.size(0); row++) {
41 | for (int col = 0; col < res_tensor.size(1); col++) {
42 | float expected_value =
43 | float(offsets[row]) / (FEATURE_DIM * FEATURE_TYPE_SIZE);
44 | QUIVER_FEATURE_ASSERT(
45 | res[row * stride + col] == expected_value,
46 | "Result Check Failed At (%d, %d)!, Expected %f, Got %f\n", row, col,
47 | expected_value, res[row * stride + col]);
48 | }
49 | }
50 | printf("Result Check Passed, Congrats!\n");
51 | }
52 |
53 | void test_dist_tensor_client(int argc, char** argv) {
54 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH,
55 | POST_LIST_SIZE);
56 |
57 | qvf::ComEndPoint local_com_end_point(0, SERVER_IP, PORT_NUMBER);
58 | qvf::ComEndPoint remote_com_end_point(1, SERVER_IP, PORT_NUMBER);
59 |   std::vector<qvf::ComEndPoint> com_endpoints{local_com_end_point,
60 |                                               remote_com_end_point};
61 | qvf::DistTensorClient dist_tensor_client(0, com_endpoints, pipe_param);
62 |   std::vector<int64_t> shape{SAMPLE_NUM, FEATURE_DIM};
63 |
64 | torch::Tensor registered_tensor =
65 | dist_tensor_client.create_registered_float32_tensor(shape);
66 |
67 |   std::vector<int64_t> local_offsets(SAMPLE_NUM);
68 |   std::vector<int64_t> remote_offsets(SAMPLE_NUM);
69 |
70 | for (int index = 0; index < SAMPLE_NUM; index++) {
71 | local_offsets[index] = index * FEATURE_DIM * FEATURE_TYPE_SIZE;
72 | remote_offsets[index] =
73 | rand() % NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE;
74 | // remote_offsets[index] = FEATURE_DIM * FEATURE_TYPE_SIZE;
75 | }
76 |
77 | for (int index = 0; index < min(1, SAMPLE_NUM); index++) {
78 | std::cout << "Collect Node "
79 | << remote_offsets[index] / (FEATURE_DIM * FEATURE_TYPE_SIZE)
80 | << ": " << local_offsets[index] << "<-" << remote_offsets[index]
81 | << std::endl;
82 | }
83 | std::cout << std::endl;
84 |
85 | auto tensor_option = torch::TensorOptions().dtype(torch::kInt64);
86 | torch::Tensor local_offsets_tensor =
87 | torch::from_blob(&local_offsets[0], {SAMPLE_NUM}, tensor_option);
88 | torch::Tensor remote_offsets_tensor =
89 | torch::from_blob(&remote_offsets[0], {SAMPLE_NUM}, tensor_option);
90 |
91 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets_tensor,
92 | remote_offsets_tensor);
93 | // print_tensor_res(registered_tensor);
94 | check_tensor_res(registered_tensor, remote_offsets_tensor);
95 | }
96 |
--------------------------------------------------------------------------------
/tests/cpp/test_DistTensorServer.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 |
4 | #define PORT_NUMBER 3344
5 | #define SERVER_IP "155.198.152.17"
6 |
7 | #define NODE_COUNT 120000LL
8 | #define FEATURE_DIM 256LL
9 | #define FEATURE_TYPE_SIZE 4LL
10 | #define TEST_COUNT 8192LL
11 | #define ITER_NUM 10LL
12 | #define POST_LIST_SIZE 16LL
13 | #define CQ_MOD 16LL
14 | #define QP_NUM 2LL
15 | #define TX_DEPTH 2048LL
16 | #define CTX_POLL_BATCH 16LL
17 |
18 | float* allocate_float_feature(bool set_value) {
19 | float* buffer = (float*)malloc(NODE_COUNT * FEATURE_DIM * sizeof(float));
20 | float index = 0;
21 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) {
22 | for (int dim = 0; dim < FEATURE_DIM; dim++) {
23 | if (set_value)
24 | buffer[start * FEATURE_DIM + dim] = index;
25 | else
26 | buffer[start * FEATURE_DIM + dim] = 0;
27 | }
28 | index += 1;
29 | }
30 | return buffer;
31 | }
32 |
33 | void test_dist_tensor_server(int argc, char** argv) {
34 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH,
35 | POST_LIST_SIZE);
36 | qvf::DistTensorServer dist_tensor_server(PORT_NUMBER, 1, 1);
37 | float* server_data_buffer = allocate_float_feature(true);
38 | dist_tensor_server.serve(server_data_buffer,
39 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE);
40 | }
41 |
--------------------------------------------------------------------------------
/tests/cpp/test_Pipe.cpp:
--------------------------------------------------------------------------------
1 | /**
2 | * Examples - Read/Write/Send Operations
3 | *
4 | * (c) 2018 Claude Barthels, ETH Zurich
5 | * Contact: claudeb@inf.ethz.ch
6 | *
7 | */
8 |
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 |
17 | #include
18 | #include
19 | #include
20 | #include
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 |
29 | #include
30 | #include
31 | #include
32 | #include
33 |
34 | #define PORT_NUMBER 3344
35 | #define SERVER_IP "155.198.152.17"
36 |
37 | #define NODE_COUNT 120000LL
38 | #define FEATURE_DIM 256LL
39 | #define FEATURE_TYPE_SIZE 4LL
40 | #define TEST_COUNT 8192LL
41 | #define ITER_NUM 10LL
42 | #define POST_LIST_SIZE 16LL
43 | #define CQ_MOD 16LL
44 | #define QP_NUM 2LL
45 | #define TX_DEPTH 2048LL
46 | #define CTX_POLL_BATCH 16LL
47 |
48 | int min(int a, int b) {
49 | if (a < b) {
50 | return a;
51 | }
52 | return b;
53 | }
54 |
55 | uint64_t timeDiff(struct timeval stop, struct timeval start) {
56 | return (stop.tv_sec * 1000000L + stop.tv_usec) -
57 | (start.tv_sec * 1000000L + start.tv_usec);
58 | }
59 |
60 | float* allocate_float_feature(bool set_value);
61 |
62 | bool mem_check(float* data_buffer) {
63 | float index = 0;
64 | bool have_valid_data = false;
65 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) {
66 | for (int dim = 0; dim < FEATURE_DIM; dim++) {
67 | if (data_buffer[start * FEATURE_DIM + dim] != 0) {
68 | have_valid_data = true;
69 | }
70 | }
71 | }
72 | QUIVER_FEATURE_ASSERT(have_valid_data == true, "No valid data is copied")
73 |
74 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) {
75 | float expected_value =
76 | (data_buffer[start * FEATURE_DIM] == 0) ? 0 : float(start);
77 | std::cout << data_buffer[start * FEATURE_DIM] << " ";
78 | for (u_int64_t dim = 0; dim < FEATURE_DIM; dim++) {
79 | QUIVER_FEATURE_ASSERT(
80 | data_buffer[start * FEATURE_DIM + dim] == expected_value,
81 | "Result Check Failed At (%lld, %lld)!, Expected %f, Got %f\n", start,
82 | dim, expected_value, data_buffer[start * FEATURE_DIM + dim]);
83 | }
84 | }
85 | return true;
86 | }
87 |
88 | void test_pipe(int argc, char** argv) {
89 | bool random = true;
90 | bool sort_index = false;
91 |
92 | while (argc > 1) {
93 | if (argv[1][0] == '-') {
94 | switch (argv[1][1]) {
95 | case 'l': {
96 | random = false;
97 | break;
98 | }
99 | case 't': {
100 | sort_index = true;
101 | break;
102 | }
103 | }
104 | }
105 | ++argv;
106 | --argc;
107 | }
108 | if (random) {
109 | printf("Test Random Data Access \n");
110 | } else {
111 | printf("Test Sequential Data Access \n");
112 | }
113 | if (sort_index) {
114 | printf("Test Data Access With TLB Optimization\n");
115 | }
116 |
117 |   std::vector<infinity::queues::QueuePair*> qps;
118 | infinity::core::Context* context = new infinity::core::Context();
119 | infinity::queues::QueuePairFactory* qpFactory =
120 | new infinity::queues::QueuePairFactory(context);
121 |
122 | qps.resize(QP_NUM);
123 | qvf::ComEndPoint endpoint(0, SERVER_IP, PORT_NUMBER);
124 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH,
125 | POST_LIST_SIZE);
126 | qvf::Pipe quiver_pipe(context, qpFactory, endpoint, pipe_param);
127 | quiver_pipe.connect();
128 |
129 | printf("Creating buffers\n");
130 |   std::vector<infinity::memory::Buffer*> buffers;
131 | float* client_data_buffer = allocate_float_feature(false);
132 | infinity::memory::Buffer* buffer1Sided = new infinity::memory::Buffer(
133 | context, client_data_buffer,
134 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE);
135 | infinity::memory::Buffer* buffer2Sided =
136 | new infinity::memory::Buffer(context, 128 * sizeof(char));
137 |
138 | printf("Reading content from remote buffer\n");
139 | infinity::requests::RequestToken requestToken(context);
140 |
141 | printf("Start Real Test \n");
142 | // auto start = std::chrono::system_clock::now();
143 | struct timeval start, stop;
144 | uint64_t time_consumed = 0;
145 |   std::vector<uint64_t> local_offsets(TEST_COUNT * POST_LIST_SIZE);
146 |   std::vector<uint64_t> remote_offsets(TEST_COUNT * POST_LIST_SIZE);
147 | if (sort_index) {
148 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) {
149 |       std::vector<uint64_t> all_request_nodes(TEST_COUNT * POST_LIST_SIZE);
150 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) {
151 | all_request_nodes[i] = rand() % NODE_COUNT;
152 | }
153 | std::sort(all_request_nodes.begin(), all_request_nodes.end());
154 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) {
155 | uint64_t remote_node_offset =
156 | all_request_nodes[i] * FEATURE_DIM * FEATURE_TYPE_SIZE;
157 | local_offsets[i] = remote_node_offset;
158 | remote_offsets[i] = remote_node_offset;
159 | }
160 | gettimeofday(&start, NULL);
161 |
162 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets,
163 | FEATURE_DIM * FEATURE_TYPE_SIZE);
164 | gettimeofday(&stop, NULL);
165 | time_consumed += timeDiff(stop, start);
166 | }
167 | } else {
168 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) {
169 | for (int k = 0; k < TEST_COUNT * POST_LIST_SIZE; k++) {
170 | int request_node = k % NODE_COUNT;
171 | if (random) {
172 | request_node = rand() % NODE_COUNT;
173 | }
174 | uint64_t remote_node_offset =
175 | request_node * FEATURE_DIM * FEATURE_TYPE_SIZE;
176 | local_offsets[k] = remote_node_offset;
177 | remote_offsets[k] = remote_node_offset;
178 | }
179 | gettimeofday(&start, NULL);
180 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets,
181 | FEATURE_DIM * FEATURE_TYPE_SIZE);
182 | gettimeofday(&stop, NULL);
183 | time_consumed += timeDiff(stop, start);
184 | }
185 | }
186 |
187 | printf("Avg Bandwidth is %f MB/s\n",
188 | (POST_LIST_SIZE * TEST_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE *
189 | ITER_NUM / (1024.0 * 1024.0)) /
190 | (((double)time_consumed) / 1000000L));
191 |
192 | printf("Memory checking..., Please wait...\n");
193 | if (!mem_check(client_data_buffer)) {
194 | fprintf(stderr, "Memory Check Failed, Benchmark Failed!\n");
195 | } else {
196 | printf("Memory check success! Congrats!\n");
197 | }
198 |
199 | delete buffer1Sided;
200 | delete buffer2Sided;
201 |
202 | for (int index = 0; index < QP_NUM; index++) {
203 | delete qps[index];
204 | }
205 | delete qpFactory;
206 | delete context;
207 | }
208 |
--------------------------------------------------------------------------------
/tests/cpp/test_main.cpp:
--------------------------------------------------------------------------------
1 |
2 | // Usage: ./program -s for the server and ./program for the client component
3 |
4 | #include