├── .clang-format ├── .gitignore ├── .pre-commit-config.yaml ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── build.sh ├── csrc ├── include │ ├── infinity │ │ ├── core │ │ │ ├── Configuration.h │ │ │ ├── Context.cpp │ │ │ └── Context.h │ │ ├── infinity.h │ │ ├── memory │ │ │ ├── Atomic.cpp │ │ │ ├── Atomic.h │ │ │ ├── Buffer.cpp │ │ │ ├── Buffer.h │ │ │ ├── Region.cpp │ │ │ ├── Region.h │ │ │ ├── RegionToken.cpp │ │ │ ├── RegionToken.h │ │ │ ├── RegionType.h │ │ │ ├── RegisteredMemory.cpp │ │ │ └── RegisteredMemory.h │ │ ├── queues │ │ │ ├── QueuePair.cpp │ │ │ ├── QueuePair.h │ │ │ ├── QueuePairFactory.cpp │ │ │ └── QueuePairFactory.h │ │ ├── requests │ │ │ ├── RequestToken.cpp │ │ │ └── RequestToken.h │ │ └── utils │ │ │ ├── Address.cpp │ │ │ ├── Address.h │ │ │ └── Debug.h │ ├── miniz │ │ └── miniz.h │ └── qvf │ │ ├── com_endpoint.h │ │ ├── common.h │ │ ├── dist_tensor_client.h │ │ ├── dist_tensor_server.h │ │ ├── pipe.h │ │ ├── qvf.h │ │ ├── range.h │ │ ├── shared_loader.h │ │ └── tensor_endpoint.h └── src │ ├── module.cpp │ ├── register.cpp │ └── shared_loader.cpp ├── docs ├── imgs │ ├── Network Bandwidth Under 100Gbps IB.png │ ├── consistent_memory_view.png │ ├── e2e_feature_collection.png │ ├── e2e_feature_collection_performance.png │ ├── gpu0_centered_access_performance.png │ ├── memory_usage.png │ ├── multi_qp.png │ ├── one_batch_feature_collection.png │ ├── peak_memory_footprint.png │ ├── pgas_tensor_access.png │ ├── pgas_tensor_view.png │ ├── range_partition.png │ ├── rdma_mtt.png │ ├── shared_load.png │ ├── subset_signaled_requests.png │ └── train_gnn_on_large_graphs.png ├── memory.md ├── partition_methods.md └── rdma_details.md ├── examples ├── mag240m │ ├── README.md │ ├── config.py │ ├── distribute_training.py │ ├── preprocess.py │ └── preprocess_quiver.py ├── ogb-products │ ├── config.py │ └── distribute_training.py └── reddit │ ├── config.py │ └── distribute_training.py ├── quiver_feature ├── __init__.py ├── common.py ├── dist_helper.py ├── dist_tensor_pgas.py ├── dist_tensor_rpc.py ├── local_tensor_pgas.py ├── multiprocessing │ ├── __init__.py │ └── reductions.py ├── tensor_loader.py └── utils.py ├── setup.py └── tests ├── cpp ├── test_DistTensorClient.cpp ├── test_DistTensorServer.cpp ├── test_Pipe.cpp └── test_main.cpp ├── infinity ├── feature_server.cpp ├── read-write-send.cpp ├── send-performance.cpp ├── test_multiread.cpp ├── test_multiread_multiqp.cpp └── test_read.cpp └── python ├── config.py ├── preprocess_Dataset.py ├── test_DGLUnifiedTensor.py ├── test_DistHelper.py ├── test_DistTensorClient.py ├── test_DistTensorPGAS.py ├── test_DistTensorRPC.py ├── test_DistTensorServer.py ├── test_LocalTensorPGAS.py ├── test_MultiMachineDistTensorClientServer.py ├── test_MultiMachineDistTensorPGAS.py ├── test_MultiMachineDistTensorRPC.py ├── test_PipeParam.py ├── test_RealDataset.py ├── test_RegisteredTensorTransfer.py ├── test_SharedLoader.py ├── test_TensorEndPoint.py └── tmp.py /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Chromium 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | 
sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | .vscode/ 132 | build/ 133 | infinity_realease/ 134 | 135 | examples/reddit/processed/ 136 | examples/reddit/raw/ 137 | tests/data/ 138 | 139 | .idea/* 140 | cmake-build-debug/ 141 | 142 | # OSX 143 | .DS_Store 144 | 145 | *.pt -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.2.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - id: check-yaml 8 | exclude: | 9 | (?x)^( 10 | conda/pytorch-geometric/meta.yaml| 11 | conda/pyg/meta.yaml 12 | )$ 13 | # - repo: https://github.com/adrienverge/yamllint.git 14 | # rev: v1.26.3 15 | # hooks: 16 | # - id: yamllint 17 | # args: [-c=.yamllint.yml] 18 | 19 | # - repo: https://github.com/regebro/pyroma 20 | # rev: "4.0" 21 | # hooks: 22 | # - id: pyroma 23 | # name: Check packaging 24 | # args: [--min=10, .] 
25 | 26 | # - repo: https://github.com/pre-commit/mirrors-yapf 27 | #   rev: v0.32.0 28 | #   hooks: 29 | #     - id: yapf 30 | #       name: Format code 31 | 32 | # - repo: https://github.com/pycqa/isort 33 | #   rev: 5.10.1 34 | #   hooks: 35 | #     - id: isort 36 | #       name: Sort imports 37 | 38 | # - repo: https://github.com/PyCQA/flake8 39 | #   rev: 4.0.1 40 | #   hooks: 41 | #     - id: flake8 42 | #       name: Check PEP8 43 | 44 |   - repo: https://github.com/pre-commit/mirrors-clang-format 45 |     rev: v14.0.1 46 |     hooks: 47 |       - id: clang-format 48 |         name: Format C++ code 49 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(quiver_feature) 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CUDA_STANDARD 14) 5 | 6 | set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) 7 | set(CMAKE_CXX_STANDARD_REQUIRED TRUE) 8 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 9 | 10 | file(GLOB HEADERS csrc/include/qvf/*.h csrc/include/infinity/*.h csrc/include/miniz/*.h csrc/include/infinity/core/*.h csrc/include/infinity/memory/*.h csrc/include/infinity/queues/*.h csrc/include/infinity/requests/*.h csrc/include/infinity/utils/*.h) 11 | file(GLOB SOURCES csrc/src/*.cpp csrc/include/miniz/*.c csrc/include/infinity/requests/*.cpp csrc/include/infinity/core/*.cpp csrc/include/infinity/memory/*.cpp csrc/include/infinity/queues/*.cpp csrc/include/infinity/utils/*.cpp) 12 | file(GLOB TEST_SOURCES tests/cpp/*.cpp) 13 | 14 | set_source_files_properties(${SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs") 15 | set_source_files_properties(${TEST_SOURCES} PROPERTIES COMPILE_OPTIONS "-libverbs") 16 | 17 | find_package(Python3 COMPONENTS Interpreter Development) 18 | find_package(Torch REQUIRED) 19 | add_library(${PROJECT_NAME} SHARED ${SOURCES}) 20 | find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib") 21 | 22 | target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) 23 | target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python) 24 | target_link_libraries(${PROJECT_NAME} PRIVATE ibverbs) 25 | 26 | if (PROF) 27 |     target_link_options(${PROJECT_NAME} PRIVATE "-pg") 28 | endif() 29 | 30 | target_include_directories(${PROJECT_NAME} PUBLIC csrc/include) 31 | 32 | include(GNUInstallDirs) 33 | include(CMakePackageConfigHelpers) 34 | 35 | install(TARGETS ${PROJECT_NAME} 36 |         LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}) 37 | install(FILES ${HEADERS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}) 38 | 39 | if(BUILD_TEST) 40 |     add_executable(cpp_test ${TEST_SOURCES}) 41 |     target_link_libraries(cpp_test PRIVATE ${TORCH_LIBRARIES}) 42 |     target_link_libraries(cpp_test PRIVATE Python3::Python) 43 |     target_link_libraries(cpp_test PRIVATE ${PROJECT_NAME}) 44 | endif() 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ################################################## 2 | # 3 | # (c) 2018 Claude Barthels, ETH Zurich 4 | # 5 | # Call 'make library' to build the library 6 | # Call 'make examples' to build the examples 7 | # Call 'make all' to build everything 8 | # 9 | ################################################## 10 | 11 | PROJECT_NAME = libinfinity 12 | 13 | ################################################## 14 | 15 | CC = g++ 16 | CC_FLAGS = -O3 -std=c++14 17 | LD_FLAGS = -linfinity
-libverbs 18 | 19 | ################################################## 20 | 21 | SOURCE_FOLDER = csrc/include/ 22 | BUILD_FOLDER = build/infinity 23 | RELEASE_FOLDER = build/infinity_release 24 | INCLUDE_FOLDER = include 25 | EXAMPLES_FOLDER = infinity/ 26 | 27 | ################################################## 28 | 29 | SOURCE_FILES = $(SOURCE_FOLDER)/infinity/core/Context.cpp \ 30 | $(SOURCE_FOLDER)/infinity/memory/Atomic.cpp \ 31 | $(SOURCE_FOLDER)/infinity/memory/Buffer.cpp \ 32 | $(SOURCE_FOLDER)/infinity/memory/Region.cpp \ 33 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.cpp \ 34 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.cpp \ 35 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.cpp \ 36 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.cpp \ 37 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.cpp \ 38 | $(SOURCE_FOLDER)/infinity/utils/Address.cpp 39 | 40 | HEADER_FILES = $(SOURCE_FOLDER)/infinity/infinity.h \ 41 | $(SOURCE_FOLDER)/infinity/core/Context.h \ 42 | $(SOURCE_FOLDER)/infinity/core/Configuration.h \ 43 | $(SOURCE_FOLDER)/infinity/memory/Atomic.h \ 44 | $(SOURCE_FOLDER)/infinity/memory/Buffer.h \ 45 | $(SOURCE_FOLDER)/infinity/memory/Region.h \ 46 | $(SOURCE_FOLDER)/infinity/memory/RegionToken.h \ 47 | $(SOURCE_FOLDER)/infinity/memory/RegionType.h \ 48 | $(SOURCE_FOLDER)/infinity/memory/RegisteredMemory.h \ 49 | $(SOURCE_FOLDER)/infinity/queues/QueuePair.h \ 50 | $(SOURCE_FOLDER)/infinity/queues/QueuePairFactory.h \ 51 | $(SOURCE_FOLDER)/infinity/requests/RequestToken.h \ 52 | $(SOURCE_FOLDER)/infinity/utils/Debug.h \ 53 | $(SOURCE_FOLDER)/infinity/utils/Address.h 54 | 55 | ################################################## 56 | 57 | OBJECT_FILES = $(patsubst $(SOURCE_FOLDER)/%.cpp,$(BUILD_FOLDER)/%.o,$(SOURCE_FILES)) 58 | SOURCE_DIRECTORIES = $(dir $(HEADER_FILES)) 59 | BUILD_DIRECTORIES = $(patsubst $(SOURCE_FOLDER)/%,$(BUILD_FOLDER)/%,$(SOURCE_DIRECTORIES)) 60 | 61 | ################################################## 62 | 63 | all: library examples 64 | 65 | ################################################## 66 | 67 | $(BUILD_FOLDER)/%.o: $(SOURCE_FILES) $(HEADER_FILES) 68 | mkdir -p $(BUILD_FOLDER) 69 | mkdir -p $(BUILD_DIRECTORIES) 70 | $(CC) $(CC_FLAGS) -c $(SOURCE_FOLDER)/$*.cpp -I $(SOURCE_FOLDER) -o $(BUILD_FOLDER)/$*.o 71 | 72 | ################################################## 73 | 74 | library: $(OBJECT_FILES) 75 | mkdir -p $(RELEASE_FOLDER) 76 | ar rvs $(RELEASE_FOLDER)/$(PROJECT_NAME).a $(OBJECT_FILES) 77 | rm -rf $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) 78 | cp --parents $(HEADER_FILES) $(RELEASE_FOLDER) 79 | mv $(RELEASE_FOLDER)/$(SOURCE_FOLDER)/ $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) 80 | 81 | ################################################## 82 | 83 | clean: 84 | rm -rf $(BUILD_FOLDER) 85 | rm -rf $(RELEASE_FOLDER) 86 | 87 | ################################################## 88 | 89 | examples: 90 | mkdir -p $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER) 91 | # $(CC) tests/infinity/read-write-send.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/read-write-send 92 | # $(CC) tests/infinity/send-performance.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/send-performance 93 | # $(CC) tests/infinity/test_read.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_read 94 | # $(CC) tests/infinity/test_multiread.cpp $(CC_FLAGS) 
$(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread 95 | 	$(CC) tests/infinity/test_multiread_multiqp.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_multiread_multiqp 96 | 	$(CC) tests/cpp/test_Pipe.cpp $(CC_FLAGS) $(LD_FLAGS) -I $(RELEASE_FOLDER)/$(INCLUDE_FOLDER) -L $(RELEASE_FOLDER) -o $(RELEASE_FOLDER)/$(EXAMPLES_FOLDER)/test_pipe 97 | 98 | ################################################## 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [pypi-image]: https://badge.fury.io/py/quiver-feature.svg 2 | [pypi-url]: https://pypi.org/project/quiver-feature/ 3 | 4 |

5 | 6 |

7 | 8 | -------------------------------------------------------------------------------- 9 | 10 | Quiver-Feature is an RDMA-based, high-performance **distributed feature collection component** for **training GNN models on extremely large graphs**. It is built on [Quiver](https://github.com/quiver-team/torch-quiver) and has several novel features: 11 | 12 | 1. **High Performance**: Quiver-Feature achieves **5-10x higher feature collection throughput** than existing GNN systems such as [DGL](https://github.com/dmlc/dgl) and [PyG](https://github.com/pyg-team/pytorch_geometric). 13 | 14 | 2. **Maximum Hardware Resource Utilization Efficiency**: Quiver-Feature has minimal CPU usage and minimal memory bus traffic, leaving most of the CPU and memory resources to tasks like graph sampling and model training. 15 | 16 | 3. **Easy to Use**: To use Quiver-Feature, developers only need to add a few lines of code to existing PyG/DGL programs, so Quiver-Feature is easy for PyG/DGL users to adopt and easy to deploy in production clusters. 17 | 18 | ![train_gnn_models_on_large_graph](docs/imgs/train_gnn_on_large_graphs.png) 19 | 20 | -------------------------------------------------------------------------------- 21 | 22 | # GPU-centric Data Placement And Zero-Copy Data Access 23 | 24 | **`GPU-centric data placement`** and the **`Zero-Copy data access method`** are the two keys behind Quiver-Feature's high performance. 25 | 26 | **`GPU-Centric Data Placement`:** Quiver-Feature has a unified view of memory across heterogeneous devices and machines. It classifies this memory into 4 memory spaces under a GPU-centric view: **Local HBM** (the current GPU's memory), **Neighbor HBM**, **Local DRAM** (the current machine's CPU memory) and **Remote DRAM** (a remote machine's CPU memory). These 4 memory spaces are connected to each other by PCIe, NVLink, RDMA, etc. 27 | 28 | ![memory_view](docs/imgs/consistent_memory_view.png) 29 | 30 | Accessing the different memory spaces from a GPU yields unbalanced performance. Since feature data access frequency during GNN training is also unbalanced, Quiver-Feature uses an **`application-aware, GPU-centric data placement algorithm`** to take full advantage of the GPU-centric multi-level memory hierarchy. 31 | 32 | **`Zero-Copy Data Access`:** Feature collection in GNN training involves massive data movement across the network, DRAM, PCIe and NVLink, and any extra memory copy hurts end-to-end performance. Quiver-Feature uses one-sided communication methods: `UVA` for local memory space access (Local HBM, Local DRAM, Neighbor HBM) and `RDMA READ` for remote memory space access (Remote DRAM), achieving zero copies and minimal CPU intervention. ([You can refer to this document for more RDMA details.](docs/rdma_details.md)) 33 | 34 | 35 | **`DistTensorPGAS`:** On top of those memory spaces, Quiver-Feature adopts the **[`PGAS`](https://en.wikipedia.org/wiki/Partitioned_global_address_space) memory model** and implements a 2-dimensional distributed tensor abstraction called `DistTensorPGAS`. Users can use a `DistTensorPGAS` just like a local torch.Tensor, e.g. querying its `shape` and performing `slicing operations`; a short sketch of this access pattern follows. 36 | 37 | ![pgas_tensor](docs/imgs/pgas_tensor_view.png) 38 | 39 |
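Since a `DistTensorPGAS` mimics a local `torch.Tensor` for `shape` queries and slicing, here is a minimal runnable sketch of the access pattern, using a plain `torch.Tensor` as a stand-in (a real `DistTensorPGAS`, built as in the Quick Start below, would be a drop-in replacement for `feature`; the shapes here are illustrative values, not benchmark data):

```python
import torch

# Stand-in for a 2-dimensional DistTensorPGAS of shape (num_nodes, feature_dim).
# With a real DistTensorPGAS the rows may live in Local HBM, Neighbor HBM,
# Local DRAM or Remote DRAM, but the access code below stays the same:
# local rows are gathered via UVA, remote rows via one-sided RDMA READ.
feature = torch.randn(1_000_000, 128)

print(feature.shape)                    # query shape like a local tensor

n_id = torch.tensor([0, 42, 999_999])   # global node ids from a sampler
collected = feature[n_id]               # slicing gathers the feature rows
print(collected.shape)                  # torch.Size([3, 128])
```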
40 | # Performance Benchmark 41 | 42 | As far as we know, there is no public GNN system that directly supports using RDMA for feature collection. `DGL` uses [TensorPipe](https://github.com/pytorch/tensorpipe) as its RPC backend; [TensorPipe](https://github.com/pytorch/tensorpipe) itself supports RDMA, but `DGL` has not integrated this feature. Since [TensorPipe](https://github.com/pytorch/tensorpipe) is also the [official RPC backend](https://pytorch.org/docs/stable/rpc.html#torch.distributed.rpc.init_rpc) of PyTorch, we compare the feature collection performance of `Quiver-Feature` with a `PyTorch-RPC based solution`. 43 | 44 | We use 2 machines with a 100Gbps IB network between them. We partition the data uniformly and start M GPU training processes on each machine (which we refer to as `2 Machines 2M GPUs` in the result chart below). We benchmark the feature collection performance of `Quiver-Feature` and the `PyTorch-RPC based solution`, and `Quiver-Feature` is 5x better than the `PyTorch-RPC based solution` in all settings. 45 | 46 | ![img](docs/imgs/e2e_feature_collection.png) 47 | 48 | # Install 49 | 50 | ## Install From Source (Recommended For Now) 51 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver). 52 | 53 | 2. Install Quiver-Feature from source: 54 | 55 |         $ git clone git@github.com:quiver-team/quiver-feature 56 |         $ cd quiver-feature/ 57 |         $ pip install . 58 | 59 | ## Pip Install 60 | 61 | 1. Install [Quiver](https://github.com/quiver-team/torch-quiver). 62 | 63 | 2. Install the `Quiver-Feature` pip package: 64 | 65 |         $ pip install quiver-feature 66 | 67 | We have tested Quiver-Feature with the following setup: 68 | 69 | - OS: Ubuntu 18.04, Ubuntu 20.04 70 | 71 | - CUDA: 10.2, 11.1 72 | 73 | - GPU: Nvidia P100, V100, Titan X, A6000 74 | 75 | ## Test Install 76 | 77 | You can download Quiver-Feature's examples to test the installation: 78 | 79 |     $ git clone git@github.com:quiver-team/quiver-feature.git 80 |     $ cd quiver-feature/examples/reddit 81 |     $ python3 distribute_training.py 82 | 83 | A successful run should contain the following line: 84 | 85 | `Starting Server With: xxxx` 86 | 87 | 88 | # Quick Start 89 | 90 | To use Quiver-Feature, you need to replace PyG's feature tensors with `quiver_feature.DistTensorPGAS`. This usually requires only a few changes in an existing PyG program, with the following 4 steps on each machine: 91 | 92 | - Load the feature partition and metadata that belong to the current machine. 93 | 94 | - Exchange feature partition metadata with the other processes using `quiver_feature.DistHelper`. 95 | 96 | - Create a `quiver_feature.DistTensorPGAS` from the local feature partition and metadata. 97 | 98 | - Pass the `quiver_feature.DistTensorPGAS` built above as a parameter to each training process for feature collection. 99 | 100 | Here is a simple example of using Quiver-Feature in a PyG program. You can check the [original script](examples/reddit/distribute_training.py) for more details. 101 | 102 | ```python 103 | 104 | def train_process(rank, dist_tensor): 105 |     ... 106 |     for batch_size, n_id, adjs in train_loader: 107 |         ... 108 |         # Use DistTensorPGAS just like a torch.Tensor 109 |         collected_feature = dist_tensor[n_id] 110 |         ... 111 | 112 | if __name__ == "__main__": 113 | 114 |     # Step 1: Load the local data partition 115 |     local_tensor, cached_range, local_range = load_partitioned_data(...) 116 | 117 |     # Step 2: Exchange TensorEndPoint information 118 |     dist_helper = DistHelper(...) 119 |     tensor_endpoints = dist_helper.exchange_tensor_endpoints_info() 120 | 121 | 122 |     # Step 3: Build DistTensorPGAS from the local feature partition 123 |     dist_tensor = DistTensorPGAS(...)
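    # (Hypothetical note, not part of the original script: the elided
    # DistTensorPGAS(...) arguments wire together the local partition from
    # Step 1 and the tensor_endpoints exchanged in Step 2; see
    # examples/reddit/distribute_training.py for the actual call.)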
124 | 125 | 126 |     # Step 4: Spawn training processes using the DistTensorPGAS as a parameter 127 |     mp.spawn( 128 |         train_process, 129 |         args=(..., dist_tensor, ...), 130 |         nprocs=args.device_per_node, 131 |         join=True 132 |     ) 133 |     ... 134 | 135 | ``` 136 | 137 | # License 138 | 139 | Quiver-Feature is licensed under the Apache License, Version 2.0. 140 | 141 | # Citation 142 | If you use Quiver-Feature in your publication, please cite it by using the following BibTeX entry. 143 | 144 |     @Misc{Quiver-Feature, 145 |       institution = {Quiver Team}, 146 |       title = {Quiver-Feature: A High Performance Feature Collection Component For Training GNN On Extremely Large Graphs}, 147 |       howpublished = {\url{https://github.com/quiver-team/quiver-feature}}, 148 |       year = {2022} 149 |     } -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | mkdir -p build 2 | cd build 3 | Torch_DIR=`python -c 'import torch;print(torch.utils.cmake_prefix_path)'` \ 4 | cmake -DBUILD_TEST=1 -DCMAKE_INSTALL_PREFIX=. .. 5 | make install 6 | -------------------------------------------------------------------------------- /csrc/include/infinity/core/Configuration.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Core - Configuration 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef CORE_CONFIGURATION_H_ 10 | #define CORE_CONFIGURATION_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace core { 16 | 17 | class Configuration { 18 | 19 | public: 20 | 21 | 	/** 22 | 	 * Queue length settings 23 | 	 */ 24 | 25 | 	static const uint32_t SEND_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE 26 | 27 | 	static const uint32_t RECV_COMPLETION_QUEUE_LENGTH = 8191; // Must be less than MAX_CQE 28 | 29 | 	static const uint32_t SHARED_RECV_QUEUE_LENGTH = 8191; // Must be less than MAX_SRQ_WR 30 | 31 | 	static const uint32_t MAX_NUMBER_OF_OUTSTANDING_REQUESTS = 8191; // Must be less than (MAX_QP_WR * MAX_QP) 32 | 	// Since we use one single shared receive queue, 33 | 	// this number should be less than MAX_SRQ_WR 34 | 35 | 	static const uint32_t MAX_NUMBER_OF_SGE_ELEMENTS = 1; // Must be less than MAX_SGE 36 | 37 | public: 38 | 39 | 	/** 40 | 	 * System settings 41 | 	 */ 42 | 43 | 	static const uint32_t PAGE_SIZE = 4096; // Memory regions will be page aligned by the Infinity library 44 | 45 | 	static const uint32_t MAX_CONNECTION_USER_DATA_SIZE = 1024; // Size of the user data which can be transmitted when establishing a connection 46 | 47 | 	static constexpr const char* DEFAULT_IB_DEVICE = "ib0"; // Default name of IB device 48 | 49 | }; 50 | 51 | } /* namespace core */ 52 | } /* namespace infinity */ 53 | 54 | #endif /* CORE_CONFIGURATION_H_ */ 55 | -------------------------------------------------------------------------------- /csrc/include/infinity/core/Context.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Core - Context 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef CORE_CONTEXT_H_ 10 | #define CORE_CONTEXT_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | namespace infinity { 18 | namespace memory { 19 | class Region; 20 | class Buffer; 21 | class Atomic; 22 | class RegisteredMemory; 23 | } 24 | } 25 | 26 | namespace infinity { 27 | namespace queues { 28 | class QueuePair; 29 |
class QueuePairFactory; 30 | } 31 | } 32 | 33 | namespace infinity { 34 | namespace requests { 35 | class RequestToken; 36 | } 37 | } 38 | 39 | namespace infinity { 40 | namespace core { 41 | 42 | typedef struct { 43 | infinity::memory::Buffer *buffer; 44 | uint32_t bytesWritten; 45 | uint32_t immediateValue; 46 | bool immediateValueValid; 47 | infinity::queues::QueuePair *queuePair; 48 | } receive_element_t; 49 | 50 | class Context { 51 | 52 | friend class infinity::memory::Region; 53 | friend class infinity::memory::Buffer; 54 | friend class infinity::memory::Atomic; 55 | friend class infinity::memory::RegisteredMemory; 56 | friend class infinity::queues::QueuePair; 57 | friend class infinity::queues::QueuePairFactory; 58 | friend class infinity::requests::RequestToken; 59 | 60 | public: 61 | 62 | /** 63 | * Constructors 64 | */ 65 | Context(uint16_t device = 0, uint16_t devicePort = 1); 66 | 67 | /** 68 | * Destructor 69 | */ 70 | ~Context(); 71 | 72 | public: 73 | 74 | /** 75 | * Check if receive operation completed 76 | */ 77 | bool receive(receive_element_t *receiveElement); 78 | bool receive(infinity::memory::Buffer **buffer, uint32_t *bytesWritten, uint32_t *immediateValue, bool *immediateValueValid, infinity::queues::QueuePair **queuePair = NULL); 79 | 80 | /** 81 | * Post a new buffer for receiving messages 82 | */ 83 | void postReceiveBuffer(infinity::memory::Buffer *buffer); 84 | 85 | /* 86 | Poll expected signal from completion queue 87 | */ 88 | int batchPollSendCompletionQueue(int poll_batch, int expected_num, ibv_wc* wc, bool force_all); 89 | 90 | public: 91 | 92 | infinity::requests::RequestToken * defaultRequestToken; 93 | infinity::memory::Atomic * defaultAtomic; 94 | 95 | protected: 96 | 97 | /** 98 | * Returns ibVerbs context 99 | */ 100 | ibv_context * getInfiniBandContext(); 101 | 102 | /** 103 | * Returns local device id 104 | */ 105 | uint16_t getLocalDeviceId(); 106 | 107 | /** 108 | * Returns device port 109 | */ 110 | uint16_t getDevicePort(); 111 | 112 | /** 113 | * Returns ibVerbs protection domain 114 | */ 115 | ibv_pd * getProtectionDomain(); 116 | 117 | protected: 118 | 119 | /** 120 | * Check if send operation completed 121 | */ 122 | bool pollSendCompletionQueue(); 123 | 124 | /** 125 | * Returns ibVerbs completion queue for sending 126 | */ 127 | ibv_cq * getSendCompletionQueue(); 128 | 129 | /** 130 | * Returns ibVerbs completion queue for receiving 131 | */ 132 | ibv_cq * getReceiveCompletionQueue(); 133 | 134 | /** 135 | * Returns ibVerbs shared receive queue 136 | */ 137 | ibv_srq * getSharedReceiveQueue(); 138 | 139 | protected: 140 | 141 | /** 142 | * IB context and protection domain 143 | */ 144 | ibv_context *ibvContext; 145 | ibv_pd *ibvProtectionDomain; 146 | 147 | /** 148 | * Local device id and port 149 | */ 150 | ibv_device *ibvDevice; 151 | uint16_t ibvLocalDeviceId; 152 | uint16_t ibvDevicePort; 153 | 154 | /** 155 | * IB send and receive completion queues 156 | */ 157 | ibv_cq *ibvSendCompletionQueue; 158 | ibv_cq *ibvReceiveCompletionQueue; 159 | ibv_srq *ibvSharedReceiveQueue; 160 | 161 | protected: 162 | 163 | void registerQueuePair(infinity::queues::QueuePair *queuePair); 164 | std::unordered_map queuePairMap; 165 | 166 | }; 167 | 168 | } /* namespace core */ 169 | } /* namespace infinity */ 170 | 171 | #endif /* CORE_CONTEXT_H_ */ 172 | -------------------------------------------------------------------------------- /csrc/include/infinity/infinity.h: -------------------------------------------------------------------------------- 1 | 
/** 2 | * Infinity - A C++ RDMA library for InfiniBand 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef INFINITY_H_ 10 | #define INFINITY_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #endif /* INFINITY_H_ */ 27 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Atomic.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Atomic 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Atomic.h" 10 | 11 | #include 12 | 13 | namespace infinity { 14 | namespace memory { 15 | 16 | Atomic::Atomic(infinity::core::Context* context) { 17 | 18 | this->context = context; 19 | this->sizeInBytes = sizeof(uint64_t); 20 | this->memoryRegionType = RegionType::ATOMIC; 21 | 22 | this->value = 0; 23 | this->data = &value; 24 | 25 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), &(this->value), this->sizeInBytes, 26 | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); 27 | 28 | 29 | } 30 | 31 | uint64_t infinity::memory::Atomic::getValue() { 32 | 33 | return this->value; 34 | 35 | } 36 | 37 | void infinity::memory::Atomic::setValueNonAtomic(uint64_t value) { 38 | 39 | this->value = value; 40 | 41 | } 42 | 43 | 44 | Atomic::~Atomic() { 45 | 46 | ibv_dereg_mr(this->ibvMemoryRegion); 47 | 48 | } 49 | 50 | } /* namespace memory */ 51 | } /* namespace infinity */ 52 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Atomic.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Atomic 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_ATOMIC_H_ 10 | #define MEMORY_ATOMIC_H_ 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class Atomic : public Region { 22 | 23 | public: 24 | 25 | Atomic(infinity::core::Context *context); 26 | virtual ~Atomic(); 27 | 28 | public: 29 | 30 | uint64_t getValue(); 31 | 32 | void setValueNonAtomic(uint64_t value); 33 | 34 | protected: 35 | 36 | uint64_t value; 37 | 38 | 39 | }; 40 | 41 | } /* namespace memory */ 42 | } /* namespace infinity */ 43 | 44 | #endif /* MEMORY_ATOMIC_H_ */ 45 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Buffer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Buffer 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Buffer.h" 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) 18 | 19 | namespace infinity { 20 | namespace memory { 21 | 22 | Buffer::Buffer(infinity::core::Context* context, uint64_t sizeInBytes) { 23 | this->context = context; 24 | this->sizeInBytes = sizeInBytes; 25 | this->memoryRegionType = RegionType::BUFFER; 26 | 27 | int res = posix_memalign( 28 | &(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes); 29 | INFINITY_ASSERT( 30 | res == 0, 31 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n"); 32 | 33 | memset(this->data, 0, sizeInBytes); 34 | 35 | this->ibvMemoryRegion = ibv_reg_mr( 36 | this->context->getProtectionDomain(), this->data, this->sizeInBytes, 37 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 38 | IBV_ACCESS_REMOTE_READ); 39 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 40 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 41 | 42 | this->memoryAllocated = true; 43 | this->memoryRegistered = true; 44 | } 45 | 46 | Buffer::Buffer(infinity::core::Context* context, 47 | infinity::memory::RegisteredMemory* memory, 48 | uint64_t offset, 49 | uint64_t sizeInBytes) { 50 | this->context = context; 51 | this->sizeInBytes = sizeInBytes; 52 | this->memoryRegionType = RegionType::BUFFER; 53 | 54 | this->data = reinterpret_cast(memory->getData()) + offset; 55 | this->ibvMemoryRegion = memory->getRegion(); 56 | 57 | this->memoryAllocated = false; 58 | this->memoryRegistered = false; 59 | } 60 | 61 | Buffer::Buffer(infinity::core::Context* context, 62 | void* memory, 63 | uint64_t sizeInBytes) { 64 | this->context = context; 65 | this->sizeInBytes = sizeInBytes; 66 | this->memoryRegionType = RegionType::BUFFER; 67 | 68 | this->data = memory; 69 | this->ibvMemoryRegion = ibv_reg_mr( 70 | this->context->getProtectionDomain(), this->data, this->sizeInBytes, 71 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 72 | IBV_ACCESS_REMOTE_READ); 73 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 74 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 75 | 76 | this->memoryAllocated = false; 77 | this->memoryRegistered = true; 78 | } 79 | 80 | Buffer::Buffer(infinity::core::Context* context, 81 | uint64_t sizeInBytes, 82 | int device) { 83 | this->context = context; 84 | this->sizeInBytes = sizeInBytes; 85 | this->memoryRegionType = RegionType::BUFFER; 86 | 87 | cudaSetDevice(device); 88 | int cap = sizeInBytes + infinity::core::Configuration::PAGE_SIZE; 89 | int res = cudaMalloc(&this->data, cap); 90 | INFINITY_ASSERT( 91 | res == 0, 92 | "[INFINITY][MEMORY][BUFFER] Cannot allocate and align buffer.\n"); 93 | 94 | void* temp = this->data; 95 | if (uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE) { 96 | uint64_t head = 97 | infinity::core::Configuration::PAGE_SIZE - 98 | uint64_t(this->data) % infinity::core::Configuration::PAGE_SIZE; 99 | temp += head; 100 | } 101 | cudaMemset(this->data, 0, cap); 102 | 103 | this->ibvMemoryRegion = 104 | ibv_reg_mr(this->context->getProtectionDomain(), temp, this->sizeInBytes, 105 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 106 | IBV_ACCESS_REMOTE_READ); 107 | INFINITY_ASSERT(this->ibvMemoryRegion != NULL, 108 | "[INFINITY][MEMORY][BUFFER] Registration failed.\n"); 109 | 110 | this->memoryAllocated = true; 111 | this->memoryRegistered = true; 112 | this->cuda = true; 113 | } 114 | 115 | Buffer::~Buffer() { 116 | if (this->memoryRegistered) { 117 | ibv_dereg_mr(this->ibvMemoryRegion); 118 | } 119 | if (this->memoryAllocated) { 120 | if (!this->cuda) { 121 | free(this->data); 122 | } else { 123 | cudaFree(this->data); 124 | } 125 | } 
126 | } 127 | 128 | void* Buffer::getData() { 129 | return reinterpret_cast(this->getAddress()); 130 | } 131 | 132 | void Buffer::resize(uint64_t newSize, void* newData) { 133 | void* oldData = this->data; 134 | uint32_t oldSize = this->sizeInBytes; 135 | 136 | if (newData == NULL) { 137 | newData = this->data; 138 | } 139 | 140 | if (oldData != newData) { 141 | uint64_t copySize = MIN(newSize, oldSize); 142 | memcpy(newData, oldData, copySize); 143 | } 144 | 145 | if (memoryRegistered) { 146 | ibv_dereg_mr(this->ibvMemoryRegion); 147 | this->ibvMemoryRegion = 148 | ibv_reg_mr(this->context->getProtectionDomain(), newData, newSize, 149 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 150 | IBV_ACCESS_REMOTE_READ); 151 | this->data = newData; 152 | this->sizeInBytes = newSize; 153 | } else { 154 | INFINITY_ASSERT(false, 155 | "[INFINITY][MEMORY][BUFFER] You can only resize memory " 156 | "which has registered by this buffer.\n"); 157 | } 158 | } 159 | 160 | } /* namespace memory */ 161 | } /* namespace infinity */ 162 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Buffer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Buffer 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_BUFFER_H_ 10 | #define MEMORY_BUFFER_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class Buffer : public Region { 22 | public: 23 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes); 24 | Buffer(infinity::core::Context* context, uint64_t sizeInBytes, int device); 25 | Buffer(infinity::core::Context* context, 26 | infinity::memory::RegisteredMemory* memory, 27 | uint64_t offset, 28 | uint64_t sizeInBytes); 29 | Buffer(infinity::core::Context* context, void* memory, uint64_t sizeInBytes); 30 | ~Buffer(); 31 | 32 | public: 33 | void* getData(); 34 | void resize(uint64_t newSize, void* newData = NULL); 35 | 36 | protected: 37 | bool memoryRegistered; 38 | bool memoryAllocated; 39 | bool cuda; 40 | }; 41 | 42 | } /* namespace memory */ 43 | } /* namespace infinity */ 44 | 45 | #endif /* MEMORY_BUFFER_H_ */ 46 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Region.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Buffer.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace infinity { 15 | namespace memory { 16 | 17 | Region::~Region() { 18 | // To be overwritten in sub class 19 | } 20 | 21 | RegionToken* Region::createRegionToken() { 22 | return new RegionToken(this, getMemoryRegionType(), getSizeInBytes(), getAddress(), getLocalKey(), getRemoteKey()); 23 | } 24 | 25 | RegionToken * Region::createRegionToken(uint64_t offset) { 26 | return new RegionToken(this, getMemoryRegionType(), getRemainingSizeInBytes(offset), getAddressWithOffset(offset), getLocalKey(), getRemoteKey()); 27 | } 28 | 29 | RegionToken * Region::createRegionToken(uint64_t offset, uint64_t size) { 30 | return new RegionToken(this, getMemoryRegionType(), size, getAddressWithOffset(offset), getLocalKey(), getRemoteKey()); 31 | } 32 | 33 | RegionType Region::getMemoryRegionType() { 34 | return this->memoryRegionType; 35 | } 36 
| 37 | uint64_t Region::getSizeInBytes() { 38 | return this->sizeInBytes; 39 | } 40 | 41 | uint64_t Region::getRemainingSizeInBytes(uint64_t offset) { 42 | return this->sizeInBytes - offset; 43 | } 44 | 45 | uint64_t Region::getAddress() { 46 | return reinterpret_cast(this->data); 47 | } 48 | 49 | uint64_t Region::getAddressWithOffset(uint64_t offset) { 50 | return reinterpret_cast(this->data) + offset; 51 | } 52 | 53 | uint32_t Region::getLocalKey() { 54 | return this->ibvMemoryRegion->lkey; 55 | } 56 | 57 | uint32_t Region::getRemoteKey() { 58 | return this->ibvMemoryRegion->rkey; 59 | } 60 | 61 | } /* namespace memory */ 62 | } /* namespace infinity */ 63 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/Region.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGION_H_ 10 | #define MEMORY_REGION_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace memory { 20 | 21 | class RegionToken; 22 | 23 | class Region { 24 | 25 | public: 26 | 27 | virtual ~Region(); 28 | 29 | RegionToken * createRegionToken(); 30 | RegionToken * createRegionToken(uint64_t offset); 31 | RegionToken * createRegionToken(uint64_t offset, uint64_t size); 32 | 33 | public: 34 | 35 | RegionType getMemoryRegionType(); 36 | uint64_t getSizeInBytes(); 37 | uint64_t getRemainingSizeInBytes(uint64_t offset); 38 | uint64_t getAddress(); 39 | uint64_t getAddressWithOffset(uint64_t offset); 40 | uint32_t getLocalKey(); 41 | uint32_t getRemoteKey(); 42 | 43 | protected: 44 | 45 | infinity::core::Context* context; 46 | RegionType memoryRegionType; 47 | ibv_mr *ibvMemoryRegion; 48 | 49 | protected: 50 | 51 | void * data; 52 | uint64_t sizeInBytes; 53 | 54 | }; 55 | 56 | } /* namespace memory */ 57 | } /* namespace infinity */ 58 | 59 | #endif /* MEMORY_REGION_H_ */ 60 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionToken.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | 11 | namespace infinity { 12 | namespace memory { 13 | 14 | RegionToken::RegionToken() : 15 | memoryRegion (NULL), 16 | memoryRegionType (UNKNOWN), 17 | sizeInBytes(0), 18 | address(0), 19 | localKey(0), 20 | remoteKey(0) { 21 | 22 | // Nothing to do here 23 | 24 | } 25 | 26 | RegionToken::RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey) : 27 | memoryRegion (memoryRegion), 28 | memoryRegionType (memoryRegionType), 29 | sizeInBytes(sizeInBytes), 30 | address(address), 31 | localKey(localKey), 32 | remoteKey(remoteKey) { 33 | 34 | // Nothing to do here 35 | 36 | } 37 | 38 | Region* RegionToken::getMemoryRegion() { 39 | return memoryRegion; 40 | } 41 | 42 | RegionType RegionToken::getMemoryRegionType() { 43 | return this->memoryRegionType; 44 | } 45 | 46 | uint64_t RegionToken::getSizeInBytes() { 47 | return this->sizeInBytes; 48 | } 49 | 50 | uint64_t RegionToken::getRemainingSizeInBytes(uint64_t offset) { 51 | return this->sizeInBytes-offset; 52 | } 53 | 54 | uint64_t RegionToken::getAddress() { 55 | return address; 
56 | } 57 | 58 | uint64_t RegionToken::getAddressWithOffset(uint64_t offset) { 59 | return address + offset; 60 | } 61 | 62 | uint32_t RegionToken::getLocalKey() { 63 | return this->localKey; 64 | } 65 | 66 | uint32_t RegionToken::getRemoteKey() { 67 | return this->remoteKey; 68 | } 69 | 70 | 71 | } /* namespace memory */ 72 | } /* namespace infinity */ 73 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionToken.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGIONTOKEN_H_ 10 | #define MEMORY_REGIONTOKEN_H_ 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | namespace infinity { 17 | namespace memory { 18 | 19 | class RegionToken { 20 | 21 | public: 22 | 23 | RegionToken(); 24 | RegionToken(Region *memoryRegion, RegionType memoryRegionType, uint64_t sizeInBytes, uint64_t address, uint32_t localKey, uint32_t remoteKey); 25 | 26 | public: 27 | 28 | Region * getMemoryRegion(); 29 | RegionType getMemoryRegionType(); 30 | uint64_t getSizeInBytes(); 31 | uint64_t getRemainingSizeInBytes(uint64_t offset); 32 | uint64_t getAddress(); 33 | uint64_t getAddressWithOffset(uint64_t offset); 34 | uint32_t getLocalKey(); 35 | uint32_t getRemoteKey(); 36 | 37 | protected: 38 | 39 | Region *memoryRegion; 40 | const RegionType memoryRegionType; 41 | const uint64_t sizeInBytes; 42 | const uint64_t address; 43 | const uint32_t localKey; 44 | const uint32_t remoteKey; 45 | 46 | }; 47 | 48 | } /* namespace memory */ 49 | } /* namespace infinity */ 50 | 51 | #endif /* MEMORY_REGIONTOKEN_H_ */ 52 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegionType.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Region Type 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef MEMORY_REGIONTYPE_H_ 10 | #define MEMORY_REGIONTYPE_H_ 11 | 12 | namespace infinity { 13 | namespace memory { 14 | 15 | enum RegionType {BUFFER, ATOMIC, UNKNOWN}; 16 | 17 | } /* namespace memory */ 18 | } /* namespace infinity */ 19 | 20 | #endif /* MEMORY_REGIONTYPE_H_ */ 21 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegisteredMemory.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Memory - Registered Memory 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "RegisteredMemory.h" 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | namespace infinity { 18 | namespace memory { 19 | 20 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, uint64_t sizeInBytes) { 21 | 22 | this->context = context; 23 | this->sizeInBytes = sizeInBytes; 24 | this->memoryAllocated = true; 25 | 26 | int res = posix_memalign(&(this->data), infinity::core::Configuration::PAGE_SIZE, sizeInBytes); 27 | INFINITY_ASSERT(res == 0, "[INFINITY][MEMORY][REGISTERED] Cannot allocate and align buffer.\n"); 28 | 29 | memset(this->data, 0, sizeInBytes); 30 | 31 | this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes, 32 | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | 
IBV_ACCESS_REMOTE_READ); 33 | 	INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n"); 34 | } 35 | 36 | RegisteredMemory::RegisteredMemory(infinity::core::Context* context, void *data, uint64_t sizeInBytes) { 37 | 38 | 	this->context = context; 39 | 	this->sizeInBytes = sizeInBytes; 40 | 	this->memoryAllocated = false; 41 | 42 | 	this->data = data; 43 | 44 | 	this->ibvMemoryRegion = ibv_reg_mr(this->context->getProtectionDomain(), this->data, this->sizeInBytes, 45 | 			IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); 46 | 	INFINITY_ASSERT(this->ibvMemoryRegion != NULL, "[INFINITY][MEMORY][REGISTERED] Registration failed.\n"); 47 | } 48 | 49 | 50 | RegisteredMemory::~RegisteredMemory() { 51 | 52 | 	ibv_dereg_mr(this->ibvMemoryRegion); 53 | 54 | 	if(this->memoryAllocated) { 55 | 		free(this->data); 56 | 	} 57 | 58 | } 59 | 60 | void* RegisteredMemory::getData() { 61 | 62 | 	return this->data; 63 | 64 | } 65 | 66 | uint64_t RegisteredMemory::getSizeInBytes() { 67 | 68 | 	return this->sizeInBytes; 69 | 70 | } 71 | 72 | ibv_mr* RegisteredMemory::getRegion() { 73 | 74 | 	return this->ibvMemoryRegion; 75 | 76 | } 77 | 78 | } /* namespace memory */ 79 | } /* namespace infinity */ 80 | -------------------------------------------------------------------------------- /csrc/include/infinity/memory/RegisteredMemory.h: -------------------------------------------------------------------------------- 1 | /* 2 |  * Memory - Registered Memory 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef INFINITY_MEMORY_REGISTEREDMEMORY_H_ 10 | #define INFINITY_MEMORY_REGISTEREDMEMORY_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace memory { 16 | 17 | class RegisteredMemory { 18 | 19 | public: 20 | 21 | 	RegisteredMemory(infinity::core::Context *context, uint64_t sizeInBytes); 22 | 	RegisteredMemory(infinity::core::Context *context, void *data, uint64_t sizeInBytes); 23 | 	~RegisteredMemory(); 24 | 25 | 	void * getData(); 26 | 27 | 	uint64_t getSizeInBytes(); 28 | 29 | 	ibv_mr * getRegion(); 30 | 31 | 32 | protected: 33 | 34 | 	infinity::core::Context* context; 35 | 36 | 	void *data; 37 | 	uint64_t sizeInBytes; 38 | 39 | 	ibv_mr *ibvMemoryRegion; 40 | 41 | protected: 42 | 43 | 	bool memoryAllocated; 44 | 45 | }; 46 | 47 | } /* namespace memory */ 48 | } /* namespace infinity */ 49 | 50 | #endif /* INFINITY_MEMORY_REGISTEREDMEMORY_H_ */ 51 | -------------------------------------------------------------------------------- /csrc/include/infinity/queues/QueuePair.h: -------------------------------------------------------------------------------- 1 | /** 2 |  * Queues - Queue Pair 3 |  * 4 |  * (c) 2018 Claude Barthels, ETH Zurich 5 |  * Contact: claudeb@inf.ethz.ch 6 |  * 7 |  */ 8 | 9 | #ifndef QUEUES_QUEUEPAIR_H_ 10 | #define QUEUES_QUEUEPAIR_H_ 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | namespace infinity { 22 | namespace queues { 23 | class QueuePairFactory; 24 | } 25 | }  // namespace infinity 26 | 27 | namespace infinity { 28 | namespace queues { 29 | struct SendRequestBuffer { 30 |   std::vector<ibv_sge> sges; 31 |   std::vector<ibv_send_wr> requests; 32 |   SendRequestBuffer() {} 33 |   SendRequestBuffer(int num) { 34 |     sges.resize(num); 35 |     requests.resize(num); 36 |   } 37 |   void resize(int num) { 38 |     sges.resize(num); 39 |     requests.resize(num); 40 |   } 41 |   void reset() { 42 |     memset(sges.data(), 0, sges.size() * sizeof(ibv_sge)); 43 |     memset(requests.data(), 0, requests.size() *
sizeof(ibv_send_wr)); 44 | } 45 | }; 46 | } // namespace queues 47 | } // namespace infinity 48 | 49 | namespace infinity { 50 | namespace queues { 51 | 52 | struct IbvWcBuffer { 53 | ibv_wc* wc; 54 | int size_; 55 | IbvWcBuffer() {} 56 | IbvWcBuffer(int size) { 57 | wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size); 58 | size_ = size; 59 | } 60 | void resize(int size) { 61 | wc = (ibv_wc*)malloc(sizeof(ibv_wc) * size); 62 | size_ = size; 63 | } 64 | 65 | ibv_wc* ptr() { return wc; } 66 | int size() { return size_; } 67 | }; 68 | } // namespace queues 69 | } // namespace infinity 70 | 71 | namespace infinity { 72 | namespace queues { 73 | 74 | class OperationFlags { 75 | public: 76 | bool fenced; 77 | bool signaled; 78 | bool inlined; 79 | 80 | OperationFlags() : fenced(false), signaled(false), inlined(false){}; 81 | 82 | /** 83 | * Turn the bools into a bit field. 84 | */ 85 | int ibvFlags(); 86 | }; 87 | 88 | class QueuePair { 89 | friend class infinity::queues::QueuePairFactory; 90 | 91 | public: 92 | /** 93 | * Constructor 94 | */ 95 | QueuePair(infinity::core::Context* context); 96 | 97 | /** 98 | * Destructor 99 | */ 100 | ~QueuePair(); 101 | 102 | protected: 103 | /** 104 | * Activation methods 105 | */ 106 | 107 | void activate(uint16_t remoteDeviceId, 108 | uint32_t remoteQueuePairNumber, 109 | uint32_t remoteSequenceNumber); 110 | void setRemoteUserData(void* userData, uint32_t userDataSize); 111 | 112 | public: 113 | /** 114 | * User data received during connection setup 115 | */ 116 | 117 | bool hasUserData(); 118 | uint32_t getUserDataSize(); 119 | void* getUserData(); 120 | 121 | public: 122 | /** 123 | * Queue pair information 124 | */ 125 | 126 | uint16_t getLocalDeviceId(); 127 | uint32_t getQueuePairNumber(); 128 | uint32_t getSequenceNumber(); 129 | 130 | public: 131 | /** 132 | * Buffer operations 133 | */ 134 | 135 | void send(infinity::memory::Buffer* buffer, 136 | infinity::requests::RequestToken* requestToken = NULL); 137 | void send(infinity::memory::Buffer* buffer, 138 | uint32_t sizeInBytes, 139 | infinity::requests::RequestToken* requestToken = NULL); 140 | void send(infinity::memory::Buffer* buffer, 141 | uint64_t localOffset, 142 | uint32_t sizeInBytes, 143 | OperationFlags flags, 144 | infinity::requests::RequestToken* requestToken = NULL); 145 | 146 | void write(infinity::memory::Buffer* buffer, 147 | infinity::memory::RegionToken* destination, 148 | infinity::requests::RequestToken* requestToken = NULL); 149 | void write(infinity::memory::Buffer* buffer, 150 | infinity::memory::RegionToken* destination, 151 | uint32_t sizeInBytes, 152 | infinity::requests::RequestToken* requestToken = NULL); 153 | void write(infinity::memory::Buffer* buffer, 154 | uint64_t localOffset, 155 | infinity::memory::RegionToken* destination, 156 | uint64_t remoteOffset, 157 | uint32_t sizeInBytes, 158 | OperationFlags flags, 159 | infinity::requests::RequestToken* requestToken = NULL); 160 | 161 | void read(infinity::memory::Buffer* buffer, 162 | infinity::memory::RegionToken* source, 163 | infinity::requests::RequestToken* requestToken = NULL); 164 | void read(infinity::memory::Buffer* buffer, 165 | infinity::memory::RegionToken* source, 166 | uint32_t sizeInBytes, 167 | infinity::requests::RequestToken* requestToken = NULL); 168 | void read(infinity::memory::Buffer* buffer, 169 | uint64_t localOffset, 170 | infinity::memory::RegionToken* source, 171 | uint64_t remoteOffset, 172 | uint32_t sizeInBytes, 173 | OperationFlags flags, 174 | infinity::requests::RequestToken* requestToken = 
NULL); 175 | 176 | public: 177 | /** 178 | * Complex buffer operations 179 | */ 180 | 181 | void multiWrite(infinity::memory::Buffer** buffers, 182 | uint32_t* sizesInBytes, 183 | uint64_t* localOffsets, 184 | uint32_t numberOfElements, 185 | infinity::memory::RegionToken* destination, 186 | uint64_t remoteOffset, 187 | OperationFlags flags, 188 | infinity::requests::RequestToken* requestToken = NULL); 189 | 190 | void multiRead(uint32_t batch_size, 191 | infinity::memory::Buffer* buffer, 192 | int64_t* localOffset, 193 | infinity::memory::RegionToken* source, 194 | int64_t* remoteOffset, 195 | uint32_t sizeInBytes, 196 | OperationFlags send_flags, 197 | infinity::requests::RequestToken* requestToken, 198 | infinity::queues::SendRequestBuffer& send_buffer); 199 | 200 | void sendWithImmediate(infinity::memory::Buffer* buffer, 201 | uint64_t localOffset, 202 | uint32_t sizeInBytes, 203 | uint32_t immediateValue, 204 | OperationFlags flags, 205 | infinity::requests::RequestToken* requestToken = NULL); 206 | 207 | void writeWithImmediate( 208 | infinity::memory::Buffer* buffer, 209 | uint64_t localOffset, 210 | infinity::memory::RegionToken* destination, 211 | uint64_t remoteOffset, 212 | uint32_t sizeInBytes, 213 | uint32_t immediateValue, 214 | OperationFlags flags, 215 | infinity::requests::RequestToken* requestToken = NULL); 216 | 217 | void multiWriteWithImmediate( 218 | infinity::memory::Buffer** buffers, 219 | uint32_t* sizesInBytes, 220 | uint64_t* localOffsets, 221 | uint32_t numberOfElements, 222 | infinity::memory::RegionToken* destination, 223 | uint64_t remoteOffset, 224 | uint32_t immediateValue, 225 | OperationFlags flags, 226 | infinity::requests::RequestToken* requestToken = NULL); 227 | 228 | public: 229 | /** 230 | * Atomic value operations 231 | */ 232 | 233 | void compareAndSwap(infinity::memory::RegionToken* destination, 234 | uint64_t compare, 235 | uint64_t swap, 236 | infinity::requests::RequestToken* requestToken = NULL); 237 | void compareAndSwap(infinity::memory::RegionToken* destination, 238 | infinity::memory::Atomic* previousValue, 239 | uint64_t compare, 240 | uint64_t swap, 241 | OperationFlags flags, 242 | infinity::requests::RequestToken* requestToken = NULL); 243 | void fetchAndAdd(infinity::memory::RegionToken* destination, 244 | uint64_t add, 245 | infinity::requests::RequestToken* requestToken = NULL); 246 | void fetchAndAdd(infinity::memory::RegionToken* destination, 247 | infinity::memory::Atomic* previousValue, 248 | uint64_t add, 249 | OperationFlags flags, 250 | infinity::requests::RequestToken* requestToken = NULL); 251 | 252 | protected: 253 | infinity::core::Context* const context; 254 | 255 | ibv_qp* ibvQueuePair; 256 | uint32_t sequenceNumber; 257 | 258 | void* userData; 259 | uint32_t userDataSize; 260 | }; 261 | 262 | } /* namespace queues */ 263 | } /* namespace infinity */ 264 | 265 | #endif /* QUEUES_QUEUEPAIR_H_ */ 266 | -------------------------------------------------------------------------------- /csrc/include/infinity/queues/QueuePairFactory.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Queues - Queue Pair Factory 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef QUEUES_QUEUEPAIRFACTORY_H_ 10 | #define QUEUES_QUEUEPAIRFACTORY_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace queues { 20 | 21 | class QueuePairFactory { 22 | public: 23 | 24 | 
QueuePairFactory(infinity::core::Context *context); 25 | ~QueuePairFactory(); 26 | 27 | /** 28 | * Bind to port for listening to incoming connections 29 | */ 30 | void bindToPort(uint16_t port); 31 | 32 | /** 33 | * Accept incoming connection request (passive side) 34 | */ 35 | QueuePair * acceptIncomingConnection(void *userData = NULL, uint32_t userDataSizeInBytes = 0); 36 | 37 | /** 38 | * Connect to remote machine (active side) 39 | */ 40 | QueuePair * connectToRemoteHost(const char* hostAddress, uint16_t port, void *userData = NULL, uint32_t userDataSizeInBytes = 0); 41 | 42 | /** 43 | * Create loopback queue pair 44 | */ 45 | QueuePair * createLoopback(void *userData = NULL, uint32_t userDataSizeInBytes = 0); 46 | 47 | protected: 48 | 49 | infinity::core::Context * context; 50 | 51 | int32_t serverSocket; 52 | 53 | }; 54 | 55 | } /* namespace queues */ 56 | } /* namespace infinity */ 57 | 58 | #endif /* QUEUES_QUEUEPAIRFACTORY_H_ */ 59 | -------------------------------------------------------------------------------- /csrc/include/infinity/requests/RequestToken.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Requests - Request Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "RequestToken.h" 10 | 11 | namespace infinity { 12 | namespace requests { 13 | 14 | RequestToken::RequestToken(infinity::core::Context *context) : 15 | context(context) { 16 | this->success.store(false); 17 | this->completed.store(false); 18 | this->region = NULL; 19 | this->userData = NULL; 20 | this->userDataValid = false; 21 | this->userDataSize = 0; 22 | this->immediateValue = 0; 23 | this->immediateValueValid = false; 24 | } 25 | 26 | void RequestToken::setCompleted(bool success) { 27 | this->success.store(success); 28 | this->completed.store(true); 29 | } 30 | 31 | bool RequestToken::checkIfCompleted() { 32 | if (this->completed.load()) { 33 | return true; 34 | } else { 35 | this->context->pollSendCompletionQueue(); 36 | return this->completed.load(); 37 | } 38 | } 39 | 40 | void RequestToken::waitUntilCompleted() { 41 | while (!this->completed.load()) { 42 | this->context->pollSendCompletionQueue(); 43 | } 44 | } 45 | 46 | bool RequestToken::wasSuccessful() { 47 | return this->success.load(); 48 | } 49 | 50 | void RequestToken::reset() { 51 | this->success.store(false); 52 | this->completed.store(false); 53 | this->region = NULL; 54 | this->userData = NULL; 55 | this->userDataValid = false; 56 | this->userDataSize = 0; 57 | this->immediateValue = 0; 58 | this->immediateValueValid = false; 59 | } 60 | 61 | void RequestToken::setRegion(infinity::memory::Region* region) { 62 | this->region = region; 63 | } 64 | 65 | infinity::memory::Region* RequestToken::getRegion() { 66 | return this->region; 67 | } 68 | 69 | void RequestToken::setUserData(void* userData, uint32_t userDataSize) { 70 | this->userData = userData; 71 | this->userDataSize = userDataSize; 72 | this->userDataValid = true; 73 | } 74 | 75 | void* RequestToken::getUserData() { 76 | return this->userData; 77 | } 78 | 79 | bool RequestToken::hasUserData() { 80 | return this->userDataValid; 81 | } 82 | 83 | uint32_t RequestToken::getUserDataSize() { 84 | return this->userDataSize; 85 | } 86 | 87 | void RequestToken::setImmediateValue(uint32_t immediateValue) { 88 | this->immediateValue = immediateValue; 89 | this->immediateValueValid = true; 90 | } 91 | 92 | uint32_t RequestToken::getImmediateValue() { 93 | return 
this->immediateValue; 94 | } 95 | 96 | bool RequestToken::hasImmediateValue() { 97 | return this->immediateValueValid; 98 | } 99 | 100 | } /* namespace requests */ 101 | } /* namespace infinity */ 102 | -------------------------------------------------------------------------------- /csrc/include/infinity/requests/RequestToken.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Requests - Request Token 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef REQUESTS_REQUESTTOKEN_H_ 10 | #define REQUESTS_REQUESTTOKEN_H_ 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | namespace infinity { 19 | namespace requests { 20 | 21 | class RequestToken { 22 | 23 | public: 24 | 25 | RequestToken(infinity::core::Context *context); 26 | 27 | void reset(); 28 | 29 | void setRegion(infinity::memory::Region * region); 30 | infinity::memory::Region * getRegion(); 31 | 32 | void setCompleted(bool success); 33 | bool wasSuccessful(); 34 | 35 | bool checkIfCompleted(); 36 | void waitUntilCompleted(); 37 | 38 | void setImmediateValue(uint32_t immediateValue); 39 | bool hasImmediateValue(); 40 | uint32_t getImmediateValue(); 41 | 42 | void setUserData(void* userData, uint32_t userDataSize); 43 | bool hasUserData(); 44 | void* getUserData(); 45 | uint32_t getUserDataSize(); 46 | 47 | protected: 48 | 49 | infinity::core::Context * const context; 50 | infinity::memory::Region * region; 51 | 52 | std::atomic completed; 53 | std::atomic success; 54 | 55 | void *userData; 56 | uint32_t userDataSize; 57 | bool userDataValid; 58 | 59 | uint32_t immediateValue; 60 | bool immediateValueValid; 61 | 62 | }; 63 | 64 | } /* namespace requests */ 65 | } /* namespace infinity */ 66 | 67 | #endif /* REQUESTS_REQUESTTOKEN_H_ */ 68 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Address.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Address 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include "Address.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | namespace infinity { 21 | namespace utils { 22 | 23 | char* Address::getIpAddressOfInterface(const char* interfaceName) { 24 | 25 | struct ifaddrs *ifAddr; 26 | struct ifaddrs *ifa; 27 | char *ipAddress = (char*) calloc(16, sizeof(char)); 28 | 29 | int returnValue = getifaddrs(&ifAddr); 30 | INFINITY_ASSERT(returnValue != -1, "[INFINITY][UTILS][ADDRESS] Cannot read interface list.\n"); 31 | 32 | for (ifa = ifAddr; ifa != NULL; ifa = ifa->ifa_next) { 33 | if (ifa->ifa_addr == NULL) { 34 | continue; 35 | } 36 | if ((ifa->ifa_addr->sa_family == AF_INET) && (strcasecmp(interfaceName, ifa->ifa_name) == 0)) { 37 | sprintf(ipAddress, "%s", inet_ntoa(((struct sockaddr_in *) ifa->ifa_addr)->sin_addr)); 38 | break; 39 | } 40 | } 41 | INFINITY_ASSERT(ifa != NULL, "[INFINITY][UTILS][ADDRESS] Cannot find interface named %s.\n", interfaceName); 42 | 43 | freeifaddrs(ifAddr); 44 | 45 | return ipAddress; 46 | 47 | } 48 | 49 | uint32_t Address::getIpAddressAsUint32(const char* ipAddress) { 50 | 51 | uint32_t ipAddressNumbers[4]; 52 | sscanf(ipAddress, "%d.%d.%d.%d", &ipAddressNumbers[3], &ipAddressNumbers[2], &ipAddressNumbers[1], &ipAddressNumbers[0]); 53 | uint32_t ipAddressNumber(ipAddressNumbers[0] | 
ipAddressNumbers[1] << 8 | ipAddressNumbers[2] << 16 | ipAddressNumbers[3] << 24); 54 | return ipAddressNumber; 55 | } 56 | 57 | } /* namespace utils */ 58 | } /* namespace infinity */ 59 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Address.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Address 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef UTILS_ADDRESS_H_ 10 | #define UTILS_ADDRESS_H_ 11 | 12 | #include 13 | 14 | namespace infinity { 15 | namespace utils { 16 | 17 | class Address { 18 | 19 | public: 20 | 21 | static char * getIpAddressOfInterface(const char *interfaceName); 22 | static uint32_t getIpAddressAsUint32(const char *ipAddress); 23 | 24 | }; 25 | 26 | } /* namespace utils */ 27 | } /* namespace infinity */ 28 | 29 | #endif /* UTILS_ADDRESS_H_ */ 30 | -------------------------------------------------------------------------------- /csrc/include/infinity/utils/Debug.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Utils - Debug 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #ifndef UTILS_DEBUG_H_ 10 | #define UTILS_DEBUG_H_ 11 | 12 | #include 13 | #include 14 | 15 | #ifdef INFINITY_DEBUG_ON 16 | #define INFINITY_DEBUG(X, ...) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout);} 17 | #else 18 | #define INFINITY_DEBUG(X, ...) {} 19 | #endif 20 | 21 | #ifdef INFINITY_ASSERT_ON 22 | #define INFINITY_ASSERT(B, X, ...) {if(!(B)) {fprintf(stdout, X, ##__VA_ARGS__); fflush(stdout); exit(-1);}} 23 | #else 24 | #define INFINITY_ASSERT(B, X, ...) {} 25 | #endif 26 | 27 | #endif /* UTILS_DEBUG_H_ */ 28 | -------------------------------------------------------------------------------- /csrc/include/qvf/com_endpoint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | namespace qvf { 4 | class ComEndPoint { 5 | private: 6 | std::string ip_address; 7 | int port; 8 | int rank; 9 | 10 | public: 11 | ComEndPoint() {} 12 | 13 | ComEndPoint(int rank, std::string ip_address, int port) 14 | : rank(rank), ip_address(ip_address), port(port) {} 15 | 16 | ComEndPoint& operator=(const ComEndPoint& other) { 17 | this->rank = other.rank; 18 | this->ip_address = other.ip_address; 19 | this->port = other.port; 20 | return *this; 21 | } 22 | 23 | void set_data(int rank, std::string ip_address, int port) { 24 | this->rank = rank; 25 | this->ip_address = ip_address; 26 | this->port = port; 27 | } 28 | 29 | std::string get_address(void) { return ip_address; } 30 | int get_port(void) { return port; } 31 | int get_rank(void) { return rank; } 32 | }; 33 | } // namespace qvf 34 | -------------------------------------------------------------------------------- /csrc/include/qvf/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | #define QUIVER_FEATURE_ASSERT(B, X, ...) 
\ 6 | { \ 7 | if (!(B)) { \ 8 | fprintf(stdout, X, ##__VA_ARGS__); \ 9 | fflush(stdout); \ 10 | exit(-1); \ 11 | } \ 12 | } 13 | -------------------------------------------------------------------------------- /csrc/include/qvf/dist_tensor_client.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace qvf { 21 | struct CollectionTask { 22 | public: 23 | void* base_address; 24 | int collect_from; 25 | int64_t* local_offsets; 26 | int64_t* remote_offsets; 27 | int64_t size; 28 | 29 | public: 30 | CollectionTask() {} 31 | CollectionTask(void* base_address, 32 | int64_t* local_offsets, 33 | int64_t* remote_offsets, 34 | int64_t size, 35 | int collect_from) 36 | : base_address(base_address), 37 | local_offsets(local_offsets), 38 | remote_offsets(remote_offsets), 39 | size(size), 40 | collect_from(collect_from) {} 41 | }; 42 | class DistTensorClient { 43 | public: 44 | std::vector pipes; 45 | std::vector com_endpoints; 46 | 47 | // About communication 48 | PipeParam pipe_param; 49 | int server_size; 50 | int server_rank; 51 | 52 | // About IB 53 | infinity::core::Context* context; 54 | infinity::queues::QueuePairFactory* qpFactory; 55 | 56 | infinity::memory::Buffer* tensor_buffer; 57 | infinity::memory::RegionToken* tensor_token; 58 | 59 | // about feature client 60 | std::deque task_queue; 61 | 62 | public: 63 | DistTensorClient(int server_rank, 64 | std::vector com_endpoints, 65 | PipeParam pipe_param) { 66 | this->server_rank = server_rank; 67 | this->com_endpoints = com_endpoints; 68 | this->pipe_param = pipe_param; 69 | server_size = com_endpoints.size(); 70 | init_connection(); 71 | } 72 | 73 | void init_connection() { 74 | context = new infinity::core::Context(); 75 | qpFactory = new infinity::queues::QueuePairFactory(context); 76 | pipes.resize(server_size); 77 | for (int idx = 0; idx < server_size; idx++) { 78 | if (com_endpoints[idx].get_rank() == server_rank) { 79 | continue; 80 | } 81 | pipes[com_endpoints[idx].get_rank()] = 82 | new Pipe(context, qpFactory, com_endpoints[idx], pipe_param); 83 | pipes[com_endpoints[idx].get_rank()]->connect(); 84 | } 85 | } 86 | 87 | torch::Tensor create_registered_float32_tensor( 88 | std::vector tensor_shape) { 89 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2, 90 | "Only support 2-dimensional tensor"); 91 | auto tensor_option = torch::TensorOptions().dtype(torch::kFloat32); 92 | uint64_t size_in_bytes = 4; 93 | for (int index = 0; index < tensor_shape.size(); index++) { 94 | size_in_bytes *= tensor_shape[index]; 95 | } 96 | tensor_buffer = new infinity::memory::Buffer(context, size_in_bytes); 97 | tensor_token = tensor_buffer->createRegionToken(); 98 | return torch::from_blob(tensor_buffer->getData(), 99 | {tensor_shape[0], tensor_shape[1]}, tensor_option); 100 | } 101 | 102 | void register_float_tensor(torch::Tensor& float_tensor) { 103 | QUIVER_FEATURE_ASSERT( 104 | float_tensor.dim() == 2, 105 | "Only support 2-dimensional tensor, But got %d-dimensional tensor\n", 106 | float_tensor.dim()); 107 | 108 | uint64_t size_in_bytes = float_tensor.element_size() * float_tensor.numel(); 109 | 110 | tensor_buffer = new infinity::memory::Buffer( 111 | context, float_tensor.data_ptr(), size_in_bytes); 112 | 113 | tensor_token = tensor_buffer->createRegionToken(); 114 | } 115 | 116 
| torch::Tensor create_registered_float32_tensor_cuda( 117 | std::vector tensor_shape, 118 | int device) { 119 | QUIVER_FEATURE_ASSERT(tensor_shape.size() == 2, 120 | "Only support 2-dimensional tensor"); 121 | uint64_t size_in_bytes = 4; 122 | for (int index = 0; index < tensor_shape.size(); index++) { 123 | size_in_bytes *= tensor_shape[index]; 124 | } 125 | tensor_buffer = 126 | new infinity::memory::Buffer(context, size_in_bytes, device); 127 | tensor_token = tensor_buffer->createRegionToken(); 128 | auto tensor_option = torch::TensorOptions() 129 | .dtype(torch::kFloat32) 130 | .device(torch::kCUDA, device); 131 | return torch::from_blob(tensor_buffer->getData(), 132 | {tensor_shape[0], tensor_shape[1]}, tensor_option); 133 | } 134 | 135 | void sync_read(int server_rank, 136 | torch::Tensor& res_tensor, 137 | torch::Tensor& local_offsets, 138 | torch::Tensor& remote_offsets) { 139 | QUIVER_FEATURE_ASSERT( 140 | reinterpret_cast(res_tensor.data_ptr()) == 141 | tensor_buffer->getAddress(), 142 | "Result Tensor is not created from registered buffer"); 143 | 144 | pipes[server_rank]->read(tensor_buffer, local_offsets, remote_offsets, 145 | res_tensor.size(1) * res_tensor.element_size()); 146 | } 147 | 148 | void collect_inner(CollectionTask collection_task) { 149 | task_queue.push_back(collection_task); 150 | } 151 | 152 | void start_feature_client() {} 153 | }; 154 | } // namespace qvf 155 | -------------------------------------------------------------------------------- /csrc/include/qvf/dist_tensor_server.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | 21 | namespace qvf { 22 | class DistTensorServer { 23 | private: 24 | int port; 25 | int world_size; 26 | int qp_per_pipe; 27 | 28 | infinity::core::Context* context; 29 | infinity::queues::QueuePairFactory* qpFactory; 30 | infinity::memory::Buffer* feature_buffer; 31 | infinity::memory::RegionToken* bufferToken; 32 | 33 | std::thread server_thread; 34 | 35 | public: 36 | DistTensorServer(int port, int world_size, int qp_per_pipe) 37 | : port(port), world_size(world_size), qp_per_pipe(qp_per_pipe) { 38 | context = new infinity::core::Context(); 39 | qpFactory = new infinity::queues::QueuePairFactory(context); 40 | qpFactory->bindToPort(port); 41 | } 42 | 43 | void join() { server_thread.join(); } 44 | 45 | void serve(void* data, int64_t size_in_bytes) { 46 | feature_buffer = 47 | new infinity::memory::Buffer(context, data, (uint64_t)size_in_bytes); 48 | bufferToken = feature_buffer->createRegionToken(); 49 | server_thread = 50 | std::thread(run, qpFactory, bufferToken, qp_per_pipe * world_size); 51 | } 52 | 53 | void serve_tensor(torch::Tensor& data) { 54 | std::cout << "Registering Buffer, Please Wait..." << std::endl; 55 | uint64_t size_in_bytes = data.numel() * data.element_size(); 56 | 57 | feature_buffer = new infinity::memory::Buffer( 58 | context, data.data_ptr(), size_in_bytes); 59 | bufferToken = feature_buffer->createRegionToken(); 60 | server_thread = std::thread(run, qpFactory, bufferToken, 61 | qp_per_pipe * (world_size - 1)); 62 | } 63 | 64 | static void run(infinity::queues::QueuePairFactory* qpFactory, 65 | infinity::memory::RegionToken* bufferToken, 66 | int total_qp_num) { 67 | std::cout << "Buffer Registration Done!
Ready To Receive Connections, " 68 | "Start Your Clients Now" 69 | << std::endl; 70 | for (int qp_index = 0; qp_index < total_qp_num; qp_index++) { 71 | qpFactory->acceptIncomingConnection( 72 | bufferToken, sizeof(infinity::memory::RegionToken)); 73 | } 74 | 75 | while (1) { 76 | std::this_thread::sleep_for(std::chrono::seconds(10)); // 10s 77 | } 78 | } 79 | }; 80 | 81 | } // namespace qvf 82 | -------------------------------------------------------------------------------- /csrc/include/qvf/pipe.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | namespace qvf { 19 | 20 | // Pipe are used for single side RDMA read to remote data servers 21 | struct PipeParam { 22 | int qp_num; 23 | int ctx_poll_batch; 24 | int tx_depth; 25 | int post_list_size; 26 | PipeParam() {} 27 | PipeParam(int qp_num, 28 | int ctx_poll_batch, 29 | int tx_depth, 30 | int post_list_size) { 31 | this->qp_num = qp_num; 32 | this->ctx_poll_batch = ctx_poll_batch; 33 | this->tx_depth = tx_depth; 34 | this->post_list_size = post_list_size; 35 | } 36 | void set_params(int qp_num, 37 | int ctx_poll_batch, 38 | int tx_depth, 39 | int post_list_size) { 40 | this->qp_num = qp_num; 41 | this->ctx_poll_batch = ctx_poll_batch; 42 | this->tx_depth = tx_depth; 43 | this->post_list_size = post_list_size; 44 | } 45 | void set_param_vec(std::vector param_vec){ 46 | qp_num = param_vec[0]; 47 | ctx_poll_batch = param_vec[1]; 48 | tx_depth = param_vec[2]; 49 | post_list_size = param_vec[3]; 50 | } 51 | 52 | std::vector get_param_vec(){ 53 | std::vector params; 54 | params.push_back(qp_num); 55 | params.push_back(ctx_poll_batch); 56 | params.push_back(tx_depth); 57 | params.push_back(post_list_size); 58 | return params; 59 | } 60 | 61 | PipeParam& operator=(const PipeParam& pipe_param) { 62 | set_params(pipe_param.qp_num, pipe_param.ctx_poll_batch, 63 | pipe_param.tx_depth, pipe_param.post_list_size); 64 | return *this; 65 | } 66 | }; 67 | 68 | class Pipe { 69 | private: 70 | ComEndPoint remote_end; 71 | PipeParam pipe_param; 72 | std::vector remote_buffer_tokens; 73 | std::vector qps; 74 | std::vector requests; 75 | infinity::queues::SendRequestBuffer send_buffer; 76 | infinity::core::Context* context; 77 | infinity::queues::QueuePairFactory* qpFactory; 78 | infinity::queues::IbvWcBuffer wc_buffer; 79 | int requests_size; 80 | bool connected; 81 | 82 | public: 83 | Pipe() : connected(false) {} 84 | Pipe(infinity::core::Context* context, 85 | infinity::queues::QueuePairFactory* qpFactory, 86 | ComEndPoint com_endpoint, 87 | PipeParam pipe_param) { 88 | this->context = context; 89 | this->qpFactory = qpFactory; 90 | this->remote_end = com_endpoint; 91 | this->pipe_param = pipe_param; 92 | connected = false; 93 | } 94 | 95 | Pipe& operator=(const Pipe& pipe) { 96 | if (pipe.connected) { 97 | fprintf(stderr, "Pipe can only be assigned before connect"); 98 | } 99 | this->remote_end = pipe.remote_end; 100 | this->pipe_param = pipe.pipe_param; 101 | this->context = pipe.context; 102 | this->qpFactory = pipe.qpFactory; 103 | connected = false; 104 | return *this; 105 | } 106 | 107 | void connect() { 108 | qps.resize(pipe_param.qp_num); 109 | remote_buffer_tokens.resize(pipe_param.qp_num); 110 | requests_size = 111 | pipe_param.tx_depth / pipe_param.post_list_size; 112 | requests.resize(requests_size); 113 | 
send_buffer.resize(pipe_param.post_list_size); 114 | wc_buffer.resize(pipe_param.ctx_poll_batch); 115 | for (int qp_index = 0; qp_index < pipe_param.qp_num; qp_index++) { 116 | qps[qp_index] = qpFactory->connectToRemoteHost( 117 | remote_end.get_address().c_str(), remote_end.get_port()); 118 | remote_buffer_tokens[qp_index] = 119 | (infinity::memory::RegionToken*)qps[qp_index]->getUserData(); 120 | } 121 | 122 | for (int request_index = 0; request_index < requests.size(); 123 | request_index++) { 124 | requests[request_index] = new infinity::requests::RequestToken(context); 125 | } 126 | connected = true; 127 | } 128 | 129 | void read(infinity::memory::Buffer* local_buffer, 130 | std::vector local_offsets, 131 | std::vector remote_offsets, 132 | uint64_t stride) { 133 | uint64_t post_list_cnt = 134 | (local_offsets.size() + pipe_param.post_list_size - 1) / 135 | pipe_param.post_list_size; 136 | 137 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check 138 | // Local_Offset_Size "<< remote_offsets.size()<multiRead( 150 | batch_read_size, local_buffer, 151 | &local_offsets[post_index * pipe_param.post_list_size], 152 | remote_buffer_tokens[post_index % pipe_param.qp_num], 153 | &remote_offsets[post_index * pipe_param.post_list_size], stride, 154 | infinity::queues::OperationFlags(), requests[epoch_scnt], 155 | send_buffer); 156 | epoch_scnt += 1; 157 | 158 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) { 159 | context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch, 160 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1); 161 | epoch_scnt = 0; 162 | } 163 | } 164 | } 165 | 166 | void read(infinity::memory::Buffer* local_buffer, 167 | torch::Tensor& local_offsets_tensor, 168 | torch::Tensor& remote_offsets_tensor, 169 | uint64_t stride) { 170 | QUIVER_FEATURE_ASSERT(local_offsets_tensor.dim() == 1, 171 | "local_offsets should be 1-dimensional tensor"); 172 | QUIVER_FEATURE_ASSERT(remote_offsets_tensor.dim() == 1, 173 | "local_offsets should be 1-dimensional tensor"); 174 | QUIVER_FEATURE_ASSERT( 175 | remote_offsets_tensor.size(0) == local_offsets_tensor.size(0), 176 | "local_offsets and remote_offsets should have the same length"); 177 | 178 | int64_t* local_offsets = local_offsets_tensor.data_ptr(); 179 | int64_t* remote_offsets = remote_offsets_tensor.data_ptr(); 180 | 181 | uint64_t post_list_cnt = 182 | (local_offsets_tensor.size(0) + pipe_param.post_list_size - 1) / 183 | pipe_param.post_list_size; 184 | 185 | // std::cout<<"Check Local_Offset_Size " << local_offsets.size() << " Check 186 | // Local_Offset_Size "<< remote_offsets.size()<multiRead( 203 | batch_read_size, local_buffer, 204 | &local_offsets[post_index * pipe_param.post_list_size], 205 | remote_buffer_tokens[post_index % pipe_param.qp_num], 206 | &remote_offsets[post_index * pipe_param.post_list_size], stride, 207 | infinity::queues::OperationFlags(), requests[epoch_scnt], 208 | send_buffer); 209 | epoch_scnt += 1; 210 | 211 | if (epoch_scnt == requests_size || post_index == post_list_cnt - 1) { 212 | int cq_num = context->batchPollSendCompletionQueue(pipe_param.ctx_poll_batch, 213 | epoch_scnt, wc_buffer.ptr(), post_index == post_list_cnt - 1); 214 | epoch_scnt -= cq_num; 215 | } 216 | } 217 | } 218 | }; 219 | } // namespace qvf 220 | -------------------------------------------------------------------------------- /csrc/include/qvf/qvf.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 
#include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | -------------------------------------------------------------------------------- /csrc/include/qvf/range.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | namespace qvf { 5 | class Range { 6 | private: 7 | int64_t start; 8 | int64_t end; 9 | 10 | public: 11 | Range() {} 12 | Range(int64_t start, int64_t end) : start(start), end(end) {} 13 | void set_params(int64_t start, int64_t end) { 14 | this->start = start; 15 | this->end = end; 16 | } 17 | Range& operator=(const Range& other) { 18 | this->start = other.start; 19 | this->end = other.end; 20 | return *this; 21 | } 22 | int64_t range_start() { return start; } 23 | int64_t range_end() { return end; } 24 | }; 25 | } // namespace qvf 26 | -------------------------------------------------------------------------------- /csrc/include/qvf/shared_loader.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by joker on 2022/5/15. 3 | // 4 | 5 | #ifndef QUIVER_FEATURE_SHAREDLOADER_H 6 | #define QUIVER_FEATURE_SHAREDLOADER_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace qvf { 14 | 15 | using caffe2::serialize::PyTorchStreamReader; 16 | using caffe2::serialize::ReadAdapterInterface; 17 | 18 | template 19 | struct Rob { 20 | friend typename Tag::type get(Tag) { return M; } 21 | }; 22 | 23 | #define ROB_FIELD_FROM_READER(FieldType, FieldName) \ 24 | struct PyTorchStreamReader_##FieldName { \ 25 | typedef FieldType PyTorchStreamReader::*type; \ 26 | friend type get(PyTorchStreamReader_##FieldName); \ 27 | }; \ 28 | template struct Rob 30 | 31 | ROB_FIELD_FROM_READER(std::string, archive_name_plus_slash_); 32 | ROB_FIELD_FROM_READER(std::unique_ptr, ar_); 33 | ROB_FIELD_FROM_READER(std::mutex, reader_lock_); 34 | 35 | struct TORCH_API SharedLoader { 36 | PyTorchStreamReader reader; 37 | explicit SharedLoader(const std::string& file_name) : reader(file_name) {} 38 | explicit SharedLoader(std::istream* in) : reader(in) {} 39 | explicit SharedLoader(std::shared_ptr in) 40 | : reader(in) {} 41 | void valid(const char* what, const char* info = ""); 42 | std::tuple getRecord(const std::string& name); 43 | size_t getRecordID(const std::string& name); 44 | size_t getRecordOffset(const std::string& name) { 45 | return reader.getRecordOffset(name); 46 | } 47 | bool hasRecord(const std::string& name) { return reader.hasRecord(name); } 48 | std::vector getAllRecords() { return reader.getAllRecords(); } 49 | }; 50 | 51 | } // namespace qvf 52 | #endif // QUIVER_FEATURE_SHAREDLOADER_H 53 | -------------------------------------------------------------------------------- /csrc/include/qvf/tensor_endpoint.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | namespace qvf { 5 | class TensorEndPoint { 6 | public: 7 | ComEndPoint com_endpoint; 8 | Range range; 9 | 10 | public: 11 | TensorEndPoint(ComEndPoint com_endpoint, Range range) { 12 | this->com_endpoint = com_endpoint; 13 | this->range = range; 14 | } 15 | 16 | TensorEndPoint(int rank, 17 | std::string ip, 18 | int port, 19 | int64_t range_start, 20 | int64_t range_end) { 21 | this->com_endpoint = ComEndPoint(rank, ip, port); 22 | this->range = Range(range_start, range_end); 23 | } 24 | 25 | TensorEndPoint(std::string ip, int port, int rank, Range range) { 26 | this->com_endpoint = ComEndPoint(rank, ip, 
port); 27 | this->range = range; 28 | } 29 | 30 | TensorEndPoint& operator=(const TensorEndPoint& other) { 31 | this->com_endpoint = other.com_endpoint; 32 | this->range = other.range; 33 | return *this; 34 | } 35 | }; 36 | } // namespace qvf 37 | -------------------------------------------------------------------------------- /csrc/src/module.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | void register_TensorEndPoint(pybind11::module& m); 8 | void register_DistTensorServer(pybind11::module& m); 9 | void register_PipeParam(pybind11::module& m); 10 | void register_DistTensorClient(pybind11::module& m); 11 | void register_ComEndPoint(pybind11::module& m); 12 | void register_SharedStorageReader(pybind11::module& m); 13 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 14 | register_TensorEndPoint(m); 15 | register_DistTensorServer(m); 16 | register_PipeParam(m); 17 | register_DistTensorClient(m); 18 | register_ComEndPoint(m); 19 | register_SharedStorageReader(m); 20 | } 21 | -------------------------------------------------------------------------------- /csrc/src/register.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void register_TensorEndPoint(pybind11::module& m) { 10 | // define TensorEndPoint 11 | py::class_(m, "TensorEndPoint") 12 | .def(py::init()); 13 | } 14 | 15 | void register_ComEndPoint(pybind11::module& m) { 16 | // define ComEndPoint 17 | py::class_(m, "ComEndPoint") 18 | .def(py::init()) 19 | .def(py::init<>()) 20 | .def("rank", &qvf::ComEndPoint::get_rank, py::call_guard()) 21 | .def("address", &qvf::ComEndPoint::get_address, py::call_guard()) 22 | .def("port", &qvf::ComEndPoint::get_port, py::call_guard()); 23 | } 24 | 25 | void register_DistTensorServer(pybind11::module& m) { 26 | // define TensorEndPoint 27 | py::class_(m, "DistTensorServer") 28 | .def(py::init()) 29 | .def("serve_tensor", &qvf::DistTensorServer::serve_tensor, 30 | py::call_guard()) 31 | .def("join", &qvf::DistTensorServer::join, 32 | py::call_guard()); 33 | } 34 | 35 | void register_PipeParam(pybind11::module& m) { 36 | py::class_(m, "PipeParam") 37 | .def(py::init()) 38 | .def(py::init<>()) 39 | .def("get_param_vec", &qvf::PipeParam::get_param_vec, py::call_guard()) 40 | .def("set_param_vec", &qvf::PipeParam::set_param_vec, py::call_guard()) 41 | ; 42 | } 43 | 44 | void register_DistTensorClient(pybind11::module& m) { 45 | py::class_(m, "DistTensorClient") 46 | .def(py::init, qvf::PipeParam>()) 47 | .def("create_registered_float32_tensor", 48 | &qvf::DistTensorClient::create_registered_float32_tensor, 49 | py::call_guard()) 50 | .def("register_float_tensor", 51 | &qvf::DistTensorClient::register_float_tensor, 52 | py::call_guard()) 53 | .def("create_registered_float32_tensor_cuda", 54 | &qvf::DistTensorClient::create_registered_float32_tensor_cuda, 55 | py::call_guard()) 56 | 57 | .def("sync_read", &qvf::DistTensorClient::sync_read, 58 | py::call_guard()); 59 | } 60 | 61 | void register_SharedStorageReader(pybind11::module& m) { 62 | class BufferAdapter : public caffe2::serialize::ReadAdapterInterface { 63 | public: 64 | BufferAdapter(const py::object& buffer) : buffer_(buffer) { 65 | // Jump to the end of the buffer to get its size 66 | auto current = buffer.attr("tell")(); 67 | start_offset_ = py::cast(current); 68 | buffer.attr("seek")(current, 
py::module::import("os").attr("SEEK_END")); 69 | size_ = py::cast(buffer.attr("tell")()) - start_offset_; 70 | buffer.attr("seek")(current); 71 | 72 | // If we can read directly into a buffer, do that instead of an extra copy 73 | use_readinto_ = py::hasattr(buffer, "readinto"); 74 | } 75 | 76 | size_t size() const override { return size_; } 77 | 78 | THPObjectPtr getMemview(void* buf, size_t n) const { 79 | THPObjectPtr memview(PyMemoryView_FromMemory(reinterpret_cast(buf), 80 | n, PyBUF_WRITE)); 81 | if (!memview) { 82 | throw python_error(); 83 | } 84 | return memview; 85 | } 86 | 87 | size_t read(uint64_t pos, 88 | void* buf, 89 | size_t n, 90 | const char* what) const override { 91 | // Seek to desired position (NB: this has to be a Py_ssize_t or Python 92 | // throws a weird error) 93 | Py_ssize_t absolute_pos = start_offset_ + pos; 94 | buffer_.attr("seek")(absolute_pos); 95 | 96 | if (use_readinto_) { 97 | auto memview = getMemview(buf, n); 98 | auto res = 99 | PyObject_CallMethod(buffer_.ptr(), "readinto", "O", memview.get()); 100 | if (res) { 101 | int64_t i = static_cast(PyLong_AsLongLong(res)); 102 | if (i > 0) { 103 | return i; 104 | } 105 | } 106 | } 107 | 108 | // Read bytes into `buf` from the buffer 109 | std::string bytes = py::cast(buffer_.attr("read")(n)); 110 | std::copy(bytes.data(), bytes.data() + bytes.size(), 111 | reinterpret_cast(buf)); 112 | return bytes.size(); 113 | } 114 | 115 | py::object buffer_; 116 | size_t size_; 117 | size_t start_offset_; 118 | bool use_readinto_; 119 | }; 120 | py::class_>( 121 | m, "SharedTensorLoader") 122 | .def(py::init()) 123 | .def(py::init([](const py::object& buffer) { 124 | auto adapter = std::make_unique(buffer); 125 | return std::make_shared(std::move(adapter)); 126 | })) 127 | .def("get_record", 128 | [](qvf::SharedLoader& self, const std::string& key) { 129 | at::DataPtr data; 130 | size_t size = 0; 131 | std::tie(data, size) = self.getRecord(key); 132 | return py::bytes(reinterpret_cast(data.get()), size); 133 | }) 134 | .def("has_record", 135 | [](qvf::SharedLoader& self, const std::string& key) { 136 | return self.hasRecord(key); 137 | }) 138 | .def("get_storage_from_record", 139 | [](qvf::SharedLoader& self, const std::string& key, size_t numel, 140 | py::object data_type_obj) { 141 | at::DataPtr data(std::get<0>(self.getRecord(key))); 142 | auto scalar_type = 143 | reinterpret_cast(data_type_obj.ptr())->scalar_type; 144 | 145 | c10::Storage storage(c10::Storage::use_byte_size_t(), 146 | numel * elementSize(scalar_type), 147 | std::move(data), 148 | /*allocator=*/nullptr, 149 | /*resizable=*/false); 150 | auto ptr = 151 | c10::make_intrusive( 152 | std::move(storage), at::DispatchKeySet(), 153 | at::CPU(scalar_type).typeMeta()); 154 | return at::Tensor(std::move(ptr)); 155 | }) 156 | .def("get_all_records", 157 | [](qvf::SharedLoader& self) { return self.getAllRecords(); }); 158 | } -------------------------------------------------------------------------------- /csrc/src/shared_loader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | extern "C" { 5 | #include 6 | } 7 | 8 | #define RB(x) get(PyTorchStreamReader_##x()) 9 | 10 | at::DataPtr new_fd_storage(ptrdiff_t size) { 11 | int flags = at::ALLOCATOR_MAPPED_SHAREDMEM | at::ALLOCATOR_MAPPED_EXCLUSIVE | 12 | at::ALLOCATOR_MAPPED_KEEPFD | at::ALLOCATOR_MAPPED_UNLINK; 13 | std::string handle = at::NewProcessWideShmHandle(); 14 | auto sptr = at::MapAllocator::makeDataPtr(handle.c_str(), flags, 
15 | size * sizeof(uint8_t), nullptr); 16 | 17 | return sptr; 18 | } 19 | 20 | size_t qvf::SharedLoader::getRecordID(const std::string& name) { 21 | std::string ss = reader.*RB(archive_name_plus_slash_) + name; 22 | size_t result = mz_zip_reader_locate_file((reader.*RB(ar_)).get(), ss.c_str(), 23 | nullptr, 0); 24 | valid("locating file ", name.c_str()); 25 | return result; 26 | } 27 | 28 | std::tuple qvf::SharedLoader::getRecord( 29 | const std::string& name) { 30 | std::lock_guard guard(reader.*RB(reader_lock_)); 31 | size_t key = getRecordID(name); 32 | mz_zip_archive_file_stat stat; 33 | mz_zip_reader_file_stat((reader.*RB(ar_)).get(), key, &stat); 34 | valid("retrieving file meta-data for ", name.c_str()); 35 | at::DataPtr retval = new_fd_storage(stat.m_uncomp_size); 36 | mz_zip_reader_extract_to_mem((reader.*RB(ar_)).get(), key, retval.get(), 37 | stat.m_uncomp_size, 0); 38 | valid("reading file ", name.c_str()); 39 | 40 | return std::make_tuple(std::move(retval), stat.m_uncomp_size); 41 | } 42 | 43 | void qvf::SharedLoader::valid(const char* what, const char* info) { 44 | const auto err = mz_zip_get_last_error((reader.*RB(ar_)).get()); 45 | TORCH_CHECK(err == MZ_ZIP_NO_ERROR, "PytorchStreamReader failed ", what, info, 46 | ": ", mz_zip_get_error_string(err)); 47 | } -------------------------------------------------------------------------------- /docs/imgs/Network Bandwidth Under 100Gbps IB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/Network Bandwidth Under 100Gbps IB.png -------------------------------------------------------------------------------- /docs/imgs/consistent_memory_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/consistent_memory_view.png -------------------------------------------------------------------------------- /docs/imgs/e2e_feature_collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection.png -------------------------------------------------------------------------------- /docs/imgs/e2e_feature_collection_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/e2e_feature_collection_performance.png -------------------------------------------------------------------------------- /docs/imgs/gpu0_centered_access_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/gpu0_centered_access_performance.png -------------------------------------------------------------------------------- /docs/imgs/memory_usage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/memory_usage.png -------------------------------------------------------------------------------- /docs/imgs/multi_qp.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/multi_qp.png -------------------------------------------------------------------------------- /docs/imgs/one_batch_feature_collection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/one_batch_feature_collection.png -------------------------------------------------------------------------------- /docs/imgs/peak_memory_footprint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/peak_memory_footprint.png -------------------------------------------------------------------------------- /docs/imgs/pgas_tensor_access.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_access.png -------------------------------------------------------------------------------- /docs/imgs/pgas_tensor_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/pgas_tensor_view.png -------------------------------------------------------------------------------- /docs/imgs/range_partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/range_partition.png -------------------------------------------------------------------------------- /docs/imgs/rdma_mtt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/rdma_mtt.png -------------------------------------------------------------------------------- /docs/imgs/shared_load.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/shared_load.png -------------------------------------------------------------------------------- /docs/imgs/subset_signaled_requests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/subset_signaled_requests.png -------------------------------------------------------------------------------- /docs/imgs/train_gnn_on_large_graphs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quiver-team/quiver-feature/8e7058d247997f67dce05402555c730d29f02cee/docs/imgs/train_gnn_on_large_graphs.png -------------------------------------------------------------------------------- /docs/memory.md: -------------------------------------------------------------------------------- 1 | # Peak Memory Footprint Optimization In Quiver-Feature 2 | 3 | By default, Quiver-Feature uses the `range partition` method to partition the original giant feature array onto
different machines. This is easy to understand; let's take a closer look at memory usage on each machine. 4 | 5 | ![range_partition](imgs/range_partition.png) 6 | 7 | On each machine: 8 | 1. The feature tensor needs to be pinned so that the RNIC and the GPU can access its memory directly. 9 | 10 | 2. The feature tensor should live in SHM because multiple processes need to access its data. 11 | 12 | ![memory_usage](imgs/memory_usage.png) 13 | 14 | Pinning memory doesn't consume extra memory, but moving a torch.Tensor to SHM causes a peak memory usage of 15 | 2x the original data size. 16 | 17 | To solve this problem, we implement `quiver_feature.shared_load` as a replacement for the original `torch.load`. **`quiver_feature.shared_load` behaves almost exactly like `torch.load` except that it loads data directly into SHM**. So the peak memory while creating `DistTensorPGAS` with `quiver_feature.shared_load` is only around the original data size, **half of that when using torch.load**. 18 | 19 | ![shared_load](imgs/shared_load.png) 20 | 21 | ![peak_memory](imgs/peak_memory_footprint.png) 22 | 23 | You can check our [test script](../tests/python/test_SharedLoader.py) for more details. 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /docs/partition_methods.md: -------------------------------------------------------------------------------- 1 | # Partition Methods 2 | 3 | This doc mainly describes the feature partition methods we use in `Quiver-Feature`. 4 | 5 | # Metadata Of Each Partition 6 | 7 | The default metadata for each partition is `TensorEndPoint`, which records the `Range` information of each server. 8 | 9 | ```python 10 | Range = namedtuple("Range", ["start", "end"]) 11 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"]) 12 | 13 | ``` 14 | For example, in the following partition setting, we have a list of `TensorEndPoint` as shown below. With this list, we can easily compute the `server_rank` and `local offset` of a certain node idx. 15 | 16 | ```python 17 | [ 18 | TensorEndPoint(server_rank=0, ip=ip0, port=port0, range=Range(start=0, end=M)), 19 | TensorEndPoint(server_rank=1, ip=ip1, port=port1, range=Range(start=M, end=N)) 20 | ] 21 | ``` 22 | 23 | ![](imgs/range_partition.png) 24 | 25 | # Range Partition 26 | Range partition is the default partition method we support for now. Taking the following partition setting as an example, we simply assign [0, M) to Machine0 and [M, N) to Machine1. 27 | 28 | ![](imgs/range_partition.png) -------------------------------------------------------------------------------- /docs/rdma_details.md: -------------------------------------------------------------------------------- 1 | # RDMA Details 2 | 3 | This doc mainly describes how we use RDMA for remote data access and summarizes the techniques we use to get the best RDMA performance. 4 | 5 | Before we start, we would like to show our appreciation to [@claudebarthels](https://github.com/claudebarthels) for developing [infinity](https://github.com/claudebarthels/infinity), a lightweight C++ RDMA library for IB which is also the code base for our RDMA implementation. 6 | 7 | 8 | ## Use RDMA READ for Feature Collection 9 | 10 | As we mentioned in the [README](../README.md), `quiver_feature.DistTensorPGAS` is a 2-dimensional distributed tensor abstraction over different memory spaces using the `PGAS` (Partitioned Global Address Space) model, and **`quiver_feature.DistTensorPGAS` is partitioned by row onto different machines**.
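To make the row-to-server arithmetic concrete, here is a minimal sketch of the lookup. The `Range` and `TensorEndPoint` tuples mirror `quiver_feature.common`; the `locate` helper itself is an illustration made up for this doc, not the library's internal code.

```python
from collections import namedtuple

Range = namedtuple("Range", ["start", "end"])
TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])

def locate(row_idx, tensor_endpoints, feature_dim, element_size=4):
    # Find the server whose [start, end) range holds row_idx, then turn the
    # global row index into a local byte offset on that server.
    for ep in tensor_endpoints:
        if ep.range.start <= row_idx < ep.range.end:
            local_offset = (row_idx - ep.range.start) * feature_dim * element_size
            return ep.server_rank, local_offset
    raise IndexError(f"row {row_idx} is not covered by any partition")

endpoints = [
    TensorEndPoint(0, "10.0.0.1", 3344, Range(0, 1000)),
    TensorEndPoint(1, "10.0.0.2", 3344, Range(1000, 2000)),
]
assert locate(1200, endpoints, feature_dim=128) == (1, 200 * 128 * 4)
```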
11 | ![memory_view](imgs/pgas_tensor_view.png) 12 | 13 | By default, we use `range partition`. When we want to access a certain row of `quiver_feature.DistTensorPGAS`, **we can compute the target machine's index and the memory offset of this row on that target machine directly from the row index**, as sketched above. 14 | 15 | ![range_partition](imgs/range_partition.png) 16 | 17 | 18 | Since each row's data size is known in advance, **we can use one single `RDMA READ` to fetch the wanted row's data (which corresponds to a single node's feature)**. 19 | 20 | ![memory_view](imgs/pgas_tensor_view.png) 21 | 22 | So **each batch's feature collection involves millions of `RDMA READ`s**, one `READ` per node feature. 23 | 24 | ![feature_collection](imgs/one_batch_feature_collection.png) 25 | 26 | ## 4 Techniques We Use 27 | Feature collection involves millions of small `RDMA READ`s (each `READ` may read just 2KB of data), and we use the following 4 techniques to get the best performance. 28 | 29 | ### Rule 1: Use Multiple QPs Per Client 30 | 31 | RDMA hosts use Queue Pairs (QPs) to communicate with each other. Nowadays, RNICs contain a pool of processing units (PUs), and we believe that requests in the same QP are always processed by the same PU to avoid cross-PU synchronization. But a CPU is much more powerful than a single PU, so if we only use one QP per RDMA client, performance can easily be bottlenecked by that PU. We therefore use multiple QPs per RDMA client and dispatch READ requests evenly across these QPs to take full advantage of the RNIC's parallel processing ability. 32 | 33 | ![multi_qp](imgs/multi_qp.png) 34 | 35 | 36 | ### Rule 2: Only Set A Subset Of All Requests as Signaled 37 | 38 | Each RDMA read request can be set as signaled or unsignaled. Signaled requests need CPU intervention, but users can check their result status by polling CQs (Completion Queues). Unsignaled requests don't involve the CPU, but users have to find their own way to check whether these requests completed successfully. 39 | 40 | As we said before, each batch's feature collection involves millions of `RDMA READ` requests. For each QP, we send these requests sequentially but set only one request out of every `CQ_MOD` (which we usually set to 128) as signaled, i.e. we set only 1/128 of all requests as signaled and check their result status. We also set the last request as signaled and wait for its completion to make sure that all requests in this QP have completed. If all signaled requests report success, we assume that all requests completed successfully. 41 | 42 | In the future we may add more mechanisms for handling failures: if a signaled request has failed, we will retry its group of `CQ_MOD` requests. Even with that, we cannot guarantee that all requests complete successfully. 43 | 44 | ![subset_signaled](imgs/subset_signaled_requests.png) 45 | 46 |
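The signaling policy of Rule 2 can be summarized by the following schematic Python sketch. It is pseudocode for exposition only, not our C++ implementation; `qp.post_read` and `qp.poll_completions` are hypothetical stand-ins for the underlying verbs calls.

```python
CQ_MOD = 128  # signal one request out of every CQ_MOD

def post_reads(qp, requests):
    # Post all READs on one QP, marking only every CQ_MOD-th request (and
    # always the final one) as signaled. Completions are polled only for
    # those signaled markers, which keeps CPU intervention to a minimum.
    outstanding_signals = 0
    for i, req in enumerate(requests):
        signaled = (i % CQ_MOD == CQ_MOD - 1) or (i == len(requests) - 1)
        qp.post_read(req, signaled=signaled)
        if signaled:
            outstanding_signals += 1
    while outstanding_signals > 0:
        # poll_completions() returns the number of signaled completions seen.
        outstanding_signals -= qp.poll_completions()
```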
47 | ### Rule 3: Set QP's max_rd_atomic as the RNIC's max_qp_rd_atom 48 | 49 | `max_rd_atomic` is a crucial QP attribute for performance: it is the number of outstanding RDMA Reads and atomic operations that an RC QP can handle as an initiator at any time. We suggest you set it to the RNIC's `max_qp_rd_atom`, which you can get by calling `ibv_query_device()`. You can refer to [our code](https://github.com/quiver-team/quiver-feature/blob/main/csrc/include/infinity/queues/QueuePair.cpp#L38) to see how to set this attribute. 50 | 51 | ### Rule 4: Reduce Address Translation Overhead 52 | 53 | The RNIC uses DMA to access system memory. Since DMA can only handle physical addresses, the memory region exposed to the RNIC must be registered so that the RNIC stores the virtual-to-physical mapping of this region in its MTT (Memory Translation Table). The MTT is stored in system memory, but the RNIC's SRAM caches part of it. Every time the RNIC receives an RDMA read/write request, it first translates the user's virtual address to a physical address by looking it up in its MTT cache; if the cache misses, it sends a request through PCIe to fetch the mapping from system memory, which may bring severe overhead and thus degrade RDMA performance. 54 | 55 | ![rdma_mtt](imgs/rdma_mtt.png) 56 | 57 | To reduce this address translation overhead, we sort the requested node ids before sending RDMA requests. This increases memory access locality, so the RNIC's MTT cache gets a higher hit rate. 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /examples/mag240m/README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | The distributed training setup on the MAG240M dataset is almost the same as the [official example in DGL](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) except that we use `Quiver-Feature` for distributed feature collection. 4 | 5 | Our implementation is much faster than DGL's official example while achieving similar accuracy. 6 | 7 | # Data Preprocess & Partition 8 | 9 | First, please run [preprocess.py](./preprocess.py) to generate `graph.dgl` and `full.npy`; you can check [DGL's official guide](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M) for more details. 10 | 11 | Then we use [Range Partition](../../docs/partition_methods.md) to partition the feature data. It is very easy to understand; you can check [preprocess_quiver.py](./preprocess_quiver.py) for more details. 12 | 13 | ![](../../docs/imgs/range_partition.png) 14 | 15 | 16 | # Running The Training Script 17 | 18 | On each machine, please run: 19 | 20 | python3 distribute_training.py \ 21 | --rootdir . \ 22 | --graph-path ./graph.dgl \ 23 | --feature-partition-path ./feature_part.pt \ 24 | --server_world_size 2 \ 25 | --server_rank 0 26 | 27 | Remember to: 28 | 29 | - Set the SHM size limit as large as your physical memory size. You can set it with: 30 | 31 | sudo mount -o remount,size=300G /dev/shm 32 | 33 | - Set `MASTER_IP` to your master node's IP 34 | 35 | 36 | The validation accuracy is 0.680. We do not have ground-truth test labels, so we do not report test accuracy. 37 | 38 | # Performance 39 | 40 | With 2 machines and 1 GPU per machine, each epoch needs 2 minutes 10 seconds for training and 15 seconds for validation. This is 3x faster than [DGL's performance result](https://github.com/dmlc/dgl/tree/master/examples/pytorch/ogb_lsc/MAG240M). 41 | 42 | From the logs we can see that most of the training time of each iteration is spent on model computation. 43 | 44 | Avg_Sample: 0.0051s, Avg_Feature: 0.0176s, Avg_Model: 0.1801s, Avg_Feature_BandWidth = 14588.4937 MB/s 45 | 46 | # Hardware Configurations 47 | 48 | We have 2 machines, each with 377GB of memory, connected by 100Gbps IB. Running the training script consumes around 256GB of memory.
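For orientation, here is a minimal sketch of how the pieces of this example fit together, based on the constructor signatures in `quiver_feature` (`DistHelper`, `Range`, `PipeParam`, `DistTensorPGAS`). The IP, ports, and tensor sizes below are made-up placeholders, not the values used by this example.

```python
from quiver_feature import DistHelper, DistTensorPGAS, PipeParam, Range

MASTER_IP = "127.0.0.1"                    # placeholder: your master node's IP
SERVER_WORLD_SIZE, SERVER_RANK = 2, 0
ROWS_PER_SERVER, FEATURE_DIM = 1000, 768   # placeholder sizes

# Exchange each server's tensor range through the TCPStore on the master.
helper = DistHelper(MASTER_IP, 5678, SERVER_WORLD_SIZE, SERVER_RANK)
local_range = Range(start=SERVER_RANK * ROWS_PER_SERVER,
                    end=(SERVER_RANK + 1) * ROWS_PER_SERVER)
endpoints = helper.exchange_tensor_endpoints_info(local_range)

# qp_num, ctx_poll_batch, tx_depth, post_list_size (see config.py)
pipe_param = PipeParam(8, 16, 2048, 128)
dist_tensor = DistTensorPGAS(SERVER_RANK, endpoints, pipe_param,
                             buffer_tensor_shape=[1024, FEATURE_DIM])
```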
49 | -------------------------------------------------------------------------------- /examples/mag240m/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "155.198.152.17" 3 | #MASTER_IP = "127.0.0.1" 4 | HLPER_PORT = 5678 5 | NODE_COUNT = 1200000 6 | FEATURE_DIM = 128 7 | FEATURE_TYPE_SIZE = 4 8 | SAMPLE_NUM = 80000 9 | ITER_NUM = 10 10 | POST_LIST_SIZE = 128 11 | QP_NUM = 8 12 | TX_DEPTH = 2048 13 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 14 | TEST_TLB_OPTIMIZATION = True 15 | 16 | # For MAG240M Training 17 | SAMPLE_PARAM = [15, 25] 18 | BATCH_SIZE = 1024 19 | -------------------------------------------------------------------------------- /examples/mag240m/preprocess.py: -------------------------------------------------------------------------------- 1 | import ogb 2 | from ogb.lsc import MAG240MDataset 3 | import tqdm 4 | import numpy as np 5 | import torch 6 | import dgl 7 | import dgl.function as fn 8 | import argparse 9 | import os 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--rootdir', type=str, default='.', help='Directory to download the OGB dataset.') 13 | parser.add_argument('--author-output-path', type=str, help='Path to store the author features.') 14 | parser.add_argument('--inst-output-path', type=str, 15 | help='Path to store the institution features.') 16 | parser.add_argument('--graph-output-path', type=str, help='Path to store the graph.') 17 | parser.add_argument('--graph-format', type=str, default='csc', help='Graph format (coo, csr or csc).') 18 | parser.add_argument('--graph-as-homogeneous', action='store_true', help='Store the graph as DGL homogeneous graph.') 19 | parser.add_argument('--full-output-path', type=str, 20 | help='Path to store features of all nodes. Effective only when graph is homogeneous.') 21 | args = parser.parse_args() 22 | 23 | print('Building graph') 24 | dataset = MAG240MDataset(root=args.rootdir) 25 | ei_writes = dataset.edge_index('author', 'writes', 'paper') 26 | ei_cites = dataset.edge_index('paper', 'paper') 27 | ei_affiliated = dataset.edge_index('author', 'institution') 28 | 29 | # We sort the nodes starting with the papers, then the authors, then the institutions. 30 | author_offset = 0 31 | inst_offset = author_offset + dataset.num_authors 32 | paper_offset = inst_offset + dataset.num_institutions 33 | 34 | g = dgl.heterograph({ 35 | ('author', 'write', 'paper'): (ei_writes[0], ei_writes[1]), 36 | ('paper', 'write-by', 'author'): (ei_writes[1], ei_writes[0]), 37 | ('author', 'affiliate-with', 'institution'): (ei_affiliated[0], ei_affiliated[1]), 38 | ('institution', 'affiliate', 'author'): (ei_affiliated[1], ei_affiliated[0]), 39 | ('paper', 'cite', 'paper'): (np.concatenate([ei_cites[0], ei_cites[1]]), np.concatenate([ei_cites[1], ei_cites[0]])) 40 | }) 41 | 42 | paper_feat = dataset.paper_feat 43 | author_feat = np.memmap(args.author_output_path, mode='w+', dtype='float16', shape=(dataset.num_authors, dataset.num_paper_features)) 44 | inst_feat = np.memmap(args.inst_output_path, mode='w+', dtype='float16', shape=(dataset.num_institutions, dataset.num_paper_features)) 45 | 46 | # Iteratively process author features along the feature dimension. 
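# The column-blocked loop below bounds peak memory: for BLOCK_COLS feature
# columns at a time, author features are computed as the mean over the papers
# each author writes (via the reverse 'write-by' edges), and institution
# features as the mean over their affiliated authors (via 'affiliate-with').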
47 | BLOCK_COLS = 16 48 | with tqdm.trange(0, dataset.num_paper_features, BLOCK_COLS) as tq: 49 | for start in tq: 50 | tq.set_postfix_str('Reading paper features...') 51 | g.nodes['paper'].data['x'] = torch.FloatTensor(paper_feat[:, start:start + BLOCK_COLS].astype('float32')) 52 | # Compute author features... 53 | tq.set_postfix_str('Computing author features...') 54 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='write-by') 55 | # Then institution features... 56 | tq.set_postfix_str('Computing institution features...') 57 | g.update_all(fn.copy_u('x', 'm'), fn.mean('m', 'x'), etype='affiliate-with') 58 | tq.set_postfix_str('Writing author features...') 59 | author_feat[:, start:start + BLOCK_COLS] = g.nodes['author'].data['x'].numpy().astype('float16') 60 | tq.set_postfix_str('Writing institution features...') 61 | inst_feat[:, start:start + BLOCK_COLS] = g.nodes['institution'].data['x'].numpy().astype('float16') 62 | del g.nodes['paper'].data['x'] 63 | del g.nodes['author'].data['x'] 64 | del g.nodes['institution'].data['x'] 65 | author_feat.flush() 66 | inst_feat.flush() 67 | 68 | # Convert to homogeneous if needed. (The RGAT baseline needs homogeneous graph) 69 | if args.graph_as_homogeneous: 70 | # Process graph 71 | g = dgl.to_homogeneous(g) 72 | # DGL ensures that nodes with the same type are put together with the order preserved. 73 | # DGL also ensures that the node types are sorted in ascending order. 74 | assert torch.equal( 75 | g.ndata[dgl.NTYPE], 76 | torch.cat([torch.full((dataset.num_authors,), 0), 77 | torch.full((dataset.num_institutions,), 1), 78 | torch.full((dataset.num_papers,), 2)])) 79 | assert torch.equal( 80 | g.ndata[dgl.NID], 81 | torch.cat([torch.arange(dataset.num_authors), 82 | torch.arange(dataset.num_institutions), 83 | torch.arange(dataset.num_papers)])) 84 | g.edata['etype'] = g.edata[dgl.ETYPE].byte() 85 | del g.edata[dgl.ETYPE] 86 | del g.ndata[dgl.NTYPE] 87 | del g.ndata[dgl.NID] 88 | 89 | # Process feature 90 | full_feat = np.memmap( 91 | args.full_output_path, mode='w+', dtype='float16', 92 | shape=(dataset.num_authors + dataset.num_institutions + dataset.num_papers, dataset.num_paper_features)) 93 | BLOCK_ROWS = 100000 94 | for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS): 95 | end = min(dataset.num_authors, start + BLOCK_ROWS) 96 | full_feat[author_offset + start:author_offset + end] = author_feat[start:end] 97 | for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS): 98 | end = min(dataset.num_institutions, start + BLOCK_ROWS) 99 | full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end] 100 | for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS): 101 | end = min(dataset.num_papers, start + BLOCK_ROWS) 102 | full_feat[paper_offset + start:paper_offset + end] = paper_feat[start:end] 103 | 104 | # Convert the graph to the given format and save. 
(The RGAT baseline needs CSC graph) 105 | g = g.formats(args.graph_format) 106 | dgl.save_graphs(args.graph_output_path, g) -------------------------------------------------------------------------------- /examples/mag240m/preprocess_quiver.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | meta = torch.load("/data/mag/mag240m_kddcup2021/meta.pt") 6 | 7 | print("Dataset Loading Finished") 8 | 9 | paper_offset = meta["author"] + meta["institution"] 10 | num_nodes = paper_offset + meta["paper"] 11 | num_features = 768 12 | 13 | feats = np.memmap("/data/dalong/full.npy", mode='r', dtype='float16', shape=(num_nodes, num_features)) 14 | 15 | print("Paper Loading Finished") 16 | 17 | print("Creating Float16 Tensor") 18 | tensor_feature = torch.HalfTensor(feats[num_nodes//2: ]) 19 | 20 | torch.save(tensor_feature, "/data/dalong/second_half.pt") 21 | 22 | 23 | -------------------------------------------------------------------------------- /examples/ogb-products/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For OGB-Products Training 16 | SAMPLE_PARAM = [15, 10, 5] 17 | BATCH_SIZE = 1024 18 | -------------------------------------------------------------------------------- /examples/reddit/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For Reddit Training 16 | SAMPLE_PARAM = [25, 10] 17 | BATCH_SIZE = 256 18 | -------------------------------------------------------------------------------- /quiver_feature/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .
--------------------------------------------------------------------------------
/quiver_feature/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from . import multiprocessing
3 | from .dist_tensor_rpc import DistTensorRPC
4 | from .common import Range, TensorEndPoint, DistTensorDeviceParam, DistTensorServerParam
5 | from .dist_tensor_pgas import DistTensor as DistTensorPGAS
6 | from .dist_helper import DistHelper
7 | from .local_tensor_pgas import LocalTensorPGAS
8 | from .tensor_loader import shared_load
9 | from .utils import serve_tensor_for_remote_access
10 | from qvf import PipeParam, DistTensorServer
11 |
12 | __all__ = ["DistTensorRPC", "DistTensorPGAS", "LocalTensorPGAS", "Range", "TensorEndPoint", "DistHelper",
13 |            "shared_load", "PipeParam", "DistTensorServer", "serve_tensor_for_remote_access", "DistTensorServerParam", "DistTensorDeviceParam"]
14 |
--------------------------------------------------------------------------------
/quiver_feature/common.py:
--------------------------------------------------------------------------------
1 | from collections import namedtuple
2 | Range = namedtuple("Range", ["start", "end"])
3 | TensorEndPoint = namedtuple("TensorEndPoint", ["server_rank", "ip", "port", "range"])
4 | DistTensorServerParam = namedtuple("DistTensorServerParam", ["port_num", "server_world_size", "device_per_server"])
5 | DistTensorServerParam.__new__.__defaults__ = (3344, 1, 1)
6 | DistTensorDeviceParam = namedtuple("DistTensorDeviceParam", ["device_list", "device_cache_size", "cache_policy"])
7 | DistTensorDeviceParam.__new__.__defaults__ = ([], 0, "device_replicate")
--------------------------------------------------------------------------------
/quiver_feature/dist_helper.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as torch_dist
2 | import socket
3 | import pickle
4 | from datetime import timedelta
5 | from .common import TensorEndPoint, Range
6 |
7 | def resolve_my_ip():
8 |     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
9 |     s.connect(("8.8.8.8", 80))
10 |     my_ip = s.getsockname()[0]
11 |     return my_ip
12 |
13 | class DistHelper:
14 |     def __init__(self, master_ip: str, master_port: int, world_size: int, my_rank: int):
15 |         self.tcp_store = torch_dist.TCPStore(master_ip, master_port, world_size, my_rank == 0, wait_for_workers=True, multi_tenant=True)
16 |         self.my_server_rank = my_rank
17 |         self.server_world_size = world_size
18 |         self.sync_point = 0
19 |
20 |     def exchange_tensor_endpoints_info(self, local_tensor_range: Range, dist_tensor_server_port=3344):
21 |         my_ip = resolve_my_ip()
22 |
23 |         local_tensor_endpoint = TensorEndPoint(server_rank=self.my_server_rank, ip=my_ip, port=dist_tensor_server_port, range=local_tensor_range)
24 |         pickled_data = pickle.dumps(local_tensor_endpoint)
25 |         self.tcp_store.set(f"worker{self.my_server_rank}_data", pickled_data)
26 |
27 |
28 |         tensor_endpoints = [0] * self.server_world_size
29 |         tensor_endpoints[self.my_server_rank] = local_tensor_endpoint
30 |         for rank in range(self.server_world_size):
31 |             if rank != self.my_server_rank:
32 |                 tensor_endpoints[rank] = pickle.loads(self.tcp_store.get(f"worker{rank}_data"))
33 |
34 |         self.tcp_store.set(f"worker{self.my_server_rank}_status", "DONE")
35 |
36 |         keys = [f"worker{rank}_status" for rank in range(self.server_world_size)]
37 |         if self.my_server_rank == 0:
38 |             while True:
39 |                 try:
40 |                     self.tcp_store.wait(keys, timedelta(seconds=1))
41 |                     break
42 |                 except Exception:
43 |                     pass
44 |
45 |
46 |         return tensor_endpoints
47 |
48 |     def sync_all(self):
49 |         self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", "SYNC1")
50 |
51 |         keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)]
[f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)] 52 | while True: 53 | try: 54 | self.tcp_store.wait(keys, timedelta(seconds=1)) 55 | break 56 | except: 57 | pass 58 | 59 | 60 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1") 61 | 62 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)] 63 | if self.my_server_rank == 0: 64 | while True: 65 | try: 66 | self.tcp_store.wait(keys, timedelta(seconds=1)) 67 | break 68 | except: 69 | pass 70 | 71 | 72 | # TODO Delete Keys 73 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}") 74 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}") 75 | self.sync_point += 1 76 | 77 | def sync_start(self): 78 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_start_{self.sync_point}", f"SYNC") 79 | 80 | def sync_end(self): 81 | 82 | 83 | keys = [f"worker{rank}_sync_start_{self.sync_point}" for rank in range(self.server_world_size)] 84 | while True: 85 | try: 86 | self.tcp_store.wait(keys, timedelta(seconds=1)) 87 | break 88 | except: 89 | pass 90 | 91 | self.tcp_store.set(f"worker{self.my_server_rank}_sync_end_{self.sync_point}", f"SYNC1") 92 | 93 | keys = [f"worker{rank}_sync_end_{self.sync_point}" for rank in range(self.server_world_size)] 94 | if self.my_server_rank == 0: 95 | while True: 96 | try: 97 | self.tcp_store.wait(keys, timedelta(seconds=1)) 98 | break 99 | except: 100 | pass 101 | # TODO Delete Keys 102 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_start_{self.sync_point}") 103 | #self.tcp_store.deleteKey(f"worker{self.my_server_rank}_sync_end_{self.sync_point}") 104 | self.sync_point += 1 105 | -------------------------------------------------------------------------------- /quiver_feature/dist_tensor_pgas.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | from typing import List 4 | from .common import Range, TensorEndPoint, DistTensorServerParam, DistTensorDeviceParam 5 | from .dist_helper import DistHelper 6 | from .local_tensor_pgas import LocalTensorPGAS 7 | from .utils import serve_tensor_for_remote_access 8 | 9 | FloatType = [torch.float32, torch.float64, torch.float16, torch.bfloat16] 10 | IntType = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64] 11 | 12 | class DistTensor: 13 | def __init__(self, server_rank, tensor_endpoints: List[TensorEndPoint], pipe_param: qvf.PipeParam, buffer_tensor_shape, cached_range: Range= Range(start=0, end=0), order_transform:torch.Tensor=None, dtype=torch.float32)-> None: 14 | 15 | # About DistTensorClient 16 | self.server_rank = server_rank 17 | self.world_size = len(tensor_endpoints) 18 | self.tensor_endpoints = sorted(tensor_endpoints, key= lambda x: x.server_rank) 19 | self.buffer_tensor_shape = buffer_tensor_shape 20 | self.pipe_param = pipe_param 21 | self.com_endpoints = [qvf.ComEndPoint(item.server_rank, item.ip, item.port) for item in tensor_endpoints] 22 | 23 | self.data_type = dtype 24 | 25 | # About Lazy Init 26 | self.inited = False 27 | 28 | # About ShardTensor 29 | self.local_tensor_pgas = None 30 | self.cached_range = cached_range 31 | self.device_rank = -1 32 | self.order_transform = order_transform 33 | 34 | @property 35 | def dtype(self): 36 | return self.data_type 37 | 38 | def lazy_init(self): 39 | if self.inited: 40 | return 41 | self.inited = True 42 | 43 | self.device_rank = 
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_pgas.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import qvf
3 | from typing import List
4 | from .common import Range, TensorEndPoint, DistTensorServerParam, DistTensorDeviceParam
5 | from .dist_helper import DistHelper
6 | from .local_tensor_pgas import LocalTensorPGAS
7 | from .utils import serve_tensor_for_remote_access
8 |
9 | FloatType = [torch.float32, torch.float64, torch.float16, torch.bfloat16]
10 | IntType = [torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64]
11 |
12 | class DistTensor:
13 |     def __init__(self, server_rank, tensor_endpoints: List[TensorEndPoint], pipe_param: qvf.PipeParam, buffer_tensor_shape, cached_range: Range = Range(start=0, end=0), order_transform: torch.Tensor = None, dtype=torch.float32) -> None:
14 |
15 |         # About DistTensorClient
16 |         self.server_rank = server_rank
17 |         self.world_size = len(tensor_endpoints)
18 |         self.tensor_endpoints = sorted(tensor_endpoints, key=lambda x: x.server_rank)
19 |         self.buffer_tensor_shape = buffer_tensor_shape
20 |         self.pipe_param = pipe_param
21 |         self.com_endpoints = [qvf.ComEndPoint(item.server_rank, item.ip, item.port) for item in tensor_endpoints]
22 |
23 |         self.data_type = dtype
24 |
25 |         # About Lazy Init
26 |         self.inited = False
27 |
28 |         # About ShardTensor
29 |         self.local_tensor_pgas = None
30 |         self.cached_range = cached_range
31 |         self.device_rank = -1
32 |         self.order_transform = order_transform
33 |
34 |     @property
35 |     def dtype(self):
36 |         return self.data_type
37 |
38 |     def lazy_init(self):
39 |         if self.inited:
40 |             return
41 |         self.inited = True
42 |
43 |         self.device_rank = torch.cuda.current_device()
44 |
45 |         # Create DistTensorClient
46 |         self.dist_tensor_client = qvf.DistTensorClient(self.server_rank, self.com_endpoints, self.pipe_param)
47 |         self.registered_tensor = torch.zeros(self.buffer_tensor_shape, dtype=self.dtype).pin_memory()
48 |         self.dist_tensor_client.register_float_tensor(self.registered_tensor)
49 |
50 |         if self.order_transform is not None:
51 |             self.order_transform = self.order_transform.to(self.device_rank)
52 |
53 |
54 |     def from_cpu_tensor(self, cpu_tensor, dist_helper: DistHelper, server_param: DistTensorServerParam = None, device_param: DistTensorDeviceParam = None):
55 |
56 |         self.data_type = cpu_tensor.dtype
57 |
58 |         server_param: DistTensorServerParam = server_param or DistTensorServerParam()
59 |         device_param: DistTensorDeviceParam = device_param or DistTensorDeviceParam()
60 |
61 |         cpu_tensor.share_memory_()
62 |
63 |         # Start Server
64 |         serve_tensor_for_remote_access(server_param.port_num, self.pipe_param.get_param_vec()[0], server_param.server_world_size, server_param.device_per_server, cpu_tensor, dist_helper)
65 |
66 |         # Build Local Tensor
67 |         self.local_tensor_pgas = LocalTensorPGAS(device_param.device_list, device_param.device_cache_size, device_param.cache_policy)
68 |         self.local_tensor_pgas.from_cpu_tensor(cpu_tensor)
69 |
70 |
71 |     def to(self, device_rank):
72 |         self.device_rank = device_rank
73 |         if self.order_transform is not None:
74 |             self.order_transform = self.order_transform.to(device_rank)
75 |
76 |         return self
77 |
78 |     def size(self, dim):
79 |         assert dim < 2, "DistTensorPGAS is 2-dimensional"
80 |         if dim == 1:
81 |             return self.buffer_tensor_shape[1]
82 |         if dim == 0:
83 |             all_ends = [item.range.end for item in self.tensor_endpoints]
84 |             all_ends.sort()
85 |             return all_ends[-1]
86 |
87 |     @property
88 |     def shape(self):
89 |         return [self.size(0), self.size(1)]
90 |
91 |     def collect(self, nodes):
92 |         nodes -= self.tensor_endpoints[self.server_rank].range.start
93 |         nodes += self.cached_range.end
94 |         data = self.local_tensor_pgas[nodes]
95 |         return data
96 |
97 |     def collect_cached_data(self, nodes):
98 |         data = self.local_tensor_pgas[nodes]
99 |         return data
100 |
101 |     def cal_remote_offsets(self, nodes, server_rank):
102 |         remote_offsets = (nodes - self.tensor_endpoints[server_rank].range.start + self.cached_range.end) * self.buffer_tensor_shape[1] * self.registered_tensor.element_size()
103 |         return remote_offsets
104 |
105 |     def __getitem__(self, nodes):
106 |
107 |         self.lazy_init()
108 |         nodes = nodes.cuda()
109 |         if self.order_transform is not None:
110 |             nodes = self.order_transform[nodes]
111 |
112 |         input_orders = torch.arange(nodes.size(0), dtype=torch.long, device=nodes.device)
113 |
114 |         feature = torch.empty(nodes.shape[0], self.shape[1], device=nodes.device, dtype=self.dtype)
115 |
116 |         cache_nodes_mask = None
117 |         local_nodes_mask = None
118 |
119 |
120 |         # Load cache data
121 |         if self.cached_range.end > 0:
122 |             cache_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
123 |             cache_request_nodes = torch.masked_select(nodes, cache_nodes_mask)
124 |             cache_part_orders = torch.masked_select(input_orders, cache_nodes_mask)
125 |             if cache_request_nodes.shape[0] > 0:
126 |                 feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes)
127 |
128 |
129 |
130 |
131 |         # Load local data
132 |         range_item = self.tensor_endpoints[self.server_rank].range
133 |         local_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
134 |         local_request_nodes = torch.masked_select(nodes, local_nodes_mask)
135 |         local_part_orders = torch.masked_select(input_orders, local_nodes_mask)
136 |         if local_request_nodes.shape[0] > 0:
137 |             feature[local_part_orders] = self.collect(local_request_nodes)
138 |
139 |
140 |         # Collect Remote Data
141 |         if cache_nodes_mask is None:
142 |             all_remote_nodes_mask = torch.logical_not(local_nodes_mask)
143 |         else:
144 |             all_remote_nodes_mask = torch.logical_not(torch.logical_or(local_nodes_mask, cache_nodes_mask))
145 |
146 |         all_remote_nodes = torch.masked_select(nodes, all_remote_nodes_mask)
147 |         all_remote_orders = torch.masked_select(input_orders, all_remote_nodes_mask)
148 |
149 |         assert all_remote_nodes.shape[0] <= self.registered_tensor.shape[0], "Collected Data Exceeds Buffer Size"
150 |
151 |         for server_rank in range(self.world_size):
152 |
153 |             range_item = self.tensor_endpoints[server_rank].range
154 |             if server_rank != self.server_rank:
155 |                 request_nodes_mask = (all_remote_nodes >= range_item.start) & (all_remote_nodes < range_item.end)
156 |                 request_nodes = torch.masked_select(all_remote_nodes, request_nodes_mask)
157 |                 if request_nodes.shape[0] > 0:
158 |                     local_orders = torch.masked_select(input_orders[:all_remote_nodes.shape[0]], request_nodes_mask)
159 |                     local_offsets = local_orders * self.registered_tensor.shape[1] * self.registered_tensor.element_size()
160 |                     remote_offsets = self.cal_remote_offsets(request_nodes, server_rank)
161 |                     self.dist_tensor_client.sync_read(server_rank, self.registered_tensor, local_offsets.cpu(), remote_offsets.cpu())
162 |
163 |         feature[all_remote_orders] = self.registered_tensor[:all_remote_nodes.shape[0]].to(self.device_rank)
164 |         return feature
165 |
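With endpoints exchanged and a pipe configured, the client-side pieces compose as below. This is a sketch under assumptions: cpu_feature, the buffer shape, and the node-id batch are placeholders, while dist_helper, tensor_endpoints, and pipe_param are taken from the earlier sketches:

    import torch
    from quiver_feature import DistTensorPGAS, Range

    dist_tensor = DistTensorPGAS(my_rank, tensor_endpoints, pipe_param,
                                 buffer_tensor_shape=[80000, 128],
                                 cached_range=Range(start=0, end=0))
    # Serves the local shard for remote readers and builds the local tensor.
    dist_tensor.from_cpu_tensor(cpu_feature, dist_helper)

    nodes = torch.randint(0, dist_tensor.size(0), (1024,), device="cuda")
    feature = dist_tensor[nodes]  # cached/local rows come from LocalTensorPGAS,
                                  # remote rows arrive via one-sided RDMA reads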
--------------------------------------------------------------------------------
/quiver_feature/dist_tensor_rpc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.distributed.rpc as rpc
3 | from typing import List
4 | from .common import Range
5 |
6 | class Task:
7 |     def __init__(self, prev_order, fut):
8 |         self.prev_order_ = prev_order
9 |         self.fut_ = fut
10 |         self.data_ = None
11 |
12 |     def wait(self):
13 |         self.data_ = self.fut_.wait()
14 |
15 |     @property
16 |     def data(self):
17 |         return self.data_
18 |
19 |     @property
20 |     def prev_order(self):
21 |         return self.prev_order_
22 |
23 | class Singleton(object):
24 |     def __init__(self, cls):
25 |         self._cls = cls
26 |         self._instance = {}
27 |     def __call__(self, *args, **kwargs):
28 |         if self._cls not in self._instance:
29 |             self._instance[self._cls] = self._cls()
30 |         self._instance[self._cls].init(*args, **kwargs)
31 |         return self._instance[self._cls]
32 |
33 |
34 | def collect(nodes):
35 |     dist_tensor = DistTensorRPC()
36 |     return dist_tensor.collect(nodes)
37 |
38 |
39 | @Singleton
40 | class DistTensorRPC(object):
41 |
42 |     def __init__(self):
43 |         pass
44 |
45 |     def init(self, world_size, rank, local_size, local_rank, shard_tensor, range_list: List[Range], rpc_option, cached_range=Range(start=0, end=0), order_transform=None, **debug_params) -> None:
46 |         self.shard_tensor = shard_tensor
47 |         self.range_list = range_list
48 |         self.cached_range = cached_range
49 |         self.order_transform = None
50 |         if order_transform is not None:
51 |             self.order_transform = order_transform.to(local_rank)
52 |         self.rank = rank
53 |         self.local_rank = local_rank
54 |         self.world_size = world_size
55 |         self.local_size = local_size
56 |         self.debug_params = debug_params
57 |
58 |         rpc.init_rpc(f"worker{rank}", rank=self.rank, world_size=world_size, rpc_backend_options=rpc_option)
59 |
60 |     def collect(self, nodes):
61 |
62 |         # TODO Just For Debugging
63 |         if nodes.is_cuda:
64 |             torch.cuda.set_device(self.local_rank)
65 |         nodes -= self.range_list[self.rank].start
66 |         nodes += self.cached_range.end
67 |         data = self.shard_tensor[nodes]
68 |
69 |         return data
70 |
71 |     def collect_cached_data(self, nodes):
72 |         # TODO Just For Debugging
73 |         if nodes.is_cuda:
74 |             torch.cuda.set_device(self.local_rank)
75 |         data = self.shard_tensor[nodes]
76 |
77 |         return data
78 |
79 |     def __getitem__(self, nodes):
80 |
81 |         task_list: List[Task] = []
82 |         if self.order_transform is not None:
83 |             nodes = self.order_transform[nodes]
84 |         input_orders = torch.arange(nodes.size(0), dtype=torch.long, device=nodes.device)
85 |
86 |         remote_collect = 0
87 |         for worker_id in range(self.local_rank, self.world_size, self.local_size):
88 |             range_item = self.range_list[worker_id]
89 |             if worker_id != self.rank:
90 |                 request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
91 |                 request_nodes = torch.masked_select(nodes, request_nodes_mask)
92 |                 if request_nodes.shape[0] > 0:
93 |                     remote_collect += request_nodes.shape[0]
94 |                     part_orders = torch.masked_select(input_orders, request_nodes_mask)
95 |                     fut = rpc.rpc_async(f"worker{worker_id}", collect, args=(request_nodes,))
96 |                     task_list.append(Task(part_orders, fut))
97 |
98 |         feature = torch.zeros(nodes.shape[0], self.shard_tensor.shape[1], device=f"cuda:{self.local_rank}")
99 |
100 |         # Load Cached Data
101 |         if self.cached_range.end > 0:
102 |             request_nodes_mask = (nodes >= self.cached_range.start) & (nodes < self.cached_range.end)
103 |             cache_request_nodes = torch.masked_select(nodes, request_nodes_mask)
104 |             cache_part_orders = torch.masked_select(input_orders, request_nodes_mask)
105 |             if cache_request_nodes.shape[0] > 0:
106 |                 feature[cache_part_orders] = self.collect_cached_data(cache_request_nodes).to(self.local_rank)
107 |
108 |
109 |         # Load local data
110 |         range_item = self.range_list[self.rank]
111 |         request_nodes_mask = (nodes >= range_item.start) & (nodes < range_item.end)
112 |         local_request_nodes = torch.masked_select(nodes, request_nodes_mask)
113 |         local_part_orders = torch.masked_select(input_orders, request_nodes_mask)
114 |         if local_request_nodes.shape[0] > 0:
115 |             feature[local_part_orders] = self.collect(local_request_nodes).to(self.local_rank)
116 |
117 |         for task in task_list:
118 |             task.wait()
119 |             feature[task.prev_order] = task.data.to(self.local_rank)
120 |         return feature
121 |
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | from .reductions import init_reductions
2 |
3 | init_reductions()
--------------------------------------------------------------------------------
/quiver_feature/multiprocessing/reductions.py:
--------------------------------------------------------------------------------
1 | from multiprocessing.reduction import ForkingPickler
2 | import qvf
3 | from ..local_tensor_pgas import LocalTensorPGAS
4 |
5 | def rebuild_qvf_pipeparam(ipc_handle):
6 |
7 |     pipe_param = qvf.PipeParam()
8 |     pipe_param.set_param_vec(ipc_handle)
9 |     return pipe_param
10 |
11 | def reduce_qvf_pipeparam(pipe_param):
12 |     param_vec = pipe_param.get_param_vec()
13 |     return (rebuild_qvf_pipeparam, (param_vec,))
14 |
15 |
16 | def rebuild_qvf_comendpoint(ipc_handle):
17 |
18 |     com_endpoint = qvf.ComEndPoint(ipc_handle[0], ipc_handle[1], ipc_handle[2])
19 |     return com_endpoint
20 |
21 | def reduce_qvf_comendpoint(com_endpoint):
22 |     param_vec = (com_endpoint.rank(), com_endpoint.address(), com_endpoint.port())
23 |     return (rebuild_qvf_comendpoint, (param_vec,))
24 |
25 | def rebuild_localtensorpgas(ipc_handle):
26 |
27 |     feature = LocalTensorPGAS.lazy_from_ipc_handle(ipc_handle)
28 |     return feature
29 |
30 |
31 | def reduce_localtensorpgas(feature):
32 |
33 |     ipc_handle = feature.share_ipc()
34 |     return (rebuild_localtensorpgas, (ipc_handle,))
35 |
36 | def init_reductions():
37 |     ForkingPickler.register(qvf.PipeParam, reduce_qvf_pipeparam)
38 |     ForkingPickler.register(qvf.ComEndPoint, reduce_qvf_comendpoint)
39 |     ForkingPickler.register(LocalTensorPGAS, reduce_localtensorpgas)
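These ForkingPickler hooks are what let the otherwise non-picklable qvf objects and LocalTensorPGAS cross process boundaries, so a DistTensorPGAS built in the parent process can be handed to trainer workers. A sketch under assumptions; train is a hypothetical worker function and nprocs is arbitrary:

    import torch.multiprocessing as mp
    import quiver_feature  # importing the package runs init_reductions()

    def train(rank, dist_tensor):
        ...  # sample n_id, then call dist_tensor[n_id] in the training loop

    # PipeParam / ComEndPoint / LocalTensorPGAS attributes are rebuilt in each
    # child from param vectors and IPC handles instead of being deep-copied.
    mp.spawn(train, args=(dist_tensor,), nprocs=4)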
--------------------------------------------------------------------------------
/quiver_feature/tensor_loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import qvf
3 | import torch.serialization as se
4 | from torch.serialization import *
5 |
6 |
7 | class _open_zipfile_reader(torch.serialization._opener):
8 |     def __init__(self, name_or_buffer) -> None:
9 |         super(_open_zipfile_reader, self).__init__(qvf.SharedTensorLoader(name_or_buffer))
10 |
11 |
12 | def shared_load(f, map_location=None, pickle_module=pickle, **pickle_load_args):
13 |     se._check_dill_version(pickle_module)
14 |
15 |     if 'encoding' not in pickle_load_args.keys():
16 |         pickle_load_args['encoding'] = 'utf-8'
17 |
18 |     with se._open_file_like(f, 'rb') as opened_file:
19 |         if se._is_zipfile(opened_file):
20 |             # The zipfile reader is going to advance the current file position.
21 |             # If we want to actually tail call to torch.jit.load, we need to
22 |             # reset back to the original position.
23 |             orig_position = opened_file.tell()
24 |             with _open_zipfile_reader(opened_file) as opened_zipfile:
25 |                 if se._is_torchscript_zip(opened_zipfile):
26 |                     warnings.warn("'torch.load' received a zip file that looks like a TorchScript archive"
27 |                                   " dispatching to 'torch.jit.load' (call 'torch.jit.load' directly to"
28 |                                   " silence this warning)", UserWarning)
29 |                     opened_file.seek(orig_position)
30 |                     return torch.jit.load(opened_file)
31 |                 return se._load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
32 |         return se._legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
33 |
--------------------------------------------------------------------------------
/quiver_feature/utils.py:
--------------------------------------------------------------------------------
1 | import threading
2 | from qvf import DistTensorServer
3 |
4 | def server_thread(port_number, qp_num, world_size, tensor, dist_helper):
5 |     dist_tensor_server = DistTensorServer(port_number, world_size, qp_num)
6 |     dist_tensor_server.serve_tensor(tensor)
7 |     dist_helper.sync_start()
8 |     dist_tensor_server.join()
9 |
10 | def serve_tensor_for_remote_access(port_number, qp_num, server_world_size, device_per_server, cpu_tensor, dist_helper):
11 |     server = threading.Thread(target=server_thread, args=(port_number, qp_num, server_world_size * device_per_server, cpu_tensor, dist_helper))
12 |     server.daemon = True
13 |     server.start()
14 |     dist_helper.sync_end()
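serve_tensor_for_remote_access is the standalone server-side entry point: it registers a shared CPU tensor with a DistTensorServer on a daemon thread, and the sync_start/sync_end key exchange above makes the call return only once every server in the group is serving. A short sketch, reusing dist_helper from the earlier snippet (the port and queue-pair count are illustrative):

    from quiver_feature import serve_tensor_for_remote_access

    cpu_tensor.share_memory_()
    serve_tensor_for_remote_access(3344, 8, server_world_size=2,
                                   device_per_server=1,
                                   cpu_tensor=cpu_tensor, dist_helper=dist_helper)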
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import os.path as osp
5 | from itertools import product
6 | from setuptools import setup, find_packages
7 | import platform
8 |
9 | import torch
10 | from torch.__config__ import parallel_info
11 | from torch.utils.cpp_extension import BuildExtension
12 | from torch.utils.cpp_extension import CppExtension, CUDAExtension, CUDA_HOME
13 | import torch.utils.cpp_extension as cpp_extension
14 |
15 | WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
16 | suffices = ['cpu', 'cuda'] if WITH_CUDA else ['cpu']
17 | if os.getenv('FORCE_CUDA', '0') == '1':
18 |     suffices = ['cuda', 'cpu']
19 | if os.getenv('FORCE_ONLY_CUDA', '0') == '1':
20 |     suffices = ['cuda']
21 | if os.getenv('FORCE_ONLY_CPU', '0') == '1':
22 |     suffices = ['cpu']
23 |
24 | BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
25 |
26 | WITH_SYMBOLS = os.getenv('WITH_SYMBOLS', '0') == '1'
27 |
28 |
29 | def get_torch_includes():
30 |     lib_include = os.path.join(cpp_extension._TORCH_PATH, 'include')
31 |     paths = [
32 |         osp.join(lib_include, 'ATen'),
33 |         osp.join(lib_include, 'c10'),
34 |         osp.join(lib_include, 'caffe2'),
35 |     ]
36 |
37 |     return paths
38 |
39 |
40 | def get_extensions():
41 |     extensions = []
42 |     libraries = ['ibverbs']
43 |
44 |     extensions_dir = osp.join('csrc')
45 |
46 |     srcs = glob.glob(osp.join(extensions_dir, 'src', '*.cpp'))
47 |     srcs += glob.glob(osp.join(extensions_dir, 'src', '*.cu'))
48 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/core", '*.cpp'))
49 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/memory", '*.cpp'))
50 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/queues", '*.cpp'))
51 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/requests", '*.cpp'))
52 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "infinity/utils", '*.cpp'))
53 |     srcs += glob.glob(osp.join(extensions_dir, 'include', "miniz", '*.c'))
54 |     includes = osp.join(extensions_dir, 'include/')
55 |
56 |     define_macros = [('WITH_PYTHON', None)]
57 |     extra_compile_args = {
58 |         'cxx': ['-O3', '-std=c++17'],
59 |         'nvcc': ['-O3', '--expt-extended-lambda', '-std=c++17']}
60 |     extra_link_args = [] if WITH_SYMBOLS else ['-s']
61 |
62 |     Extension = CUDAExtension
63 |     extension = Extension(
64 |         'qvf',
65 |         srcs,
66 |         include_dirs=[includes] + get_torch_includes(),
67 |         define_macros=define_macros,
68 |         extra_compile_args=extra_compile_args,
69 |         extra_link_args=extra_link_args,
70 |         libraries=libraries,
71 |     )
72 |     extensions += [extension]
73 |     return extensions
74 |
75 |
76 | install_requires = []
77 | setup_requires = []
78 | tests_require = ['pytest', 'pytest-runner', 'pytest-cov']
79 |
80 | setup(
81 |     name='quiver_feature',
82 |     version='0.0.1',
83 |     author='quiver-team',
84 |     author_email='',
85 |     url='https://github.com/quiver-team/quiver_feature',
86 |     description='A PyTorch library for distributed feature collection in graph learning',
87 |     keywords=['pytorch', 'sparse', 'graph'],
88 |     license='Apache',
89 |     python_requires='>=3.6',
90 |     install_requires=install_requires,
91 |     setup_requires=setup_requires,
92 |     tests_require=tests_require,
93 |     extras_require={'test': tests_require},
94 |     ext_modules=get_extensions() if not BUILD_DOCS else [],
95 |     cmdclass={
96 |         'build_ext':
97 |         BuildExtension.with_options(no_python_abi_suffix=True, use_ninja=False)
98 |     },
99 |     packages=find_packages(),
100 | )
101 |
--------------------------------------------------------------------------------
/tests/cpp/test_DistTensorClient.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #define PORT_NUMBER 3344
12 | #define SERVER_IP "155.198.152.17"
13 |
14 | #define NODE_COUNT 120000LL
15 | #define FEATURE_DIM 256LL
16 | #define FEATURE_TYPE_SIZE 4LL
17 | #define SAMPLE_NUM 80960LL
18 | #define TEST_COUNT 8192LL
19 | #define ITER_NUM 10LL
20 | #define POST_LIST_SIZE 16LL
21 | #define CQ_MOD 16LL
22 | #define QP_NUM 2LL
23 | #define TX_DEPTH 2048LL
24 | #define CTX_POLL_BATCH 16LL
25 |
26 | int min(int a, int b);
27 |
28 | void print_tensor_res(torch::Tensor& res_tensor) {
29 |   float* res = res_tensor.data_ptr<float>();
30 |   for (int col = 0; col < res_tensor.size(1); col++) {
31 |     std::cout << res[0 * res_tensor.size(1) + col] << " ";
32 |   }
33 |   std::cout << std::endl;
34 | }
35 | void check_tensor_res(torch::Tensor& res_tensor,
36 |                       torch::Tensor& remote_offsets) {
37 |   float* res = res_tensor.data_ptr<float>();
38 |   int stride = res_tensor.size(1);
39 |   int64_t* offsets = remote_offsets.data_ptr<int64_t>();
40 |   for (int row = 0; row < remote_offsets.size(0); row++) {
41 |     for (int col = 0; col < res_tensor.size(1); col++) {
42 |       float expected_value =
43 |           float(offsets[row]) / (FEATURE_DIM * FEATURE_TYPE_SIZE);
44 |       QUIVER_FEATURE_ASSERT(
45 |           res[row * stride + col] == expected_value,
46 |           "Result Check Failed At (%d, %d)!, Expected %f, Got %f\n", row, col,
47 |           expected_value, res[row * stride + col]);
48 |     }
49 |   }
50 |   printf("Result Check Passed, Congrats!\n");
51 | }
52 |
53 | void test_dist_tensor_client(int argc, char** argv) {
54 |   qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH,
55 |                             POST_LIST_SIZE);
56 |
57 |   qvf::ComEndPoint local_com_end_point(0, SERVER_IP, PORT_NUMBER);
58 |   qvf::ComEndPoint
remote_com_end_point(1, SERVER_IP, PORT_NUMBER); 59 | std::vector com_endpoints{local_com_end_point, 60 | remote_com_end_point}; 61 | qvf::DistTensorClient dist_tensor_client(0, com_endpoints, pipe_param); 62 | std::vector shape{SAMPLE_NUM, FEATURE_DIM}; 63 | 64 | torch::Tensor registered_tensor = 65 | dist_tensor_client.create_registered_float32_tensor(shape); 66 | 67 | std::vector local_offsets(SAMPLE_NUM); 68 | std::vector remote_offsets(SAMPLE_NUM); 69 | 70 | for (int index = 0; index < SAMPLE_NUM; index++) { 71 | local_offsets[index] = index * FEATURE_DIM * FEATURE_TYPE_SIZE; 72 | remote_offsets[index] = 73 | rand() % NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE; 74 | // remote_offsets[index] = FEATURE_DIM * FEATURE_TYPE_SIZE; 75 | } 76 | 77 | for (int index = 0; index < min(1, SAMPLE_NUM); index++) { 78 | std::cout << "Collect Node " 79 | << remote_offsets[index] / (FEATURE_DIM * FEATURE_TYPE_SIZE) 80 | << ": " << local_offsets[index] << "<-" << remote_offsets[index] 81 | << std::endl; 82 | } 83 | std::cout << std::endl; 84 | 85 | auto tensor_option = torch::TensorOptions().dtype(torch::kInt64); 86 | torch::Tensor local_offsets_tensor = 87 | torch::from_blob(&local_offsets[0], {SAMPLE_NUM}, tensor_option); 88 | torch::Tensor remote_offsets_tensor = 89 | torch::from_blob(&remote_offsets[0], {SAMPLE_NUM}, tensor_option); 90 | 91 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets_tensor, 92 | remote_offsets_tensor); 93 | // print_tensor_res(registered_tensor); 94 | check_tensor_res(registered_tensor, remote_offsets_tensor); 95 | } 96 | -------------------------------------------------------------------------------- /tests/cpp/test_DistTensorServer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define PORT_NUMBER 3344 5 | #define SERVER_IP "155.198.152.17" 6 | 7 | #define NODE_COUNT 120000LL 8 | #define FEATURE_DIM 256LL 9 | #define FEATURE_TYPE_SIZE 4LL 10 | #define TEST_COUNT 8192LL 11 | #define ITER_NUM 10LL 12 | #define POST_LIST_SIZE 16LL 13 | #define CQ_MOD 16LL 14 | #define QP_NUM 2LL 15 | #define TX_DEPTH 2048LL 16 | #define CTX_POLL_BATCH 16LL 17 | 18 | float* allocate_float_feature(bool set_value) { 19 | float* buffer = (float*)malloc(NODE_COUNT * FEATURE_DIM * sizeof(float)); 20 | float index = 0; 21 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 22 | for (int dim = 0; dim < FEATURE_DIM; dim++) { 23 | if (set_value) 24 | buffer[start * FEATURE_DIM + dim] = index; 25 | else 26 | buffer[start * FEATURE_DIM + dim] = 0; 27 | } 28 | index += 1; 29 | } 30 | return buffer; 31 | } 32 | 33 | void test_dist_tensor_server(int argc, char** argv) { 34 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH, 35 | POST_LIST_SIZE); 36 | qvf::DistTensorServer dist_tensor_server(PORT_NUMBER, 1, 1); 37 | float* server_data_buffer = allocate_float_feature(true); 38 | dist_tensor_server.serve(server_data_buffer, 39 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 40 | } 41 | -------------------------------------------------------------------------------- /tests/cpp/test_Pipe.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include 23 | 
#include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #define PORT_NUMBER 3344 35 | #define SERVER_IP "155.198.152.17" 36 | 37 | #define NODE_COUNT 120000LL 38 | #define FEATURE_DIM 256LL 39 | #define FEATURE_TYPE_SIZE 4LL 40 | #define TEST_COUNT 8192LL 41 | #define ITER_NUM 10LL 42 | #define POST_LIST_SIZE 16LL 43 | #define CQ_MOD 16LL 44 | #define QP_NUM 2LL 45 | #define TX_DEPTH 2048LL 46 | #define CTX_POLL_BATCH 16LL 47 | 48 | int min(int a, int b) { 49 | if (a < b) { 50 | return a; 51 | } 52 | return b; 53 | } 54 | 55 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 56 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 57 | (start.tv_sec * 1000000L + start.tv_usec); 58 | } 59 | 60 | float* allocate_float_feature(bool set_value); 61 | 62 | bool mem_check(float* data_buffer) { 63 | float index = 0; 64 | bool have_valid_data = false; 65 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 66 | for (int dim = 0; dim < FEATURE_DIM; dim++) { 67 | if (data_buffer[start * FEATURE_DIM + dim] != 0) { 68 | have_valid_data = true; 69 | } 70 | } 71 | } 72 | QUIVER_FEATURE_ASSERT(have_valid_data == true, "No valid data is copied") 73 | 74 | for (u_int64_t start = 0; start < NODE_COUNT; start += 1) { 75 | float expected_value = 76 | (data_buffer[start * FEATURE_DIM] == 0) ? 0 : float(start); 77 | std::cout << data_buffer[start * FEATURE_DIM] << " "; 78 | for (u_int64_t dim = 0; dim < FEATURE_DIM; dim++) { 79 | QUIVER_FEATURE_ASSERT( 80 | data_buffer[start * FEATURE_DIM + dim] == expected_value, 81 | "Result Check Failed At (%lld, %lld)!, Expected %f, Got %f\n", start, 82 | dim, expected_value, data_buffer[start * FEATURE_DIM + dim]); 83 | } 84 | } 85 | return true; 86 | } 87 | 88 | void test_pipe(int argc, char** argv) { 89 | bool random = true; 90 | bool sort_index = false; 91 | 92 | while (argc > 1) { 93 | if (argv[1][0] == '-') { 94 | switch (argv[1][1]) { 95 | case 'l': { 96 | random = false; 97 | break; 98 | } 99 | case 't': { 100 | sort_index = true; 101 | break; 102 | } 103 | } 104 | } 105 | ++argv; 106 | --argc; 107 | } 108 | if (random) { 109 | printf("Test Random Data Access \n"); 110 | } else { 111 | printf("Test Sequential Data Access \n"); 112 | } 113 | if (sort_index) { 114 | printf("Test Data Access With TLB Optimization\n"); 115 | } 116 | 117 | std::vector qps; 118 | infinity::core::Context* context = new infinity::core::Context(); 119 | infinity::queues::QueuePairFactory* qpFactory = 120 | new infinity::queues::QueuePairFactory(context); 121 | 122 | qps.resize(QP_NUM); 123 | qvf::ComEndPoint endpoint(0, SERVER_IP, PORT_NUMBER); 124 | qvf::PipeParam pipe_param(QP_NUM, CTX_POLL_BATCH, TX_DEPTH, 125 | POST_LIST_SIZE); 126 | qvf::Pipe quiver_pipe(context, qpFactory, endpoint, pipe_param); 127 | quiver_pipe.connect(); 128 | 129 | printf("Creating buffers\n"); 130 | std::vector buffers; 131 | float* client_data_buffer = allocate_float_feature(false); 132 | infinity::memory::Buffer* buffer1Sided = new infinity::memory::Buffer( 133 | context, client_data_buffer, 134 | NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 135 | infinity::memory::Buffer* buffer2Sided = 136 | new infinity::memory::Buffer(context, 128 * sizeof(char)); 137 | 138 | printf("Reading content from remote buffer\n"); 139 | infinity::requests::RequestToken requestToken(context); 140 | 141 | printf("Start Real Test \n"); 142 | // auto start = std::chrono::system_clock::now(); 143 | struct timeval start, stop; 144 | 
uint64_t time_consumed = 0; 145 | std::vector local_offsets(TEST_COUNT * POST_LIST_SIZE); 146 | std::vector remote_offsets(TEST_COUNT * POST_LIST_SIZE); 147 | if (sort_index) { 148 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) { 149 | std::vector all_request_nodes(TEST_COUNT * POST_LIST_SIZE); 150 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) { 151 | all_request_nodes[i] = rand() % NODE_COUNT; 152 | } 153 | std::sort(all_request_nodes.begin(), all_request_nodes.end()); 154 | for (int i = 0; i < TEST_COUNT * POST_LIST_SIZE; i++) { 155 | uint64_t remote_node_offset = 156 | all_request_nodes[i] * FEATURE_DIM * FEATURE_TYPE_SIZE; 157 | local_offsets[i] = remote_node_offset; 158 | remote_offsets[i] = remote_node_offset; 159 | } 160 | gettimeofday(&start, NULL); 161 | 162 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets, 163 | FEATURE_DIM * FEATURE_TYPE_SIZE); 164 | gettimeofday(&stop, NULL); 165 | time_consumed += timeDiff(stop, start); 166 | } 167 | } else { 168 | for (int iter_index = 0; iter_index < ITER_NUM; iter_index++) { 169 | for (int k = 0; k < TEST_COUNT * POST_LIST_SIZE; k++) { 170 | int request_node = k % NODE_COUNT; 171 | if (random) { 172 | request_node = rand() % NODE_COUNT; 173 | } 174 | uint64_t remote_node_offset = 175 | request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 176 | local_offsets[k] = remote_node_offset; 177 | remote_offsets[k] = remote_node_offset; 178 | } 179 | gettimeofday(&start, NULL); 180 | quiver_pipe.read(buffer1Sided, local_offsets, remote_offsets, 181 | FEATURE_DIM * FEATURE_TYPE_SIZE); 182 | gettimeofday(&stop, NULL); 183 | time_consumed += timeDiff(stop, start); 184 | } 185 | } 186 | 187 | printf("Avg Bandwidth is %f MB/s\n", 188 | (POST_LIST_SIZE * TEST_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE * 189 | ITER_NUM / (1024.0 * 1024.0)) / 190 | (((double)time_consumed) / 1000000L)); 191 | 192 | printf("Memory checking..., Please wait...\n"); 193 | if (!mem_check(client_data_buffer)) { 194 | fprintf(stderr, "Memory Check Failed, Benchmark Failed!\n"); 195 | } else { 196 | printf("Memory check success! 
Congrats!\n"); 197 | } 198 | 199 | delete buffer1Sided; 200 | delete buffer2Sided; 201 | 202 | for (int index = 0; index < QP_NUM; index++) { 203 | delete qps[index]; 204 | } 205 | delete qpFactory; 206 | delete context; 207 | } 208 | -------------------------------------------------------------------------------- /tests/cpp/test_main.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Usage: ./progam -s for server and ./program for client component 3 | 4 | #include 5 | #include 6 | void test_pipe(int argc, char** argv); 7 | void test_dist_tensor_server(int argc, char** argv); 8 | void test_dist_tensor_client(int argc, char** argv); 9 | int main(int argc, char** argv) { 10 | int test_case = 0; 11 | switch (argv[1][0]) { 12 | case '0': { 13 | test_case = 0; 14 | break; 15 | } 16 | case '1': { 17 | test_case = 1; 18 | break; 19 | } 20 | case '2': { 21 | test_case = 2; 22 | break; 23 | } 24 | } 25 | 26 | ++argv; 27 | --argc; 28 | 29 | if (test_case == 0) { 30 | printf("Testing Pipe ...\n"); 31 | test_pipe(argc, argv); 32 | } else if (test_case == 1) { 33 | printf("Testing DistTensorClient ...\n"); 34 | test_dist_tensor_client(argc, argv); 35 | } else if (test_case == 2) { 36 | printf("Testing DistTensorServer ...\n"); 37 | test_dist_tensor_server(argc, argv); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /tests/infinity/read-write-send.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #define PORT_NUMBER 8011 22 | #define SERVER_IP "127.0.0.1" 23 | 24 | // Usage: ./progam -s for server and ./program for client component 25 | int main(int argc, char **argv) { 26 | 27 | bool isServer = false; 28 | 29 | while (argc > 1) { 30 | if (argv[1][0] == '-') { 31 | switch (argv[1][1]) { 32 | 33 | case 's': { 34 | isServer = true; 35 | break; 36 | } 37 | 38 | } 39 | } 40 | ++argv; 41 | --argc; 42 | } 43 | 44 | infinity::core::Context *context = new infinity::core::Context(); 45 | infinity::queues::QueuePairFactory *qpFactory = new infinity::queues::QueuePairFactory(context); 46 | infinity::queues::QueuePair *qp; 47 | 48 | if(isServer) { 49 | 50 | printf("Creating buffers to read from and write to\n"); 51 | infinity::memory::Buffer *bufferToReadWrite = new infinity::memory::Buffer(context, 128 * sizeof(char)); 52 | infinity::memory::RegionToken *bufferToken = bufferToReadWrite->createRegionToken(); 53 | 54 | printf("Creating buffers to receive a message\n"); 55 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 56 | context->postReceiveBuffer(bufferToReceive); 57 | 58 | printf("Setting up connection (blocking)\n"); 59 | qpFactory->bindToPort(PORT_NUMBER); 60 | qp = qpFactory->acceptIncomingConnection(bufferToken, sizeof(infinity::memory::RegionToken)); 61 | 62 | printf("Waiting for message (blocking)\n"); 63 | infinity::core::receive_element_t receiveElement; 64 | while(!context->receive(&receiveElement)); 65 | 66 | printf("Message received\n"); 67 | delete bufferToReadWrite; 68 | delete bufferToReceive; 69 | 70 | } else { 71 | 72 | printf("Connecting to remote node\n"); 73 | qp = 
qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 74 | infinity::memory::RegionToken *remoteBufferToken = (infinity::memory::RegionToken *) qp->getUserData(); 75 | 76 | 77 | printf("Creating buffers\n"); 78 | infinity::memory::Buffer *buffer1Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 79 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 80 | 81 | printf("Reading content from remote buffer\n"); 82 | infinity::requests::RequestToken requestToken(context); 83 | qp->read(buffer1Sided, remoteBufferToken, &requestToken); 84 | requestToken.waitUntilCompleted(); 85 | 86 | printf("Writing content to remote buffer\n"); 87 | qp->write(buffer1Sided, remoteBufferToken, &requestToken); 88 | requestToken.waitUntilCompleted(); 89 | 90 | printf("Sending message to remote host\n"); 91 | qp->send(buffer2Sided, &requestToken); 92 | requestToken.waitUntilCompleted(); 93 | 94 | delete buffer1Sided; 95 | delete buffer2Sided; 96 | 97 | } 98 | 99 | delete qp; 100 | delete qpFactory; 101 | delete context; 102 | 103 | return 0; 104 | 105 | } 106 | -------------------------------------------------------------------------------- /tests/infinity/send-performance.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Send Performance 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define PORT_NUMBER 8011 24 | #define SERVER_IP "192.0.0.1" 25 | #define BUFFER_COUNT 128 26 | #define MAX_BUFFER_SIZE 4096 27 | #define OPERATIONS_COUNT 1024 28 | 29 | uint64_t timeDiff(struct timeval stop, struct timeval start); 30 | 31 | // Usage: ./progam -s for server and ./program for client component 32 | int main(int argc, char **argv) { 33 | 34 | bool isServer = false; 35 | 36 | while (argc > 1) { 37 | if (argv[1][0] == '-') { 38 | switch (argv[1][1]) { 39 | 40 | case 's': { 41 | isServer = true; 42 | break; 43 | } 44 | 45 | } 46 | } 47 | ++argv; 48 | --argc; 49 | } 50 | 51 | infinity::core::Context *context = new infinity::core::Context(); 52 | infinity::queues::QueuePairFactory *qpFactory = new infinity::queues::QueuePairFactory(context); 53 | infinity::queues::QueuePair *qp; 54 | 55 | if (isServer) { 56 | 57 | printf("Creating buffers to receive a messages\n"); 58 | infinity::memory::Buffer **receiveBuffers = new infinity::memory::Buffer *[BUFFER_COUNT]; 59 | for (uint32_t i = 0; i < BUFFER_COUNT; ++i) { 60 | receiveBuffers[i] = new infinity::memory::Buffer(context, MAX_BUFFER_SIZE * sizeof(char)); 61 | context->postReceiveBuffer(receiveBuffers[i]); 62 | } 63 | 64 | printf("Waiting for incoming connection\n"); 65 | qpFactory->bindToPort(PORT_NUMBER); 66 | qp = qpFactory->acceptIncomingConnection(); 67 | 68 | printf("Waiting for first message (first message has additional setup costs)\n"); 69 | infinity::core::receive_element_t receiveElement; 70 | while (!context->receive(&receiveElement)); 71 | context->postReceiveBuffer(receiveElement.buffer); 72 | 73 | printf("Performing measurement\n"); 74 | 75 | uint32_t messageSize = 1; 76 | uint32_t rounds = (uint32_t) log2(MAX_BUFFER_SIZE); 77 | 78 | for(uint32_t sizeIndex = 0; sizeIndex <= rounds; ++sizeIndex) { 79 | 80 | printf("Receiving messages of size %d bytes\n", messageSize); 81 | fflush(stdout); 82 | 83 | uint32_t 
numberOfReceivedMessages = 0; 84 | while (numberOfReceivedMessages < OPERATIONS_COUNT) { 85 | while (!context->receive(&receiveElement)); 86 | ++numberOfReceivedMessages; 87 | context->postReceiveBuffer(receiveElement.buffer); 88 | } 89 | 90 | messageSize *= 2; 91 | } 92 | 93 | printf("All messages received\n"); 94 | 95 | printf("Sending notification to client\n"); 96 | infinity::memory::Buffer *sendBuffer = new infinity::memory::Buffer(context, sizeof(char)); 97 | qp->send(sendBuffer, context->defaultRequestToken); 98 | context->defaultRequestToken->waitUntilCompleted(); 99 | 100 | printf("Clean up\n"); 101 | for (uint32_t i = 0; i < BUFFER_COUNT; ++i) { 102 | delete receiveBuffers[i]; 103 | } 104 | delete receiveBuffers; 105 | delete sendBuffer; 106 | 107 | } else { 108 | 109 | printf("Connecting to remote node\n"); 110 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 111 | 112 | printf("Creating buffers\n"); 113 | infinity::memory::Buffer *sendBuffer = new infinity::memory::Buffer(context, MAX_BUFFER_SIZE * sizeof(char)); 114 | infinity::memory::Buffer *receiveBuffer = new infinity::memory::Buffer(context, sizeof(char)); 115 | context->postReceiveBuffer(receiveBuffer); 116 | 117 | printf("Sending first message\n"); 118 | qp->send(sendBuffer, sizeof(char), context->defaultRequestToken); 119 | context->defaultRequestToken->waitUntilCompleted(); 120 | 121 | printf("Performing measurement\n"); 122 | uint32_t rounds = (uint32_t) log2(MAX_BUFFER_SIZE); 123 | uint32_t messageSize = 1; 124 | 125 | for(uint32_t sizeIndex = 0; sizeIndex <= rounds; ++sizeIndex) { 126 | 127 | printf("Sending messages of size %d bytes\t", messageSize); 128 | fflush(stdout); 129 | 130 | struct timeval start; 131 | gettimeofday(&start, NULL); 132 | 133 | for(uint32_t i=0; isend(sendBuffer, messageSize, &requestToken); 138 | requestToken.waitUntilCompleted(); 139 | 140 | } else { 141 | 142 | qp->send(sendBuffer, messageSize, NULL); 143 | 144 | } 145 | } 146 | 147 | struct timeval stop; 148 | gettimeofday(&stop, NULL); 149 | 150 | uint64_t time = timeDiff(stop, start); 151 | double msgRate = ((double)(OPERATIONS_COUNT * 1000000L)) / time; 152 | double bandwidth = ((double) (OPERATIONS_COUNT * messageSize)) / (1024*1024) / (((double) time) / 1000000L); 153 | printf("%.3f msg/sec\t%.3f MB/sec\n", msgRate, bandwidth); 154 | fflush(stdout); 155 | 156 | messageSize *= 2; 157 | 158 | } 159 | 160 | printf("Waiting for notification from server\n"); 161 | infinity::core::receive_element_t receiveElement; 162 | while (!context->receive(&receiveElement)); 163 | 164 | delete receiveBuffer; 165 | delete sendBuffer; 166 | } 167 | 168 | delete qp; 169 | delete qpFactory; 170 | delete context; 171 | 172 | return 0; 173 | 174 | } 175 | 176 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 177 | return (stop.tv_sec * 1000000L + stop.tv_usec) - (start.tv_sec * 1000000L + start.tv_usec); 178 | } 179 | -------------------------------------------------------------------------------- /tests/infinity/test_multiread.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #define PORT_NUMBER 3344 29 
| #define SERVER_IP "155.198.152.17" 30 | 31 | #define NODE_COUNT 1000000 32 | #define FEATURE_DIM 128 33 | #define FEATURE_TYPE_SIZE 4 34 | #define TEST_COUNT 350000 35 | #define MAX_OUTSTANDING_REQ 1 36 | #define POST_LIST_SIZE 20 37 | #define CQ_MOD 25 38 | 39 | int min(int a, int b){ 40 | if(a < b){ 41 | return a; 42 | } 43 | return b; 44 | } 45 | 46 | 47 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 48 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 49 | (start.tv_sec * 1000000L + start.tv_usec); 50 | } 51 | 52 | // Usage: ./progam -s for server and ./program for client component 53 | int main(int argc, char **argv) { 54 | 55 | bool isServer = false; 56 | bool random = true; 57 | 58 | while (argc > 1) { 59 | if (argv[1][0] == '-') { 60 | switch (argv[1][1]) { 61 | case 's': { 62 | isServer = true; 63 | break; 64 | } 65 | case 'l': { 66 | random = false; 67 | break; 68 | } 69 | } 70 | } 71 | ++argv; 72 | --argc; 73 | } 74 | if(random){ 75 | printf("Test Random Data Access \n"); 76 | }else{ 77 | printf("Test Sequential Data Access \n"); 78 | } 79 | 80 | infinity::core::Context *context = new infinity::core::Context(); 81 | infinity::queues::QueuePairFactory *qpFactory = 82 | new infinity::queues::QueuePairFactory(context); 83 | infinity::queues::QueuePair *qp; 84 | 85 | if (isServer) { 86 | 87 | printf("Creating buffers to read from and write to\n"); 88 | std::cout << "Server Buffer Size " << NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 89 | infinity::memory::Buffer *bufferToReadWrite = 90 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 91 | infinity::memory::RegionToken *bufferToken = 92 | bufferToReadWrite->createRegionToken(); 93 | 94 | printf("Creating buffers to receive a message\n"); 95 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 96 | context->postReceiveBuffer(bufferToReceive); 97 | 98 | 99 | printf("Setting up connection (blocking)\n"); 100 | qpFactory->bindToPort(PORT_NUMBER); 101 | qp = qpFactory->acceptIncomingConnection( 102 | bufferToken, sizeof(infinity::memory::RegionToken)); 103 | 104 | printf("Waiting for message (blocking)\n"); 105 | infinity::core::receive_element_t receiveElement; 106 | while (!context->receive(&receiveElement)) 107 | ; 108 | 109 | printf("Message received\n"); 110 | delete bufferToReadWrite; 111 | delete bufferToReceive; 112 | 113 | } else { 114 | 115 | std::vector local_offsets(POST_LIST_SIZE, 0); 116 | std::vector remote_offsets(POST_LIST_SIZE, 0); 117 | int start_request = 0; 118 | int end_request = 0; 119 | infinity::queues::SendRequestBuffer send_buffer(POST_LIST_SIZE); 120 | 121 | printf("Connecting to remote node\n"); 122 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 123 | infinity::memory::RegionToken *remoteBufferToken = 124 | (infinity::memory::RegionToken *)qp->getUserData(); 125 | 126 | printf("Creating buffers\n"); 127 | std::vector buffers; 128 | infinity::memory::Buffer *buffer1Sided = 129 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 130 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 131 | 132 | 133 | printf("Reading content from remote buffer\n"); 134 | infinity::requests::RequestToken requestToken(context); 135 | 136 | // warm up 137 | 138 | printf("Warm up\n"); 139 | for (int k = 0; k < 10; k++) { 140 | int request_node = rand() % NODE_COUNT; 141 | uint64_t offset = request_node * 
FEATURE_DIM * FEATURE_TYPE_SIZE; 142 | //std::cout << "Getting Data From " << offset << " To " << offset + FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 143 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 144 | infinity::queues::OperationFlags(), &requestToken); 145 | requestToken.waitUntilCompleted(); 146 | } 147 | 148 | printf("Start Real Test \n"); 149 | auto start = std::chrono::system_clock::now(); 150 | int avaliable = MAX_OUTSTANDING_REQ; 151 | for (int k = 0; k < TEST_COUNT; k++) { 152 | for(int multi_read_index = 0; multi_read_index < POST_LIST_SIZE; multi_read_index ++){ 153 | int request_node = (k + multi_read_index) % NODE_COUNT; 154 | if(random){ 155 | request_node = rand() % NODE_COUNT; 156 | } 157 | uint64_t remote_node_offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 158 | local_offsets[multi_read_index] = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 159 | remote_offsets[multi_read_index] = remote_node_offset; 160 | } 161 | 162 | 163 | if(k % CQ_MOD == CQ_MOD -1){ 164 | qp->multiRead(buffer1Sided, local_offsets, remoteBufferToken, remote_offsets, FEATURE_DIM * FEATURE_TYPE_SIZE, 165 | infinity::queues::OperationFlags(), &requestToken, send_buffer); 166 | requestToken.waitUntilCompleted(); 167 | }else{ 168 | qp->multiRead(buffer1Sided, local_offsets, remoteBufferToken, remote_offsets, FEATURE_DIM * FEATURE_TYPE_SIZE, 169 | infinity::queues::OperationFlags(), nullptr, send_buffer); 170 | 171 | } 172 | } 173 | 174 | auto end = std::chrono::system_clock::now(); 175 | std::chrono::duration diff = end - start; 176 | printf("Avg Bandwidth is %f MB/s\n", (POST_LIST_SIZE * TEST_COUNT * FEATURE_DIM/ (1024.0 * 1024.0 ) ) * FEATURE_TYPE_SIZE / diff.count() ); 177 | 178 | printf("Sending message to remote host\n"); 179 | qp->send(buffer2Sided, &requestToken); 180 | requestToken.waitUntilCompleted(); 181 | 182 | delete buffer1Sided; 183 | delete buffer2Sided; 184 | } 185 | 186 | delete qp; 187 | delete qpFactory; 188 | delete context; 189 | 190 | return 0; 191 | } -------------------------------------------------------------------------------- /tests/infinity/test_read.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Examples - Read/Write/Send Operations 3 | * 4 | * (c) 2018 Claude Barthels, ETH Zurich 5 | * Contact: claudeb@inf.ethz.ch 6 | * 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #define PORT_NUMBER 3344 29 | #define SERVER_IP "155.198.152.17" 30 | 31 | #define NODE_COUNT 1 32 | #define FEATURE_DIM 512 33 | #define FEATURE_TYPE_SIZE 4 34 | #define TEST_COUNT 10 35 | #define MAX_OUTSTANDING_REQ 1 36 | 37 | 38 | 39 | uint64_t timeDiff(struct timeval stop, struct timeval start) { 40 | return (stop.tv_sec * 1000000L + stop.tv_usec) - 41 | (start.tv_sec * 1000000L + start.tv_usec); 42 | } 43 | 44 | // Usage: ./progam -s for server and ./program for client component 45 | int main(int argc, char **argv) { 46 | 47 | bool isServer = false; 48 | bool random = false; 49 | 50 | while (argc > 1) { 51 | if (argv[1][0] == '-') { 52 | switch (argv[1][1]) { 53 | case 's': { 54 | isServer = true; 55 | break; 56 | } 57 | case 'r': { 58 | random = true; 59 | break; 60 | } 61 | } 62 | } 63 | ++argv; 64 | --argc; 65 | } 66 | 67 | infinity::core::Context *context = new infinity::core::Context(); 68 
| infinity::queues::QueuePairFactory *qpFactory = 69 | new infinity::queues::QueuePairFactory(context); 70 | infinity::queues::QueuePair *qp; 71 | 72 | if (isServer) { 73 | 74 | printf("Creating buffers to read from and write to\n"); 75 | std::cout << "Server Buffer Size " << NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 76 | infinity::memory::Buffer *bufferToReadWrite = 77 | new infinity::memory::Buffer(context, NODE_COUNT * FEATURE_DIM * FEATURE_TYPE_SIZE); 78 | infinity::memory::RegionToken *bufferToken = 79 | bufferToReadWrite->createRegionToken(); 80 | 81 | printf("Creating buffers to receive a message\n"); 82 | infinity::memory::Buffer *bufferToReceive = new infinity::memory::Buffer(context, 128 * sizeof(char)); 83 | context->postReceiveBuffer(bufferToReceive); 84 | 85 | 86 | printf("Setting up connection (blocking)\n"); 87 | qpFactory->bindToPort(PORT_NUMBER); 88 | qp = qpFactory->acceptIncomingConnection( 89 | bufferToken, sizeof(infinity::memory::RegionToken)); 90 | 91 | printf("Waiting for message (blocking)\n"); 92 | infinity::core::receive_element_t receiveElement; 93 | while (!context->receive(&receiveElement)) 94 | ; 95 | 96 | printf("Message received\n"); 97 | delete bufferToReadWrite; 98 | delete bufferToReceive; 99 | 100 | } else { 101 | 102 | printf("Connecting to remote node\n"); 103 | qp = qpFactory->connectToRemoteHost(SERVER_IP, PORT_NUMBER); 104 | infinity::memory::RegionToken *remoteBufferToken = 105 | (infinity::memory::RegionToken *)qp->getUserData(); 106 | 107 | printf("Creating buffers\n"); 108 | std::vector buffers; 109 | infinity::memory::Buffer *buffer1Sided = 110 | new infinity::memory::Buffer(context, FEATURE_DIM * FEATURE_TYPE_SIZE); 111 | infinity::memory::Buffer *buffer2Sided = new infinity::memory::Buffer(context, 128 * sizeof(char)); 112 | 113 | 114 | printf("Reading content from remote buffer\n"); 115 | std::vector requests; 116 | for (int i = 0; i < 1000; i++) { 117 | requests.push_back(new infinity::requests::RequestToken(context)); 118 | } 119 | 120 | // warm up 121 | 122 | printf("A little Warmup \n"); 123 | for (int k = 0; k < 10; k++) { 124 | int request_node = rand() % NODE_COUNT; 125 | uint64_t offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 126 | //std::cout << "Getting Data From " << offset << " To " << offset + FEATURE_DIM * FEATURE_TYPE_SIZE << std::endl; 127 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 128 | infinity::queues::OperationFlags(), requests[k]); 129 | requests[k]->waitUntilCompleted(); 130 | } 131 | 132 | printf("Start Real Test \n"); 133 | auto start = std::chrono::system_clock::now(); 134 | int avaliable = MAX_OUTSTANDING_REQ; 135 | for (int k = 0; k < TEST_COUNT; k++) { 136 | int request_node = k; 137 | if(random){ 138 | request_node = rand() % NODE_COUNT; 139 | } 140 | 141 | uint64_t offset = request_node * FEATURE_DIM * FEATURE_TYPE_SIZE; 142 | qp->read(buffer1Sided, 0, remoteBufferToken, offset, FEATURE_DIM * FEATURE_TYPE_SIZE, 143 | infinity::queues::OperationFlags(), requests[k % 1000]); 144 | avaliable -= 1; 145 | if(avaliable == 0){ 146 | requests[k % MAX_OUTSTANDING_REQ]->waitUntilCompleted(); 147 | avaliable += 1; 148 | } 149 | } 150 | 151 | // make sure all finished 152 | for (int k = 0; k < MAX_OUTSTANDING_REQ; k++) { 153 | requests[k % MAX_OUTSTANDING_REQ]->waitUntilCompleted(); 154 | } 155 | 156 | 157 | auto end = std::chrono::system_clock::now(); 158 | std::chrono::duration diff = end - start; 159 | printf("Avg Bandwidth is %f MB/s\n", TEST_COUNT 
* FEATURE_DIM * FEATURE_TYPE_SIZE / (1024.0 * 1024.0 ) / diff.count() ); 160 | 161 | printf("Sending message to remote host\n"); 162 | qp->send(buffer2Sided, requests[0]); 163 | requests[0]->waitUntilCompleted(); 164 | 165 | delete buffer1Sided; 166 | delete buffer2Sided; 167 | } 168 | 169 | delete qp; 170 | delete qpFactory; 171 | delete context; 172 | 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /tests/python/config.py: -------------------------------------------------------------------------------- 1 | PORT_NUMBER = 3344 2 | MASTER_IP = "155.198.152.17"#"127.0.0.1" 3 | HLPER_PORT = 5678 4 | NODE_COUNT = 1200000 5 | FEATURE_DIM = 128 6 | FEATURE_TYPE_SIZE = 4 7 | SAMPLE_NUM = 80000 8 | ITER_NUM = 10 9 | POST_LIST_SIZE = 128 10 | QP_NUM = 8 11 | TX_DEPTH = 2048 12 | CTX_POLL_BATCH = TX_DEPTH // POST_LIST_SIZE 13 | TEST_TLB_OPTIMIZATION = True 14 | 15 | # For Reddit Training 16 | SAMPLE_PARAM = [25, 10] 17 | BATCH_SIZE = 256 18 | -------------------------------------------------------------------------------- /tests/python/preprocess_Dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import quiver 3 | from torch_geometric.datasets import Reddit 4 | import os.path as osp 5 | 6 | def reindex_with_random(adj_csr, graph_feature=None, hot_ratio=0): 7 | 8 | node_count = adj_csr.indptr.shape[0] - 1 9 | total_range = torch.arange(node_count, dtype=torch.long) 10 | cold_ratio = 1 - hot_ratio 11 | cold_part = int(node_count * cold_ratio) 12 | hot_part = node_count - cold_part 13 | perm_range = torch.randperm(cold_part) + hot_part 14 | # sort and shuffle 15 | degree = adj_csr.indptr[1:] - adj_csr.indptr[:-1] 16 | _, prev_order = torch.sort(degree, descending=True) 17 | new_order = torch.zeros_like(prev_order) 18 | prev_order[hot_part:] = prev_order[perm_range] 19 | new_order[prev_order] = total_range 20 | if graph_feature is not None: 21 | graph_feature = graph_feature[prev_order] 22 | 23 | return graph_feature, new_order 24 | 25 | def reindex_with_certain(adj_csr, graph_feature=None, hot_ratio=0): 26 | node_count = adj_csr.indptr.shape[0] - 1 27 | total_range = torch.arange(node_count, dtype=torch.long) 28 | print("node count", node_count) 29 | cold_ratio = 1 - hot_ratio 30 | cold_part = int(node_count * cold_ratio) 31 | hot_part = node_count - cold_part 32 | 33 | # sort 34 | degree = adj_csr.indptr[1:] - adj_csr.indptr[:-1] 35 | _, prev_order = torch.sort(degree, descending=True) 36 | hot_part_order = prev_order[:hot_part] 37 | 38 | total_range_set = set(total_range.tolist()) 39 | hot_part_set = set(hot_part_order.tolist()) 40 | cold_part_set = total_range_set - hot_part_set 41 | 42 | cold_part_order = torch.LongTensor(list(cold_part_set)) 43 | new_order = torch.zeros_like(prev_order) 44 | print(hot_part_order.shape, cold_part_order.shape, total_range.shape) 45 | 46 | 47 | 48 | new_order[torch.cat([hot_part_order, cold_part_order])] = total_range 49 | 50 | new_feature = torch.cat((graph_feature[hot_part_order], graph_feature[cold_part_order])) 51 | 52 | return new_feature, new_order 53 | 54 | 55 | 56 | 57 | 58 | def load_topo_paper100M(): 59 | indptr = torch.load("/data/papers/ogbn_papers100M/csr/indptr.pt") 60 | indices = torch.load("/data/papers/ogbn_papers100M/csr/indices.pt") 61 | train_idx = torch.load("/data/papers/ogbn_papers100M/index/train_idx.pt") 62 | csr_topo = quiver.CSRTopo(indptr=indptr, indices=indices) 63 | quiver_sampler = 
quiver.pyg.GraphSageSampler(csr_topo, [15, 10, 5], 0, mode="UVA") 64 | print(f"Graph Stats:\tNodes:{csr_topo.node_count}\tEdges:{csr_topo.edge_count}\tAvg_Deg:{csr_topo.edge_count / csr_topo.node_count}") 65 | return train_idx, csr_topo, quiver_sampler 66 | 67 | def load_feat_paper100M(): 68 | feat = torch.load("/data/papers/ogbn_papers100M/feat/feature.pt") 69 | print(f"Feature Stats:\tDim:{feat.shape[1]}") 70 | return feat 71 | 72 | def load_topo_mag240m(): 73 | pass 74 | 75 | def load_feat_mag240m(): 76 | pass 77 | 78 | 79 | def load_topo_reddit(): 80 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 81 | dataset = Reddit(path) 82 | data = dataset[0] 83 | csr_topo = quiver.CSRTopo(edge_index=data.edge_index) 84 | quiver_sampler = quiver.pyg.GraphSageSampler(csr_topo, [25, 10], 0, mode="UVA") 85 | train_idx = data.train_mask.nonzero(as_tuple=False).view(-1) 86 | return train_idx, csr_topo, quiver_sampler 87 | 88 | def load_feat_reddit(): 89 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 90 | dataset = Reddit(path) 91 | data = dataset[0] 92 | return data.x 93 | 94 | 95 | def preprocess_dataset(dataset="paper100m", cache_ratio = 0.0, method="certain"): 96 | if dataset == "paper100m": 97 | _, csr_topo, _ = load_topo_paper100M() 98 | 99 | feat = load_feat_paper100M() 100 | elif dataset == "mag240m": 101 | _, csr_topo, _ = load_topo_mag240m() 102 | feat = load_feat_mag240m() 103 | else: 104 | _, csr_topo, _ = load_topo_reddit() 105 | feat = load_feat_reddit() 106 | 107 | if method == "random": 108 | sorted_feature, sorted_order = reindex_with_random(csr_topo, feat, cache_ratio) 109 | else: 110 | sorted_feature, sorted_order = reindex_with_certain(csr_topo, feat, cache_ratio) 111 | 112 | 113 | torch.save(sorted_feature, f"/data/dalong/sorted_feature_{dataset}_{method}_{cache_ratio:.2f}.pt") 114 | torch.save(sorted_order, f"/data/dalong/sorted_order_{dataset}_{method}_{cache_ratio:.2f}.pt") 115 | 116 | 117 | def test_curve(dataset, method="certain", cache_ratio=0, partition_size=2): 118 | if dataset == "reddit": 119 | train_idx, csr_topo, quiver_sampler = load_topo_reddit() 120 | elif dataset == "paper100m": 121 | train_idx, csr_topo, quiver_sampler = load_topo_paper100M() 122 | 123 | 124 | sorted_order_path = f"/data/dalong/sorted_order_{dataset}_{method}_{cache_ratio:.2f}.pt" 125 | order_transform = None 126 | if sorted_order_path is not None: 127 | order_transform = torch.load(sorted_order_path) 128 | order_transform = order_transform.cuda() 129 | dataloader = torch.utils.data.DataLoader(train_idx, batch_size=256) 130 | hot_part = int(cache_ratio * csr_topo.node_count) 131 | cold_part = (csr_topo.node_count - int(cache_ratio * csr_topo.node_count)) // partition_size 132 | 133 | col_part_hit_count = 0 134 | hot_part_hit_count = 0 135 | total_count = 0 136 | for seeds in dataloader: 137 | n_id, _, _ = quiver_sampler.sample(seeds) 138 | n_id = n_id.cuda() 139 | feature_n_id = order_transform[n_id] 140 | col_part_hit_count += feature_n_id[torch.logical_and(feature_n_id > hot_part, feature_n_id < (hot_part + cold_part))].shape[0] 141 | hot_part_hit_count += feature_n_id[feature_n_id < hot_part].shape[0] 142 | total_count += feature_n_id.shape[0] 143 | print(f"Hot Part Hit Ratio:\t{hot_part_hit_count / total_count}\nCold Partition Hit Rate:\t{col_part_hit_count / total_count}") 144 | 145 | 146 | CACHE_RATIO = 0.0 147 | PARTITION_SIZE = 2 148 | METHOD = "random" 149 | DATASET = "paper100m" 150 | #preprocess_dataset(dataset=DATASET, 
cache_ratio=CACHE_RATIO, method=METHOD) 151 | test_curve(DATASET, cache_ratio = CACHE_RATIO, partition_size=PARTITION_SIZE, method=METHOD) 152 | -------------------------------------------------------------------------------- /tests/python/test_DGLUnifiedTensor.py: -------------------------------------------------------------------------------- 1 | import time 2 | import random 3 | import dgl 4 | import torch 5 | import numpy as np 6 | from texttable import Texttable 7 | 8 | NUM_ELEMENT = 400000000 9 | FEATURE_DIM = 128 10 | SAMPLE_SIZE = 80000 11 | LOOP_NUM = 10 12 | 13 | features = torch.empty((NUM_ELEMENT, FEATURE_DIM)) 14 | features = dgl.contrib.UnifiedTensor(features, device=torch.device('cuda')) 15 | 16 | results = np.empty([1, 3], dtype=int) 17 | for idx in range(LOOP_NUM): 18 | sample_idx = torch.randint(0, high=NUM_ELEMENT - 1, size=(SAMPLE_SIZE, )).to('cuda') 19 | 20 | torch.cuda.synchronize() 21 | start = time.time() 22 | 23 | data = features[sample_idx] 24 | 25 | torch.cuda.synchronize() 26 | end = time.time() 27 | consumed = end - start 28 | 29 | results = np.append(results, [[idx, NUM_ELEMENT * FEATURE_DIM * 4 / 1024 / 1024 / 1024, data.numel() * 4 / 1024 / 1024 / consumed]], axis=0) 30 | 31 | results = np.append(results, [np.mean(results[1:], axis=0)], axis=0) 32 | results = results.tolist() 33 | 34 | results[0] = ['', 'Tensor Size (GB)', 'Throughput (MB/s)'] 35 | results[-1][0] = 'Avg' 36 | 37 | table = Texttable() 38 | table.set_deco(Texttable.HEADER) 39 | table.set_cols_dtype(['a', 't', 't']) 40 | table.add_rows(results) 41 | print(table.draw()) -------------------------------------------------------------------------------- /tests/python/test_DistHelper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import time 3 | 4 | import os 5 | import quiver_feature 6 | from quiver_feature import Range 7 | from quiver_feature import DistHelper 8 | 9 | MASTER_ADDR = '155.198.152.17' 10 | MASTER_PORT = 5678 11 | 12 | MY_SERVER_RANK = 1 13 | SERVER_WORLD_SIZE = 2 14 | 15 | 16 | dist_helper = DistHelper(MASTER_ADDR, MASTER_PORT, SERVER_WORLD_SIZE, MY_SERVER_RANK) 17 | LOCAL_RANGE = Range(MY_SERVER_RANK * 100, MY_SERVER_RANK * 200) 18 | tensor_endpoints = dist_helper.exchange_tensor_endpoints_info(LOCAL_RANGE) 19 | 20 | print("Check TensorEndPoint", tensor_endpoints) 21 | time.sleep(MY_SERVER_RANK * 5 + 1) 22 | print(f"Rank {MY_SERVER_RANK} Finished, Begin To Sync") 23 | dist_helper.sync_all() 24 | print(f"Rank {MY_SERVER_RANK} Finished, Bye Bye") 25 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorClient.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | import config 5 | 6 | import time 7 | 8 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 9 | local_com_endpoint = qvf.ComEndPoint(0, config.MASTER_IP, config.PORT_NUMBER) 10 | remote_com_endpoint = qvf.ComEndPoint(1, config.MASTER_IP, config.PORT_NUMBER) 11 | dist_tensor_client = qvf.DistTensorClient(0, [local_com_endpoint, remote_com_endpoint], pipe_param) 12 | registered_tensor = dist_tensor_client.create_registered_float32_tensor([config.SAMPLE_NUM, config.FEATURE_DIM]) 13 | 14 | print("Before Collect, Check RegisteredTensor Shape ", registered_tensor.shape) 15 | local_idx = torch.arange(0, config.SAMPLE_NUM, dtype=torch.int64) 16 | remote_idx = torch.randint(0, 
config.NODE_COUNT, (config.SAMPLE_NUM, ), dtype=torch.int64) 17 | 18 | if config.TEST_TLB_OPTIMIZATION: 19 | print("Using TLB Optimization") 20 | remote_idx, _= torch.sort(remote_idx) 21 | 22 | local_offsets = local_idx * config.FEATURE_DIM * config.FEATURE_TYPE_SIZE 23 | remote_offsets = remote_idx * config.FEATURE_DIM * config.FEATURE_TYPE_SIZE 24 | 25 | # warm up 26 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets, remote_offsets) 27 | #registered_tensor[:] = 0 28 | 29 | start_time = time.time() 30 | dist_tensor_client.sync_read(1, registered_tensor, local_offsets, remote_offsets) 31 | consumed = time.time() - start_time 32 | 33 | print("Begin To Check Result...") 34 | registered_tensor = registered_tensor.to('cpu') 35 | for row in range(config.SAMPLE_NUM): 36 | if not all(registered_tensor[row] == remote_idx[row]): 37 | print(f"Result Check Failed At {row}, Expected {remote_idx[row]}, But got {registered_tensor[row]}, Local Offsets {local_offsets[row]}, Remote Offsets {remote_offsets[row]}") 38 | exit() 39 | print(f"Result Check Passed!, Throughput = {registered_tensor.numel() * 4 / 1024 / 1024 / consumed} MB/s") 40 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import time 5 | from typing import List 6 | import config 7 | from quiver_feature import TensorEndPoint, Range, DistTensorDeviceParam, DistTensorServerParam, PipeParam 8 | from quiver_feature import DistHelper 9 | from quiver_feature import DistTensorPGAS 10 | 11 | parser = argparse.ArgumentParser(description='') 12 | parser.add_argument('-rank', type=int, default=0, help='rank') 13 | parser.add_argument('-device', type=int, default=0, help="device idx") 14 | parser.add_argument('-world_size', type=int, default=1, help="world size") 15 | parser.add_argument('-start_server', type=int, default=1, help='whether to start server') 16 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 17 | 18 | args = parser.parse_args() 19 | 20 | NUM_ELEMENT = 1000000 21 | FEATURE_DIM = 600 22 | SAMPLE_SIZE = 80000 23 | 24 | DEVICE_RANK = args.device 25 | WORLD_SIZE = args.world_size 26 | START_SERVER = args.start_server 27 | CACHE_RATIO = args.cache_ratio 28 | LOCAL_SERVER_RANK = args.rank 29 | 30 | 31 | torch.cuda.set_device(DEVICE_RANK) 32 | 33 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * WORLD_SIZE)) 34 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * WORLD_SIZE - cached_range.end) // WORLD_SIZE 35 | 36 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 37 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 38 | 39 | tensor = torch.from_numpy(host_tensor).type(torch.float32).share_memory_() 40 | 41 | 42 | 43 | range_list = [] 44 | for idx in range(WORLD_SIZE): 45 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 46 | range_list.append(range_item) 47 | 48 | 49 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, WORLD_SIZE, LOCAL_SERVER_RANK) 50 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 51 | 52 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 53 | 54 | host_indice = np.random.randint(0, high= WORLD_SIZE * 
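NUM_ELEMENT - 1, size=(SAMPLE_SIZE, ))

# A sketch (ours, not part of this test) of how DistTensorPGAS resolves a global
# index under the layout built above: indices below cached_range.end fall in the
# hot region replicated on every server, and everything else belongs to the
# server whose Range covers it, shifted past the cached prefix in that server's
# local tensor. DistTensorPGAS performs this lookup internally; the helper below
# is only illustrative.
def _locate_sketch(global_idx, tensor_endpoints, cached):
    if global_idx < cached.end:
        # served from the locally replicated cache
        return LOCAL_SERVER_RANK, global_idx
    for tep in tensor_endpoints:
        if tep.range.start <= global_idx < tep.range.end:
            # local row = cached prefix + position inside this server's shard
            return tep.server_rank, cached.end + (global_idx - tep.range.start)
    raise IndexError(f"index {global_idx} outside every server range")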
55 | indices = torch.from_numpy(host_indice).type(torch.long) 56 | indices_device = indices.to(DEVICE_RANK) 57 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * WORLD_SIZE) 58 | 59 | device_param = DistTensorDeviceParam(device_list=[DEVICE_RANK], device_cache_size="8G", cache_policy="device_replicate") 60 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size= WORLD_SIZE) 61 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, tensor.shape[1]] 62 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 63 | 64 | dist_tensor = DistTensorPGAS(LOCAL_SERVER_RANK, tensor_endpoints_list, pipe_param, buffer_shape, cached_range) 65 | dist_tensor.from_cpu_tensor(tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 66 | 67 | 68 | start = time.time() 69 | data = dist_tensor[indices_device] 70 | consumed = time.time() - start 71 | 72 | data = data.cpu() 73 | data_gt = whole_tensor[indices] 74 | 75 | assert torch.equal(data, data_gt), "Result Check Failed!" 76 | 77 | print(f"Result Check Succeeded! Throughput = {data.numel() * 4 / 1024 / 1024 / consumed} MB/s") 78 | 79 | dist_helper.sync_all() 80 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorRPC.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed.rpc as rpc 3 | #from tmp import DistTensorRPC 4 | from tmp import DistTensorRPC 5 | from quiver_feature import Range 6 | 7 | import numpy as np 8 | from quiver.shard_tensor import ShardTensorConfig, ShardTensor 9 | import argparse 10 | import os 11 | import time 12 | 13 | 14 | 15 | """ 16 | 1. CPU & IB 17 | 2. Komodo1,2,3 18 | 3. 
can we do some GPU sampling when waiting for network 19 | """ 20 | os.environ['MASTER_ADDR'] = '155.198.152.17' 21 | os.environ['MASTER_PORT'] = '5678' 22 | 23 | os.environ["NCCL_SOCKET_IFNAME"] = "eth0" 24 | os.environ["TP_SOCKET_IFNAME"] = "eth0" 25 | os.environ["GLOO_SOCKET_IFNAME"] = "eth0" 26 | os.environ["TP_VERBOSE_LOGGING"] = "0" 27 | 28 | 29 | 30 | 31 | 32 | parser = argparse.ArgumentParser(description='python3 test.py -rank x -world_size x -cpu_collect True for test CPU') 33 | parser.add_argument('-rank', type=int, help='rank') 34 | parser.add_argument('-local_rank', type=int, default=0, help="local rank") 35 | parser.add_argument('-world_size', type=int, help="world size") 36 | parser.add_argument("-device_per_node", type=int, default=1, help ="device per node") 37 | parser.add_argument("-cpu_collect", type=int, default=0, help ="test for cpu collection") 38 | parser.add_argument("-cpu_collect_gpu_send", type=int, default=0, help ="send from gpu") 39 | parser.add_argument("-test_ib", type=int, default=1, help ="test IB") 40 | 41 | args = parser.parse_args() 42 | device_map = {} 43 | for idx in range(args.world_size): 44 | device_map[f"worker{idx}"] = {} 45 | for device_idx in range(args.device_per_node): 46 | device_map[f"worker{idx}"][device_idx] = device_idx 47 | 48 | print(f"Device Map: {device_map}") 49 | print(f"Rank {args.rank}: Test Mode Is {'CPU' if args.cpu_collect else 'GPU'}") 50 | """ 51 | All transports and channels we have: 52 | 53 | V0327 07:52:54.252611 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport ibv 54 | V0327 07:52:54.252761 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport uv 55 | V0327 07:52:54.261135 2716381 tensorpipe/core/context_impl.cc:81] Context worker0 is registering transport shm 56 | V0327 07:52:54.261295 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_basic 57 | V0327 07:52:54.262006 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_xth 58 | V0327 07:52:54.262173 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cma 59 | V0327 07:52:54.276424 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel cuda_ipc 60 | V0327 07:52:54.276447 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel basic 61 | V0327 07:52:54.278730 2716381 tensorpipe/core/context_impl.cc:104] Context worker0 is registering channel mpt_uv 62 | """ 63 | 64 | if args.cpu_collect and args.test_ib: 65 | # python3 test.py -cpu_collect 1 -test_ib 1 66 | print("Transports: IBV, Channel: BASIC") 67 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['basic']) 68 | elif args.cpu_collect: 69 | # python3 test.py -cpu_collect 1 -test_ib 0 70 | print("Transports: UV, Channel: MPT_UV") 71 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['uv'], _channels=['mpt_uv']) 72 | elif args.test_ib: 73 | # python3 test.py -cpu_collect 0 -test_ib 1 74 | print("Transports: IBV, Channel: CUDA_BASIC") 75 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['cuda_basic']) 76 | else: 77 | # python3 test.py -cpu_collect 0 -test_ib 0 78 | print("Transports: UV, Channel: CUDA_BASIC") 79 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, 
_transports=['uv'], _channels=['cuda_basic']) 80 | 81 | if args.cpu_collect and args.cpu_collect_gpu_send: 82 | 83 | # python3 test.py -cpu_collect 1 -test_ib 1 -cpu_collect_gpu_send 1 84 | print("CPU Collect and GPU Send, Update To: Transports: IBV, Channel: CUDA_BASIC") 85 | rpc_option = torch.distributed.rpc.TensorPipeRpcBackendOptions(device_maps=device_map, _transports=['ibv'], _channels=['cuda_basic']) 86 | 87 | debug_param = {"cpu_collect_gpu_send": args.cpu_collect_gpu_send} 88 | 89 | NUM_ELEMENT = 1000000 90 | FEATURE_DIM = 600 91 | SAMPLE_SIZE = 80000 92 | 93 | ######################### 94 | # Init With Numpy 95 | ######################## 96 | torch.cuda.set_device(args.local_rank) 97 | cached_ratio = 0.0 98 | cached_range = Range(0, int(cached_ratio * NUM_ELEMENT * args.world_size // args.device_per_node)) 99 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * args.world_size // args.device_per_node - cached_range.end) // (args.world_size // args.device_per_node) 100 | 101 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 102 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 103 | 104 | tensor = torch.from_numpy(host_tensor).type(torch.float32) 105 | 106 | 107 | shard_tensor_config = ShardTensorConfig({args.local_rank: "8G"}) 108 | shard_tensor = ShardTensor(args.local_rank, shard_tensor_config) 109 | shard_tensor.from_cpu_tensor(tensor) 110 | 111 | 112 | range_list = [] 113 | for idx in range(args.world_size // args.device_per_node): 114 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 115 | for _ in range(args.device_per_node): 116 | range_list.append(range_item) 117 | 118 | 119 | host_indice = np.random.randint(0, high= (args.world_size // args.device_per_node) * NUM_ELEMENT - 1, size=(SAMPLE_SIZE, )) 120 | indices = torch.from_numpy(host_indice).type(torch.long) 121 | 122 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * (args.world_size // args.device_per_node)) 123 | 124 | print(f"Whole Tensor Shape: {whole_tensor.shape}") 125 | print(f"Shard Tensor Shape: {shard_tensor.shape}") 126 | 127 | # TODO Just For Debugging 128 | if args.cpu_collect_gpu_send or not args.cpu_collect: 129 | indices = indices.to(args.local_rank) 130 | 131 | if args.cpu_collect or args.cpu_collect_gpu_send: 132 | print(f"Using CPU Collect") 133 | dist_tensor = DistTensorRPC(args.world_size, args.rank, args.device_per_node, args.local_rank, tensor, range_list, rpc_option, cached_range, **debug_param) 134 | else: 135 | dist_tensor = DistTensorRPC(args.world_size, args.rank, args.device_per_node, args.local_rank, shard_tensor, range_list, rpc_option, cached_range, **debug_param) 136 | 137 | warm_up = 4 138 | for idx in range(warm_up): 139 | data = dist_tensor[indices] 140 | 141 | test_count = 100 142 | consumed_time = 0 143 | data_times = [] 144 | for idx in range(test_count): 145 | start = time.time() 146 | data = dist_tensor[indices] 147 | data_times.append(time.time() - start) 148 | 149 | data_cpu = data.cpu() 150 | indices_cpu = indices.cpu() 151 | data_gt = whole_tensor[indices_cpu] 152 | 153 | assert torch.equal(data_gt, data_cpu) 154 | 155 | data_times = np.array(data_times) 156 | data_times = np.sort(data_times) 157 | data_times = data_times[int(0.1 * test_count): -int(0.1 * test_count)] 158 | consumed_time = np.sum(data_times) 159 | print(f"Bandwidth in Rank {args.rank} = {data_times.shape[0] * torch.numel(data) * 4 / 
1024 / 1024 / 1024 / consumed_time }GB/s") 160 | time.sleep(10) 161 | rpc.shutdown() 162 | -------------------------------------------------------------------------------- /tests/python/test_DistTensorServer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | import config 4 | 5 | 6 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 7 | 8 | data = torch.empty((config.NODE_COUNT, config.FEATURE_DIM), dtype=torch.float) 9 | for row in range(config.NODE_COUNT): 10 | data[row] = row 11 | 12 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, 2, config.QP_NUM) 13 | dist_tensor_server.serve_tensor(data) 14 | dist_tensor_server.join() 15 | -------------------------------------------------------------------------------- /tests/python/test_LocalTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.datasets import Reddit 3 | import os.path as osp 4 | import time 5 | from ogb.nodeproppred import PygNodePropPredDataset 6 | import quiver 7 | from quiver_feature import LocalTensorPGAS 8 | import quiver_feature 9 | 10 | def load_products(): 11 | root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products') 12 | dataset = PygNodePropPredDataset('ogbn-products', root) 13 | data = dataset[0] 14 | return data.x 15 | 16 | 17 | def load_reddit(): 18 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 19 | dataset = Reddit(path) 20 | data = dataset[0] 21 | return data.x 22 | 23 | def load_mag240_partition(): 24 | tensor = quiver_feature.shared_load("/data/dalong/front_half.pt") 25 | return tensor 26 | 27 | 28 | TEST_COUNT = 100 29 | SAMPLE_NUM = 80000 30 | 31 | def test_normal_feature_collect(dataset="reddit"): 32 | if dataset == "reddit": 33 | tensor = load_reddit() 34 | elif dataset == "mag240m": 35 | tensor = load_mag240_partition() 36 | else: 37 | tensor = load_products() 38 | 39 | consumed = 0 40 | res = None 41 | 42 | for _ in range(TEST_COUNT): 43 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cpu") 44 | start = time.time() 45 | res = tensor[indices] 46 | consumed += time.time() - start 47 | 48 | print(f"Throughput = {TEST_COUNT * res.numel() * 4 / consumed / 1024 / 1024 / 1024 :.4f} GB/s") 49 | 50 | def test_LocalTensorPGAS(dataset="reddit", device_nums = 1, device_cache_size = 0, cache_policy = "device_replicate"): 51 | 52 | print(f"Dataset: {dataset}, Device Num: {device_nums}, Device Cache Size: {device_cache_size}, Cache Policy: {cache_policy}") 53 | if dataset == "reddit": 54 | tensor = load_reddit() 55 | elif dataset == "mag240m": 56 | 57 | tensor = load_mag240_partition() 58 | else: 59 | tensor = load_products() 60 | 61 | tensor.share_memory_() 62 | 63 | local_tensor_pgas = LocalTensorPGAS(device_list=list(range(device_nums)), device_cache_size=device_cache_size, cache_policy=cache_policy) 64 | local_tensor_pgas.from_cpu_tensor(tensor) 65 | 66 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cuda:0") 67 | res = local_tensor_pgas[indices] 68 | torch.cuda.synchronize() 69 | 70 | consumed = 0 71 | res = None 72 | 73 | for _ in range(TEST_COUNT): 74 | indices = torch.randint(0, tensor.shape[0],(SAMPLE_NUM,), device="cuda:0") 75 | torch.cuda.synchronize() 76 | start = time.time() 77 | res = local_tensor_pgas[indices] 78 | torch.cuda.synchronize() 79 | consumed += time.time() - start 80 | 81 | 
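print(f"Throughput = {TEST_COUNT * res.numel() * tensor.element_size() / consumed / 1024 / 1024 / 1024 :.4f} GB/s")

# The synchronize/start/gather/synchronize pattern above is what makes these
# numbers meaningful: gathers issued on the GPU are asynchronous, so a bare
# time.time() pair would mostly measure launch latency. A sketch of the same
# measurement with CUDA events, which avoids the full device sync (illustrative
# only; the helper name is ours, not part of this repo):
def _time_gather_sketch(local_tensor_pgas, indices):
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    res = local_tensor_pgas[indices]
    end_evt.record()
    end_evt.synchronize()
    return res, start_evt.elapsed_time(end_evt) / 1000.0  # elapsed_time is in ms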
82 | 83 | 84 | if __name__ == "__main__": 85 | """ 86 | Set shm size as your whole memory size 87 | sudo mount -o remount,size=377G /dev/shm 88 | """ 89 | quiver.init_p2p([0, 1]) 90 | #test_normal_feature_collect() 91 | test_LocalTensorPGAS("mag240m", device_cache_size="30G", device_nums=2, cache_policy="p2p_clique_replicate") 92 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorClientServer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import torch 4 | import numpy as np 5 | import time 6 | import threading 7 | from typing import List 8 | import qvf 9 | import config 10 | import torch.multiprocessing as mp 11 | from quiver_feature import TensorEndPoint, Range 12 | from quiver_feature import DistHelper 13 | from quiver_feature import DistTensorPGAS 14 | 15 | MASTER_IP = "155.198.152.17" 16 | HLPER_PORT = 5678 17 | 18 | NUM_ELEMENT = 10000000 * 3 * 2 * 2 * 2 * 2 19 | FEATURE_DIM = 128 20 | SAMPLE_SIZE = 250000 21 | 22 | 23 | parser = argparse.ArgumentParser(description='') 24 | parser.add_argument('-server_rank', type=int, help='server_rank') 25 | parser.add_argument('-device_per_node', type=int, help="how many process per server") 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | def feature_process(rank, server_rank, tensor_endpoints, cached_range, SAMPLE_SIZE, FEATURE_DIM): 31 | 32 | torch.cuda.set_device(rank) 33 | peer_tensor_endpoint = None 34 | for tensor_endpoint in tensor_endpoints: 35 | if tensor_endpoint.server_rank != server_rank: 36 | peer_tensor_endpoint = tensor_endpoint 37 | break 38 | host_indice = np.random.randint(peer_tensor_endpoint.range.start, high= peer_tensor_endpoint.range.end, size=(SAMPLE_SIZE, )) 39 | indices = torch.from_numpy(host_indice).type(torch.long) 40 | 41 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 42 | dist_tensor = DistTensorPGAS(server_rank, tensor_endpoints, pipe_param, [SAMPLE_SIZE, FEATURE_DIM], None, cached_range) 43 | 44 | 45 | TEST_COUNT = 1 46 | start = time.time() 47 | consumed = 0 48 | for i in range(TEST_COUNT): 49 | 50 | host_indice = np.random.randint(peer_tensor_endpoint.range.start, high= peer_tensor_endpoint.range.end, size=(SAMPLE_SIZE, )) 51 | indices = torch.from_numpy(host_indice).type(torch.long) 52 | if config.TEST_TLB_OPTIMIZATION: 53 | indices, _ = torch.sort(indices) 54 | 55 | local_offsets = torch.arange(0, SAMPLE_SIZE) * 4 * FEATURE_DIM 56 | remote_offsets = (indices - peer_tensor_endpoint.range.start) * 4 * FEATURE_DIM 57 | 58 | start = time.time() 59 | dist_tensor.dist_tensor_client.sync_read(peer_tensor_endpoint.server_rank, dist_tensor.registered_tensor, local_offsets, remote_offsets) 60 | consumed += time.time() - start 61 | 62 | print(f"
Throughput = {dist_tensor.registered_tensor.numel() * 4 * TEST_COUNT/ 1024 / 1024 / consumed} MB/s") 63 | 64 | 65 | 66 | 67 | if __name__ == "__main__": 68 | 69 | 70 | SERVER_WORLD_SIZE = 2 71 | START_SERVER = True 72 | CACHE_RATIO = 0 73 | LOCAL_SERVER_RANK = args.server_rank 74 | 75 | 76 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * SERVER_WORLD_SIZE)) 77 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * SERVER_WORLD_SIZE - cached_range.end) // SERVER_WORLD_SIZE 78 | 79 | 80 | 81 | tensor = torch.empty((UNCACHED_NUM_ELEMENT + cached_range.end, FEATURE_DIM)) 82 | 83 | print(f"Check Tensor Size: {tensor.numel() * 4 / 1024 / 1024 / 1024} GB") 84 | 85 | 86 | 87 | # Decide Range Information 88 | range_list = [] 89 | for idx in range(SERVER_WORLD_SIZE): 90 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 91 | range_list.append(range_item) 92 | 93 | # Exchange information with each other 94 | dist_helper = DistHelper(MASTER_IP, HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 95 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 96 | 97 | # Start server thread 98 | def server_thread(dist_helper): 99 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, SERVER_WORLD_SIZE * args.device_per_node, config.QP_NUM) 100 | dist_tensor_server.serve_tensor(tensor) 101 | dist_helper.sync_start() 102 | dist_tensor_server.join() 103 | x = threading.Thread(target=server_thread, args=(dist_helper, )) 104 | x.daemon = True 105 | x.start() 106 | 107 | # Wait all servers start 108 | dist_helper.sync_end() 109 | 110 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 111 | 112 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(LOCAL_SERVER_RANK, tensor_endpoints_list, cached_range, SAMPLE_SIZE, FEATURE_DIM), join=True) 113 | 114 | time.sleep(10) 115 | dist_helper.sync_all() 116 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorPGAS.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import numpy as np 4 | import time 5 | from typing import List 6 | import config 7 | import torch.multiprocessing as mp 8 | from quiver_feature import TensorEndPoint, Range, PipeParam, DistTensorDeviceParam, DistTensorServerParam 9 | from quiver_feature import DistHelper 10 | from quiver_feature import DistTensorPGAS 11 | 12 | NUM_ELEMENT = 1000000 13 | FEATURE_DIM = 600 14 | SAMPLE_SIZE = 80000 15 | 16 | 17 | parser = argparse.ArgumentParser(description='') 18 | parser.add_argument('-server_rank', type=int, default=0, help='server_rank') 19 | parser.add_argument('-device_per_node', type=int, default=1, help="how many process per server") 20 | parser.add_argument('-server_world_size', type=int, default=1, help="world size") 21 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def feature_process(rank, dist_tensor, whole_tensor, SAMPLE_SIZE): 27 | 28 | torch.cuda.set_device(rank) 29 | host_indice = np.random.randint(0, high=dist_tensor.shape[0] - 1, size=(SAMPLE_SIZE, )) 30 | indices = torch.from_numpy(host_indice).type(torch.long) 31 | indices_device = indices.to(rank) 32 | 33 | # warm up 34 | data = dist_tensor[indices_device] 35 | torch.cuda.synchronize() 36 | TEST_COUNT = 1000 37 | start = time.time() 38 | 
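consumed = 0

# TEST_TLB_OPTIMIZATION (checked in the loop below) sorts the indices before the
# reads are posted, so the remote offsets form a near-monotone sweep over the
# registered region. Our reading, following docs/rdma_details.md, is that this
# keeps the NIC's address-translation structures (the MTT) hitting instead of
# thrashing; treat that as a hypothesis -- the loop measures the effect rather
# than assuming it.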
39 | for i in range(TEST_COUNT): 40 | host_indice = np.random.randint(0, high=dist_tensor.shape[0] - 1, size=(SAMPLE_SIZE, )) 41 | indices = torch.from_numpy(host_indice).type(torch.long) 42 | if config.TEST_TLB_OPTIMIZATION: 43 | indices, _ = torch.sort(indices) 44 | 45 | indices_device = indices.to(rank) 46 | torch.cuda.synchronize() 47 | 48 | start = time.time() 49 | data = dist_tensor[indices_device] 50 | torch.cuda.synchronize() 51 | consumed += time.time() - start 52 | assert torch.equal(data.cpu(), whole_tensor[indices]), "Result Check Failed!" 53 | 54 | 55 | print(f"Result Check Succeeded! Throughput = {data.numel() * data.element_size() * TEST_COUNT / 1024 / 1024 / consumed} MB/s") 56 | 57 | 58 | 59 | 60 | if __name__ == "__main__": 61 | 62 | 63 | SERVER_WORLD_SIZE = args.server_world_size 64 | START_SERVER = True 65 | CACHE_RATIO = args.cache_ratio 66 | LOCAL_SERVER_RANK = args.server_rank 67 | 68 | 69 | cached_range = Range(0, int(CACHE_RATIO * NUM_ELEMENT * SERVER_WORLD_SIZE)) 70 | UNCACHED_NUM_ELEMENT = (NUM_ELEMENT * SERVER_WORLD_SIZE - cached_range.end) // SERVER_WORLD_SIZE 71 | 72 | 73 | host_tensor = np.arange((UNCACHED_NUM_ELEMENT + cached_range.end ) * FEATURE_DIM) 74 | host_tensor = host_tensor.reshape((UNCACHED_NUM_ELEMENT + cached_range.end), FEATURE_DIM) 75 | host_tensor = host_tensor.astype(np.float16) 76 | tensor = torch.from_numpy(host_tensor) 77 | 78 | 79 | # Decide Range Information 80 | range_list = [] 81 | for idx in range(SERVER_WORLD_SIZE): 82 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 83 | range_list.append(range_item) 84 | 85 | 86 | # Exchange information with each other 87 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 88 | print("Exchange Tensor End Point Information With Other Ranks") 89 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 90 | 91 | 92 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 93 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * SERVER_WORLD_SIZE) 94 | 95 | 96 | device_param = DistTensorDeviceParam(device_list=list(range(args.device_per_node)), device_cache_size="4G", cache_policy="device_replicate") 97 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size=args.server_world_size) 98 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, tensor.shape[1]] 99 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 100 | 101 | dist_tensor = DistTensorPGAS(args.server_rank, tensor_endpoints_list, pipe_param, buffer_shape, cached_range, dtype=tensor.dtype) 102 | dist_tensor.from_cpu_tensor(tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 103 | 104 | 105 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(dist_tensor, whole_tensor, SAMPLE_SIZE), join=True) 106 | 107 | dist_helper.sync_all() 108 | -------------------------------------------------------------------------------- /tests/python/test_MultiMachineDistTensorRPC.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import os 4 | 5 | 6 | import multiprocessing as mp 7 | 8 | 9 | def run(command): 10 | os.system(command) 11 | 12 | """ 13 | 1. CPU & IB 14 | 2. Komodo1,2,3 15 | 3. 
can we do some GPU sampling when waiting for network 16 | """ 17 | os.environ['MASTER_ADDR'] = '155.198.152.17' 18 | os.environ['MASTER_PORT'] = '5678' 19 | 20 | os.environ["NCCL_SOCKET_IFNAME"] = "eth0" 21 | os.environ["TP_SOCKET_IFNAME"] = "eth0" 22 | os.environ["GLOO_SOCKET_IFNAME"] = "eth0" 23 | os.environ["TP_VERBOSE_LOGGING"] = "0" 24 | 25 | 26 | 27 | parser = argparse.ArgumentParser(description='python3 test.py -rank x -world_size x -cpu_collect True for test CPU') 28 | parser.add_argument('-world_size', type=int, help="world size") 29 | parser.add_argument("-device_per_node", type=int, default=1, help ="device per node") 30 | parser.add_argument("-cpu_collect", type=int, default=0, help ="test for cpu collection") 31 | parser.add_argument("-cpu_collect_gpu_send", type=int, default=0, help ="send from gpu") 32 | parser.add_argument("-test_ib", type=int, default=1, help ="test IB") 33 | parser.add_argument("-start_rank", type=int, default=0, help ="first global rank to assign on this node") 34 | 35 | args = parser.parse_args() 36 | 37 | command = f"python3 test_DistTensorRPC.py -device_per_node {args.device_per_node} -cpu_collect {args.cpu_collect} -cpu_collect_gpu_send {args.cpu_collect_gpu_send} -test_ib {args.test_ib} -world_size {args.world_size}" 38 | 39 | process_lst = [] 40 | for local_rank in range(args.device_per_node): 41 | run_command = command + f" -rank {args.start_rank + local_rank} -local_rank {local_rank}" 42 | print(f"Run Command: {run_command}") 43 | process = mp.Process(target=run, args=(run_command, )) 44 | process.start() 45 | process_lst.append(process) 46 | 47 | for process in process_lst: 48 | process.join() 49 | -------------------------------------------------------------------------------- /tests/python/test_PipeParam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | pipe_param = qvf.PipeParam(1, 1, 1, 1) 5 | print(f"ParamVec: {pipe_param.get_param_vec()}") 6 | 7 | pipe_param2 = qvf.PipeParam() 8 | pipe_param2.set_param_vec(pipe_param.get_param_vec()) 9 | print(f"ParamVec: {pipe_param2.get_param_vec()}") 10 | 11 | -------------------------------------------------------------------------------- /tests/python/test_RealDataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch_geometric.datasets import Reddit 3 | import os.path as osp 4 | import time 5 | from ogb.nodeproppred import PygNodePropPredDataset 6 | 7 | import argparse 8 | 9 | import numpy as np 10 | 11 | from typing import List 12 | import config 13 | import torch.multiprocessing as mp 14 | from quiver_feature import TensorEndPoint, Range, PipeParam, DistTensorServerParam, DistTensorDeviceParam 15 | from quiver_feature import DistHelper 16 | from quiver_feature import DistTensorPGAS 17 | 18 | 19 | def load_products(): 20 | root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'products') 21 | dataset = PygNodePropPredDataset('ogbn-products', root) 22 | data = dataset[0] 23 | return data.x 24 | 25 | 26 | def load_reddit(): 27 | path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Reddit') 28 | dataset = Reddit(path) 29 | data = dataset[0] 30 | return data.x 31 | 32 | def load_paper100M(dataset="paper100m", cache_ratio = 0.0, method="certain"): 33 | return torch.load(f"/data/dalong/sorted_feature_{dataset}_{method}_{cache_ratio:.2f}.pt") 34 | 35 | SAMPLE_SIZE = 80000 36 | 37 | 38 | parser = 
argparse.ArgumentParser(description='') 39 | parser.add_argument('-server_rank', type=int, default=0, help='server_rank') 40 | parser.add_argument('-device_per_node', type=int, default=1, help="how many process per server") 41 | parser.add_argument('-server_world_size', type=int, default=1, help="world size") 42 | parser.add_argument("-cache_ratio", type=float, default=0.0, help ="how much data you want to cache") 43 | 44 | args = parser.parse_args() 45 | 46 | 47 | def feature_process(rank, dist_tensor, whole_tensor, SAMPLE_SIZE): 48 | 49 | torch.cuda.set_device(rank) 50 | host_indice = np.random.randint(0, high= dist_tensor.size(0) - 1, size=(SAMPLE_SIZE, )) 51 | indices = torch.from_numpy(host_indice).type(torch.long) 52 | indices_device = indices.to(rank) 53 | 54 | # warm up 55 | data = dist_tensor[indices_device] 56 | torch.cuda.synchronize() 57 | TEST_COUNT = 1000 58 | start = time.time() 59 | consumed = 0 60 | for i in range(TEST_COUNT): 61 | host_indice = np.random.randint(0, high= dist_tensor.size(0) - 1, size=(SAMPLE_SIZE, )) 62 | indices = torch.from_numpy(host_indice).type(torch.long) 63 | if config.TEST_TLB_OPTIMIZATION: 64 | indices, _ = torch.sort(indices) 65 | indices_device = indices.to(rank) 66 | torch.cuda.synchronize() 67 | 68 | start = time.time() 69 | data = dist_tensor[indices_device] 70 | torch.cuda.synchronize() 71 | consumed += time.time() - start 72 | 73 | data = data.cpu() 74 | data_gt = whole_tensor[indices] 75 | 76 | assert torch.equal(data, data_gt), "Result Check Failed!" 77 | 78 | print(f"Result Check Succeeded! Throughput = {data.numel() * 4 * TEST_COUNT / 1024 / 1024 / consumed} MB/s") 79 | 80 | 81 | 82 | 83 | if __name__ == "__main__": 84 | 85 | 86 | tensor = load_reddit() 87 | SERVER_WORLD_SIZE = args.server_world_size 88 | START_SERVER = True 89 | CACHE_RATIO = 0 90 | LOCAL_SERVER_RANK = args.server_rank 91 | TOTAL_NODE_SIZE = tensor.shape[0] 92 | 93 | 94 | cached_range = Range(0, int(CACHE_RATIO * TOTAL_NODE_SIZE)) 95 | UNCACHED_NUM_ELEMENT = (TOTAL_NODE_SIZE - cached_range.end) // SERVER_WORLD_SIZE 96 | 97 | 98 | # Decide Range Information 99 | range_list = [] 100 | for idx in range(SERVER_WORLD_SIZE): 101 | range_item = Range(cached_range.end + UNCACHED_NUM_ELEMENT * idx, cached_range.end + UNCACHED_NUM_ELEMENT * (idx + 1)) 102 | range_list.append(range_item) 103 | 104 | # Build local_tensor 105 | local_tensor = torch.cat([tensor[cached_range.start: cached_range.end], tensor[range_list[args.server_rank].start: range_list[args.server_rank].end]]).share_memory_() 106 | 107 | # Exchange information with each other 108 | dist_helper = DistHelper(config.MASTER_IP, config.HLPER_PORT, SERVER_WORLD_SIZE, LOCAL_SERVER_RANK) 109 | tensor_endpoints_list: List[TensorEndPoint] = dist_helper.exchange_tensor_endpoints_info(range_list[LOCAL_SERVER_RANK]) 110 | 111 | print(f"Check All TensorEndPoints {tensor_endpoints_list}") 112 | whole_tensor = torch.cat([tensor[:cached_range.end, ]] + [tensor[cached_range.end:, ]] * SERVER_WORLD_SIZE) 113 | 114 | device_param = DistTensorDeviceParam(device_list=list(range(args.device_per_node)), device_cache_size="4G", cache_policy="device_replicate") 115 | server_param = DistTensorServerParam(port_num=config.PORT_NUMBER, server_world_size=args.server_world_size) 116 | buffer_shape = [np.prod(config.SAMPLE_PARAM) * config.BATCH_SIZE, local_tensor.shape[1]] 117 | pipe_param = PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 118 | 119 | dist_tensor = DistTensorPGAS(args.server_rank, 
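tensor_endpoints_list, pipe_param, buffer_shape, cached_range)

# from_cpu_tensor below appears to bundle what test_MultiMachineDistTensorClientServer.py
# does by hand: start a qvf.DistTensorServer over the shared local shard, synchronize
# the servers through dist_helper, and build the per-device caches described by
# device_param. Roughly (our reading of the call, not a documented contract):
#
#   dist_tensor_server = qvf.DistTensorServer(port_num, total_client_count, qp_num)
#   dist_tensor_server.serve_tensor(local_tensor)  # register with the NIC + serve over RDMA
#   dist_helper.sync_start()                       # wait until every server is up
#   # ...then device caches are filled and remote reads may begin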
120 | dist_tensor.from_cpu_tensor(local_tensor, dist_helper=dist_helper, server_param=server_param, device_param=device_param) 121 | 122 | 123 | 124 | mp.spawn(feature_process, nprocs=args.device_per_node, args=(dist_tensor, whole_tensor, SAMPLE_SIZE), join=True) 125 | 126 | dist_helper.sync_all() 127 | -------------------------------------------------------------------------------- /tests/python/test_RegisteredTensorTransfer.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import qvf 4 | import threading 5 | import config 6 | 7 | import time 8 | 9 | 10 | def server_thread(): 11 | print("Start Server Thread") 12 | data = torch.empty((config.NODE_COUNT, config.FEATURE_DIM), dtype=torch.float) 13 | dist_tensor_server = qvf.DistTensorServer(config.PORT_NUMBER, 1, config.QP_NUM) 14 | dist_tensor_server.serve_tensor(data) 15 | time.sleep(10) 16 | 17 | x = threading.Thread(target=server_thread) 18 | x.daemon = True 19 | x.start() 20 | 21 | pipe_param = qvf.PipeParam(config.QP_NUM, config.CTX_POLL_BATCH, config.TX_DEPTH, config.POST_LIST_SIZE) 22 | local_com_endpoint = qvf.ComEndPoint(0, config.MASTER_IP, config.PORT_NUMBER) 23 | remote_com_endpoint = qvf.ComEndPoint(1, config.MASTER_IP, config.PORT_NUMBER) 24 | dist_tensor_client = qvf.DistTensorClient(0, [local_com_endpoint, remote_com_endpoint], pipe_param) 25 | registered_tensor = torch.zeros((config.SAMPLE_NUM, config.FEATURE_DIM)) 26 | registered_tensor = registered_tensor.pin_memory() 27 | dist_tensor_client.register_float32_tensor(registered_tensor) 28 | 29 | data_cuda = registered_tensor.cuda() 30 | torch.cuda.synchronize() 31 | 32 | start = time.time() 33 | data_cuda = registered_tensor.cuda() 34 | torch.cuda.synchronize() 35 | consumed = time.time() - start 36 | 37 | print(f"Transfer Throughput is {data_cuda.numel() * 4 / 1024 / 1024 / 1024 / consumed} GB/s") 38 | -------------------------------------------------------------------------------- /tests/python/test_SharedLoader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import quiver_feature 3 | import psutil 4 | import time 5 | from multiprocessing import Process 6 | import os 7 | 8 | def measure_process(parent_process_id): 9 | consumed = 0 10 | start = time.time() 11 | mem_use_lst = [] 12 | 13 | while consumed < 20: 14 | mem_use = psutil.Process(parent_process_id).memory_info().rss / 1024 / 1024 15 | time.sleep(0.0001) 16 | consumed += time.time() - start 17 | start = time.time() 18 | mem_use_lst.append(mem_use) 19 | 20 | print(f"Max Memory Usage: {max(mem_use_lst)} MB") 21 | 22 | def check_shared(t: torch.Tensor): 23 | print('tensor.is_shared() = {}'.format(t.is_shared())) 24 | 25 | def save_huge_tensor(): 26 | a = torch.zeros((10, 1024, 1024, 256)) 27 | torch.save(a, 'huge.pt') 28 | 29 | def torch_load_huge_shared_tensor(): 30 | a = torch.load('huge.pt') 31 | print(f"Original Data Size = {a.numel() * 4 / 1024 / 1024} MB") 32 | 33 | print(f"Before Shared:", end="\t") 34 | check_shared(a) 35 | 36 | a.share_memory_() 37 | 38 | print(f"After Shared:", end="\t") 39 | check_shared(a) 40 | 41 | del a 42 | 43 | 44 | def qvf_load_huge_shared_tensor(): 45 | a = quiver_feature.shared_load('huge.pt') 46 | print(f"Original Data Size = {a.numel() * 4 / 1024 / 1024} MB") 47 | 48 | print(f"Before Shared:", end="\t") 49 | check_shared(a) 50 | 51 | a.share_memory_() 52 | 53 | print(f"After 
Shared:", end="\t") 54 | check_shared(a) 55 | 56 | del a 57 | 58 | if __name__ == '__main__': 59 | #save_huge_tensor() 60 | 61 | sub_process = Process(target=measure_process, args=(os.getpid(),)) 62 | sub_process.start() 63 | 64 | # Test Pytorch's Data Loading 65 | #torch_load_huge_shared_tensor() 66 | 67 | # Test Quiver-Feature's SharedLoader 68 | qvf_load_huge_shared_tensor() 69 | 70 | sub_process.join() -------------------------------------------------------------------------------- /tests/python/test_TensorEndPoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import qvf 3 | 4 | tensor_endpoint = qvf.TensorEndPoint("localhost", 3341, 0, 0, 60000) 5 | --------------------------------------------------------------------------------