├── .gitignore ├── doc ├── pic │ └── GEMM_pic.png ├── INSTALL.md └── EXP.md ├── .gitmodules ├── src ├── applications │ └── gemm │ │ ├── include │ │ ├── lightbam.cuh │ │ ├── ioctl.h │ │ ├── request.cuh │ │ ├── device.cuh │ │ ├── log.cuh │ │ ├── gemm.cuh │ │ ├── queue.cuh │ │ ├── util.cuh │ │ └── controller.cuh │ │ ├── Makefile │ │ ├── test.sh │ │ ├── gemm │ │ ├── cam_gemm.cu │ │ └── spdk_gemm.cu │ │ └── src │ │ ├── device.cu │ │ ├── queue.cu │ │ ├── controller.cu │ │ ├── controller_decouple.cu │ │ └── controller_legacy.cu ├── GPU_memory_lib │ ├── Makefile │ ├── GPU_memory_management.hpp │ └── GPU_memory_management.cpp ├── benchmarks │ ├── CAM_variable_core_benchmark │ │ ├── Makefile │ │ ├── variable_core_test_read.cu │ │ └── variable_core_test_write.cu │ └── CAM_benchmark │ │ ├── Makefile │ │ ├── test_seq_write.cu │ │ ├── test_random_read.cu │ │ ├── test_random_write.cu │ │ └── test_seq_read.cu ├── CAM_variable_core_lib │ ├── Makefile │ ├── CAM_variable_core.h │ └── threadPool.h └── CAM_lib │ ├── Makefile │ ├── sample_read.cu │ ├── sample_write.cu │ ├── CAM_interface.h │ ├── threadPool.h │ ├── gpu_transfer.cuh │ └── gpu_transfer.cu ├── run_GEMM.sh └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /doc/pic/GEMM_pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RC4ML/CAM/HEAD/doc/pic/GEMM_pic.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "spdk"] 2 | path = spdk 3 | url = https://github.com/RC4ML/spdk.git 4 | [submodule "gdrcopy"] 5 | path = gdrcopy 6 | url = https://github.com/RC4ML/gdrcopy.git 7 | -------------------------------------------------------------------------------- /src/applications/gemm/include/lightbam.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_CUH__ 2 | #define __AEOLUS_CUH__ 3 | 4 | #include "controller.cuh" 5 | #include "device.cuh" 6 | #include "log.cuh" 7 | #include "queue.cuh" 8 | #include "util.cuh" 9 | #include "request.cuh" 10 | 11 | #endif -------------------------------------------------------------------------------- /run_GEMM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export AEOLUS_LOG_LEVEL=ERROR 3 | B=16k 4 | ssd=6 5 | IO=32k 6 | 7 | for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 8 | do 9 | for ((i=0; i<3; i++)) 10 | do 11 | .build/application/gemm/gemm-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 12 | if [ $? 
-ne 0 ]; then 13 | echo "Failed at $N" 14 | # exit 1 15 | fi 16 | done 17 | done 18 | echo "All tests done" -------------------------------------------------------------------------------- /src/applications/gemm/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | INCLUDE = -I./include 3 | FLAGS = -D__DEBUG__ -rdc=true -g -lcublas 4 | OUTPUT_DIR = ../../../build/application/gemm 5 | LIB_PATH = ../../../build/lib 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | 10 | gemm-test: src/queue.cu src/device.cu src/controller.cu src/controller_decouple.cu src/controller_legacy.cu gemm/cam_gemm.cu 11 | $(NVCC) $(INCLUDE) $(FLAGS) -I../../GPU_memory_lib -I ../../../src/CAM_lib -L../../../spdk/build/lib -L ../../../build/lib -lCAM_interface $^ -o $(OUTPUT_DIR)/$@ 12 | 13 | 14 | 15 | 16 | clean: 17 | rm -f gemm-test -------------------------------------------------------------------------------- /src/GPU_memory_lib/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | 3 | CUDAFLAGS = -arch=sm_80 -Xcompiler=-fPIC,-mavx512f 4 | CUDALDFLAGS = -lgdrapi -lcuda -lcudart 5 | 6 | # Output directory 7 | OUTPUT_DIR = ../../build/lib 8 | 9 | # Source and object files 10 | SOURCE1 = GPU_memory_management.cpp 11 | OBJECT1 = $(OUTPUT_DIR)/$(SOURCE1:.cpp=.o) 12 | TARGET1 = $(OUTPUT_DIR)/libgpu_memory_management.so 13 | 14 | # Default rule 15 | all: $(TARGET1) 16 | 17 | # Rule to compile the source file into an object file 18 | $(OBJECT1): $(SOURCE1) 19 | $(NVCC) $(CUDAFLAGS) -c $< -o $@ 20 | 21 | # Rule to link the object file and create the shared library 22 | $(TARGET1): $(OBJECT1) 23 | $(NVCC) -shared $(OBJECT1) -o $@ $(CUDALDFLAGS) 24 | 25 | # Clean rule to remove generated files 26 | clean: 27 | rm -f $(OBJECT1) $(TARGET1) -------------------------------------------------------------------------------- /src/applications/gemm/include/ioctl.h: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_IOCTL_H 2 | #define __AEOLUS_IOCTL_H 3 | #ifdef __linux__ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define NVM_IOCTL_TYPE 0x80 11 | 12 | 13 | 14 | /* Memory map request */ 15 | struct nvm_ioctl_map 16 | { 17 | uint64_t vaddr_start; 18 | size_t n_pages; 19 | uint64_t* ioaddrs; 20 | }; 21 | 22 | 23 | 24 | /* Supported operations */ 25 | enum nvm_ioctl_type 26 | { 27 | NVM_MAP_HOST_MEMORY = _IOW(NVM_IOCTL_TYPE, 1, struct nvm_ioctl_map), 28 | #ifdef _CUDA 29 | NVM_MAP_DEVICE_MEMORY = _IOW(NVM_IOCTL_TYPE, 2, struct nvm_ioctl_map), 30 | #endif 31 | NVM_UNMAP_MEMORY = _IOW(NVM_IOCTL_TYPE, 3, uint64_t) 32 | }; 33 | 34 | 35 | #endif /* __linux__ */ 36 | #endif -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/Makefile: -------------------------------------------------------------------------------- 1 | # Define variables for include, library, and output paths 2 | INCLUDE_PATH = ../../CAM_variable_core_lib 3 | LIB_PATH = ../../../build/lib 4 | OUTPUT_DIR = ../../../build/benchmarks/CAM_variable_core_benchmark 5 | 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | # Compiler 10 | NVCC = nvcc 11 | 12 | # Compiler flags 13 | NVCC_FLAGS = -I $(INCLUDE_PATH) -I../../GPU_memory_lib -L $(LIB_PATH) -L../../../spdk/build/lib -lCAM_variable_core 14 | 15 | 16 | 17 | 18 | TARGET1 = variable_core_test_read 19 | SOURCE1 
= variable_core_test_read.cu 20 | 21 | TARGET2 = variable_core_test_write 22 | SOURCE2 = variable_core_test_write.cu 23 | 24 | 25 | 26 | # Default rule 27 | all: $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) 28 | 29 | $(OUTPUT_DIR)/$(TARGET1): $(SOURCE1) 30 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 31 | 32 | $(OUTPUT_DIR)/$(TARGET2): $(SOURCE2) 33 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 34 | 35 | 36 | 37 | # Clean rule 38 | clean: 39 | rm -f $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) -------------------------------------------------------------------------------- /src/applications/gemm/include/request.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_REQUEST_CUH 2 | #define __AEOLUS_REQUEST_CUH 3 | 4 | #include 5 | #include "util.cuh" 6 | 7 | enum aeolus_access_dir 8 | { 9 | AEOLUS_DIR_READ = 0, 10 | AEOLUS_DIR_WRITE = 1 11 | }; 12 | 13 | class Request 14 | { 15 | public: 16 | uint64_t start_lb; // starting logical block 17 | uint64_t dest_addr; // physical address (pinned buffer) / virtual address (non-pinned buffer) of destination 18 | uint64_t next_addr; // only valid for pinned buffer, next page of dest_addr (io_size <= 8KB) / prp list offset (io_size > 8KB) 19 | int num_items; // number of logical blocks 20 | 21 | inline __host__ __device__ Request(uint64_t start_lb, int num_items) 22 | { 23 | this->start_lb = start_lb; 24 | this->num_items = num_items; 25 | // You may need to call cudaLimitMallocHeapSize beforehand in this implementation. 26 | // this->dest_addr = (uint64_t*)malloc(sizeof(uint64_t)*num_items); 27 | } 28 | 29 | inline __host__ __device__ bool operator<(const Request& other) const 30 | { 31 | return this->start_lb < other.start_lb; 32 | } 33 | 34 | inline __host__ __device__ ~Request() 35 | { 36 | // free(this->dest_addr); 37 | } 38 | }; 39 | 40 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export AEOLUS_LOG_LEVEL=ERROR 3 | B=16k 4 | ssd=6 5 | IO=32k 6 | # dd if=/dev/zero of=/share/data6/a bs=1M count=64K 7 | # dd if=/dev/zero of=/share/data6/b bs=1M count=64K 8 | #for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 9 | # for N in 32768 49152 65536 81920 98304 114688 131072 163840 196608 229376 262144 10 | for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 11 | do 12 | for ((i=0; i<1; i++)) 13 | do 14 | # ./gemm-test-pinned $N $N $N 0 256g 512g $B $IO $ssd 15 | # ./gemm-cublasxt $N $N $N 0 256g 512g $IO $ssd 16 | # ./gemm-cublas-gds $N $N $N 17 | # ~/bam/build.new/bin/nvm-gemm-bench --m=$N --n=$N --k=$N --a_offset=0 --b_offset=274877906944 --c_offset=549755813888 --block_size=16384 --page_size=32768 --blk_size=512 --queue_depth=4096 --pages=524288 --num_queues=128 --threads=4194304 --n_ctrls=$ssd --ssd=1 | grep result 18 | #./gemm-gds-no-batch $N $N $N /share/data6/a /share/data6/b /share/data6/c $B 19 | #./gemm-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 20 | ./gemm-spdk-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 21 | if [ $? 
-ne 0 ]; then 22 | echo "Failed at $N" 23 | # exit 1 24 | fi 25 | done 26 | done 27 | echo "All tests done" -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/Makefile: -------------------------------------------------------------------------------- 1 | # Define variables for include, library, and output paths 2 | INCLUDE_PATH = ../../CAM_lib 3 | LIB_PATH = ../../../build/lib 4 | OUTPUT_DIR = ../../../build/benchmarks/CAM_benchmark 5 | 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | # Compiler 10 | NVCC = nvcc 11 | 12 | # Compiler flags 13 | NVCC_FLAGS = -I $(INCLUDE_PATH) -I../../GPU_memory_lib -L $(LIB_PATH) -L../../../spdk/build/lib -lCAM_interface 14 | 15 | 16 | 17 | TARGET1 = test_seq_read 18 | SOURCE1 = test_seq_read.cu 19 | 20 | TARGET2 = test_seq_write 21 | SOURCE2 = test_seq_write.cu 22 | 23 | TARGET3 = test_random_read 24 | SOURCE3 = test_random_read.cu 25 | 26 | TARGET4 = test_random_write 27 | SOURCE4 = test_random_write.cu 28 | 29 | # Default rule 30 | all: $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) $(OUTPUT_DIR)/$(TARGET3) $(OUTPUT_DIR)/$(TARGET4) 31 | 32 | $(OUTPUT_DIR)/$(TARGET1): $(SOURCE1) 33 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 34 | 35 | $(OUTPUT_DIR)/$(TARGET2): $(SOURCE2) 36 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 37 | 38 | $(OUTPUT_DIR)/$(TARGET3): $(SOURCE3) 39 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 40 | 41 | $(OUTPUT_DIR)/$(TARGET4): $(SOURCE4) 42 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 43 | 44 | # Clean rule 45 | clean: 46 | rm -f $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) $(OUTPUT_DIR)/$(TARGET3) $(OUTPUT_DIR)/$(TARGET4) -------------------------------------------------------------------------------- /src/GPU_memory_lib/GPU_memory_management.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GPU_MEMORY_MANAGEMENT 2 | #define GPU_MEMORY_MANAGEMENT 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | 13 | class MemCtl { 14 | public: 15 | virtual ~MemCtl() = default; 16 | 17 | [[nodiscard]] size_t getPoolSize() const { 18 | return pool_size; 19 | } 20 | 21 | void *alloc(size_t size); 22 | 23 | void free(void *ptr); 24 | 25 | protected: 26 | MemCtl() = default; 27 | 28 | size_t pool_size{}; 29 | 30 | std::mutex allocMutex; 31 | /*<首地址, 块大小>*/ 32 | std::map free_chunk, used_chunk; 33 | /* n_pages, virt_addr_base, phy_addr_array */ 34 | std::tuple page_table; 35 | }; 36 | 37 | class GPUMemCtl : public MemCtl { 38 | public: 39 | ~GPUMemCtl() override; 40 | 41 | static GPUMemCtl *getInstance(int32_t dev_id, size_t pool_size); 42 | [[maybe_unused]] static void cleanCtx(); 43 | 44 | protected: 45 | explicit GPUMemCtl(uint64_t size); 46 | 47 | public: 48 | /* 49 | * void(uint32_t, uint32_t, uint64_t, uint64_t) => (page_index, page_size, virt_addr, phy_addr) 50 | */ 51 | void writeTLB(const std::function &func, bool aggr_flag); 52 | 53 | uint64_t mapV2P(void *ptr); 54 | 55 | void *getDevPtr() const; 56 | void *getMapDevPtr() const; 57 | 58 | bool chechPhyContiguous() const; 59 | 60 | }; 61 | 62 | #endif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CAM: Asynchronous GPU-Initiated, CPU-Managed SSD Management for Batching Storage Access 2 | 3 | 4 | This is the source code for our paper. 
5 | 6 | 7 | 8 | ## Required hardware and software 9 | 10 | - NVIDIA 80GB-PCIe-A100 11 | - HugePage: At least 4096 huge pages 12 | - g++ >= 11.3.0 13 | 14 | 15 | 16 | ## Install Dependencies and Build 17 | See [INSTALL.md](./doc/INSTALL.md) for how to install dependencies and build CAM on a single machine. 18 | 19 | 20 | ## Run Test 21 | If the configuration check in the Run Experiments section of [EXP.md](./doc/EXP.md) passes, then everything is set up correctly. Please refer to [EXP.md](./doc/EXP.md) for more details. 22 | 23 | 24 | ### Directory Structure: 25 | 26 | ~~~ 27 | . 28 | ├── spdk (our modified SPDK driver) 29 | ├── gdrcopy (our modified gdrcopy driver) 30 | ├── build.sh (script for compiling the CAM code project) 31 | ├── doc (documentation on how to install and conduct experiments with CAM) 32 | ├── README.md 33 | └── src 34 | ├── benchmarks 35 | │ ├── CAM_benchmark (microbenchmark for CAM, one thread controls one SSD) 36 | │ │ 37 | │ └── CAM_variable_core_benchmark (microbenchmark for CAM, one thread controls a variable number of SSDs) 38 | │ 39 | ├── CAM_lib (source code of CAM) 40 | │ 41 | ├── CAM_variable_core_lib (source code of CAM with one thread controlling a variable number of SSDs) 42 | │ 43 | └── GPU_memory_lib (source code for GPU memory management used in CAM) 44 | │ 45 | └── applications 46 | └── gemm (end-to-end test in the GEMM application) 47 | 48 | ~~~ 49 | 50 | 51 | 52 | 53 | 54 | ### Getting help 55 | 56 | Work in progress... 57 | 58 | 59 | 60 | ### Contact 61 | 62 | Email: songziyu@zju.edu.cn 63 | 64 | 65 | -------------------------------------------------------------------------------- /doc/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | This document describes all of the essential software installation steps on the test machine. 4 | 5 | ## 1. Install gdrcopy Driver: 6 | 7 | 8 | ~~~bash 9 | cd gdrcopy 10 | make 11 | make install 12 | ~~~ 13 | 14 | 15 | ## 2. Config Hugepages 16 | 17 | Our system needs enough hugepages; before running any application, allocate hugepages first. 18 | 19 | ~~~bash 20 | sudo sh -c "echo 32768 > /proc/sys/vm/nr_hugepages" 21 | ~~~ 22 | 23 | 24 | ## 3. Build SPDK: 25 | 26 | ~~~bash 27 | cd spdk 28 | git submodule update --init 29 | ./configure --with-shared 30 | make 31 | ~~~ 32 | 33 | ## 4. Install SPDK Driver on NVMe SSDs 34 | ~~~bash 35 | cd spdk/scripts 36 | sudo ./setup.sh 37 | ~~~ 38 | 39 | ## 5. Build all CAM software 40 | 41 | Now we create a `build` directory and build all the software in it. 42 | ~~~bash 43 | mkdir build 44 | bash build.sh 45 | ~~~ 46 | 47 | It should report no errors, and the output binaries will be placed in the `build` directory. 48 | 49 | 50 | ## 6. Uninstall SPDK Driver on NVMe SSDs 51 | 52 | After the experiments, you need to uninstall the SPDK driver: 53 | ~~~bash 54 | cd spdk/scripts 55 | sudo ./setup.sh reset 56 | ~~~ 57 | 58 | ## 7. Extra Attention 59 | 60 | Before installing the SPDK driver, please ensure that there is no data present on the SSD. 61 | 62 | During the installation of the SPDK driver on NVMe SSDs, some SSDs may fail to bind to the SPDK driver. This is mainly because the SSD is already mounted or has a file system on it. Therefore, it is necessary to unmount the SSD and wipe the file system before installing the SPDK driver. 
63 | 64 | To unmount and wipe the file system on the selected SSDs, use the following command: 65 | ~~~bash 66 | sudo umount /dev/nvmeXn1 67 | sudo wipefs -a /dev/nvmeXn1 68 | ~~~ -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/Makefile: -------------------------------------------------------------------------------- 1 | SPDK_DIR = ../../spdk 2 | OUTPUT_DIR = ../../build/lib 3 | 4 | 5 | .PHONY:all 6 | 7 | 8 | 9 | $(shell mkdir -p $(OUTPUT_DIR)) 10 | exe: 11 | 12 | 13 | g++ --std=c++17 -mcmodel=medium CAM_variable_core.h CAM_variable_core.cpp -o $(OUTPUT_DIR)/libCAM_variable_core.so -shared -lcudart \ 14 | -L/usr/local/cuda/lib64 -L../../build/lib -lgpu_memory_management -I../GPU_memory_lib \ 15 | -g -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -fno-strict-aliasing \ 16 | -I$(SPDK_DIR)/include -march=native -Wformat -Wformat-security -D_GNU_SOURCE -fPIC -fstack-protector -fno-common \ 17 | -I$(SPDK_DIR)/isa-l/.. -I$(SPDK_DIR)/isalbuild -I$(SPDK_DIR)/isa-l-crypto/.. \ 18 | -I$(SPDK_DIR)/isalcryptobuild -DNDEBUG -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -DSPDK_GIT_COMMIT=6ee9cd257 -pthread -std=c++17 \ 19 | -Wl,-z,relro,-z,now -Wl,-z,noexecstack -fuse-ld=bfd -Wl,-rpath=$(SPDK_DIR)/isa-l/.libs -Wl,-rpath=$(SPDK_DIR)/isa-l-crypto/.libs \ 20 | -L$(SPDK_DIR)/build/lib -Wl,--whole-archive -Wl,--no-as-needed -lspdk_sock_posix -lspdk_nvme -lspdk_keyring -lspdk_sock -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_dma -lspdk_vmd -lspdk_util -lspdk_log \ 21 | -Wl,--no-whole-archive $(SPDK_DIR)/build/lib/libspdk_env_dpdk.so -Wl,--no-as-needed $(SPDK_DIR)/dpdk/build/lib/librte_bus_pci.so $(SPDK_DIR)/dpdk/build/lib/librte_cryptodev.so \ 22 | $(SPDK_DIR)/dpdk/build/lib/librte_dmadev.so $(SPDK_DIR)/dpdk/build/lib/librte_eal.so $(SPDK_DIR)/dpdk/build/lib/librte_ethdev.so \ 23 | $(SPDK_DIR)/dpdk/build/lib/librte_hash.so $(SPDK_DIR)/dpdk/build/lib/librte_kvargs.so $(SPDK_DIR)/dpdk/build/lib/librte_log.so $(SPDK_DIR)/dpdk/build/lib/librte_mbuf.so \ 24 | $(SPDK_DIR)/dpdk/build/lib/librte_mempool.so $(SPDK_DIR)/dpdk/build/lib/librte_mempool_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_net.so $(SPDK_DIR)/dpdk/build/lib/librte_pci.so \ 25 | $(SPDK_DIR)/dpdk/build/lib/librte_power.so $(SPDK_DIR)/dpdk/build/lib/librte_rcu.so $(SPDK_DIR)/dpdk/build/lib/librte_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_telemetry.so \ 26 | $(SPDK_DIR)/dpdk/build/lib/librte_vhost.so -Wl,-as-needed -Wl,-rpath=$(SPDK_DIR)/dpdk/build/lib -L$(SPDK_DIR)/isa-l/.libs -lisal -L$(SPDK_DIR)/isa-l-crypto/.libs -lisal_crypto -pthread -lrt -luuid -lssl -lcrypto -lm -lfuse3 -lkeyutils -laio 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/CAM_variable_core.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef GPUSSD_BASELINE_H 4 | #define GPUSSD_BASELINE_H 5 | 6 | 7 | 8 | #include "GPU_memory_management.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "threadPool.h" 16 | 17 | #include "spdk/stdinc.h" 18 | 19 | #include "spdk/env.h" 20 | #include "spdk/log.h" 21 | #include "spdk/nvme.h" 22 | #include "spdk/nvme_zns.h" 23 | #include "spdk/string.h" 24 | #include "spdk/vmd.h" 25 | 26 | 27 | #define MAX_EMBED_NUM 1000000 28 | struct ctrlr_entry { 29 | struct spdk_nvme_ctrlr* ctrlr; 30 | char name[1024]; 31 | }; 32 | 33 | struct ns_entry { 34 | int32_t id; 35 | struct spdk_nvme_ctrlr* 
ctrlr; 36 | struct spdk_nvme_ns* ns; 37 | struct spdk_nvme_qpair* qpair; 38 | }; 39 | 40 | 41 | 42 | 43 | 44 | static void read_complete(void* arg, const struct spdk_nvme_cpl* completion) ; 45 | //static int thread_runner(int32_t dev_index); 46 | static int thread_runner2(int32_t dev_index) ; 47 | static bool probe_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr_opts* opts); 48 | static void attach_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr* ctrlr, const struct spdk_nvme_ctrlr_opts* opts); 49 | static void register_ns(struct spdk_nvme_ctrlr* ctrlr, struct spdk_nvme_ns* ns) ; 50 | void task_submit(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 51 | // void task_submit(int64_t embed_num, int32_t *embed_id, void *dev_addr); 52 | int rc4ml_spdk_init(u_int32_t emb_width); 53 | static void alloc_qpair() ; 54 | inline std::pair getEmbedAddr(int32_t embed_id) ; 55 | void spdkmap(void * map_ptr,size_t pool_size,uint64_t phy_addr); 56 | void clear_wait_flag(); 57 | 58 | //* new function 59 | 60 | static void write_complete(void* arg, const struct spdk_nvme_cpl* completion); 61 | static int thread_runner3(int32_t dev_index); 62 | void task_submit_write(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 63 | void clear_wait_flag_write(); 64 | void cam_init(u_int32_t emb_width,uint32_t core_num); 65 | void* alloc_gpu(int64_t size); 66 | void free_gpu(void* p); 67 | void cam_clean_up(void); 68 | static int thread_runner_variablecore(int32_t thread_index); 69 | static int thread_runner_variablecore_write(int32_t thread_index); 70 | void seq_read_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 71 | void seq_write_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 72 | 73 | 74 | void cam_gemm_read(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 75 | void cam_gemm_write(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 76 | 77 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/device.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_DEVICE_CUH 2 | #define __AEOLUS_DEVICE_CUH 3 | 4 | #define _CUDA 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "log.cuh" 15 | #include "util.cuh" 16 | #include "ioctl.h" 17 | #include "queue.cuh" 18 | 19 | struct aeolus_dev_mem_context 20 | { 21 | void *ptr; 22 | void *ioaddrs; 23 | }; 24 | 25 | /** 26 | * @brief Abstraction of an SSD device. 27 | * 28 | */ 29 | class Device 30 | { 31 | private: 32 | void *admin_queue_ptr; 33 | uint64_t *admin_queue_phys_addr; 34 | 35 | public: 36 | int ssd_id; 37 | int device_fd; 38 | void *reg_ptr; 39 | uint32_t max_queue_num; 40 | uint64_t max_lb_num; 41 | uint32_t max_io_size; 42 | 43 | AdminQueuePair *admin_qp; 44 | uint32_t active_ns; 45 | 46 | std::vector free_qps; 47 | 48 | /** 49 | * @brief Construct a new Device object. 50 | * 51 | * @param ssd_id ID of the SSD, typically the number in /dev/libnvm* 52 | */ 53 | Device(int ssd_id); 54 | 55 | ~Device(); 56 | 57 | /** 58 | * @brief Allocate a pinned host memory buffer with physical address provided. 59 | * 60 | * @param ptr Buffer pointer to be allocated. 61 | * @param size Size of the buffer. 62 | * @param phys_addr A physical address list returned. Each entry of the list is a 4KB page. 63 | * @return Allocation result. 
Can be read by strerror. 64 | */ 65 | int alloc_host_memory(void **ptr, uint64_t size, uint64_t** phys_addr); 66 | 67 | /** 68 | * @brief Free a pinned host memory buffer. 69 | * 70 | * @param ptr Buffer pointer. 71 | * @param phys_addr Physical address list pointer. 72 | */ 73 | void free_host_memory(void *ptr, uint64_t* phys_addr); 74 | 75 | /** 76 | * @brief Allocate a pinned device memory buffer with physical address provided. The buffer will be 64KB-aligned. 77 | * 78 | * @param ptr Buffer pointer to be allocated. 79 | * @param context A context pointer which could be used for freeing the buffer. 80 | * @param size Size of the buffer. 81 | * @param phys_addr A physical address list returned. Each entry of the list is a 64KB page. 82 | * @return Allocation result. Can be read by strerror. 83 | */ 84 | int alloc_device_memory(void **ptr, aeolus_dev_mem_context** context, uint64_t size, uint64_t** phys_addr); 85 | 86 | /** 87 | * @brief Free a pinned device memory buffer. 88 | * 89 | * @param context Device buffer context. 90 | */ 91 | void free_device_memory(aeolus_dev_mem_context* context); 92 | }; 93 | 94 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/Makefile: -------------------------------------------------------------------------------- 1 | SPDK_DIR = ../../spdk 2 | 3 | CUDA_DIR = /usr/local/cuda-12.4 4 | OUTPUT_DIR = ../../build/lib 5 | 6 | 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | .PHONY:all 10 | 11 | exe: 12 | g++ --std=c++17 -mcmodel=medium CAM_interface.h CAM_interface.cpp -o $(OUTPUT_DIR)/libCAM_interface.so -shared -lcudart \ 13 | -L/usr/local/cuda/lib64 -L../../build/lib -lgpu_memory_management -lcudart -lcuda -L$(CUDA_DIR)/lib64 -I../../src/GPU_memory_lib \ 14 | -g -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -fno-strict-aliasing \ 15 | -I$(SPDK_DIR)/include -march=native -Wformat -Wformat-security -D_GNU_SOURCE -fPIC -fstack-protector -fno-common \ 16 | -I$(SPDK_DIR)/isa-l/.. -I$(SPDK_DIR)/isalbuild -I$(SPDK_DIR)/isa-l-crypto/.. 
-I$(CUDA_DIR)/include \ 17 | -I$(SPDK_DIR)/isalcryptobuild -DNDEBUG -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -DSPDK_GIT_COMMIT=6ee9cd257 -pthread -std=c++17 \ 18 | -Wl,-z,relro,-z,now -Wl,-z,noexecstack -fuse-ld=bfd -Wl,-rpath=$(SPDK_DIR)/isa-l/.libs -Wl,-rpath=$(SPDK_DIR)/isa-l-crypto/.libs \ 19 | -L$(SPDK_DIR)/build/lib -Wl,--whole-archive -Wl,--no-as-needed -lspdk_sock_posix -lspdk_nvme -lspdk_keyring -lspdk_sock -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_dma -lspdk_vmd -lspdk_util -lspdk_log \ 20 | -Wl,--no-whole-archive $(SPDK_DIR)/build/lib/libspdk_env_dpdk.so -Wl,--no-as-needed $(SPDK_DIR)/dpdk/build/lib/librte_bus_pci.so $(SPDK_DIR)/dpdk/build/lib/librte_cryptodev.so \ 21 | $(SPDK_DIR)/dpdk/build/lib/librte_dmadev.so $(SPDK_DIR)/dpdk/build/lib/librte_eal.so $(SPDK_DIR)/dpdk/build/lib/librte_ethdev.so \ 22 | $(SPDK_DIR)/dpdk/build/lib/librte_hash.so $(SPDK_DIR)/dpdk/build/lib/librte_kvargs.so $(SPDK_DIR)/dpdk/build/lib/librte_log.so $(SPDK_DIR)/dpdk/build/lib/librte_mbuf.so \ 23 | $(SPDK_DIR)/dpdk/build/lib/librte_mempool.so $(SPDK_DIR)/dpdk/build/lib/librte_mempool_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_net.so $(SPDK_DIR)/dpdk/build/lib/librte_pci.so \ 24 | $(SPDK_DIR)/dpdk/build/lib/librte_power.so $(SPDK_DIR)/dpdk/build/lib/librte_rcu.so $(SPDK_DIR)/dpdk/build/lib/librte_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_telemetry.so \ 25 | $(SPDK_DIR)/dpdk/build/lib/librte_vhost.so -Wl,-as-needed -Wl,-rpath=$(SPDK_DIR)/dpdk/build/lib -L$(SPDK_DIR)/isa-l/.libs -lisal -L$(SPDK_DIR)/isa-l-crypto/.libs -lisal_crypto -pthread -lrt -luuid -lssl -lcrypto -lm -lfuse3 -lkeyutils -laio 26 | 27 | 28 | nvcc -dc -Xcompiler -fPIC -I../../src/GPU_memory_lib -c gpu_transfer.cu -o $(OUTPUT_DIR)/gpu_transfer.o 29 | nvcc -dlink -I ../../src/CAM_lib -o $(OUTPUT_DIR)/gpu_transfer_link.o $(OUTPUT_DIR)/gpu_transfer.o 30 | nvcc -rdc=true sample_read.cu $(OUTPUT_DIR)/gpu_transfer.o -o $(OUTPUT_DIR)/sample_read -I../../src/GPU_memory_lib -L $(OUTPUT_DIR) -lCAM_interface 31 | nvcc -rdc=true sample_write.cu $(OUTPUT_DIR)/gpu_transfer.o -o $(OUTPUT_DIR)/sample_write -I../../src/GPU_memory_lib -L $(OUTPUT_DIR) -lCAM_interface 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/CAM_lib/sample_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "gpu_transfer.cuh" 12 | #include "CAM_interface.h" 13 | 14 | __inline__ uint64_t get_tscp(void) 15 | { 16 | uint32_t lo, hi; 17 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 18 | __asm__ __volatile__ ( 19 | "rdtscp" : "=a"(lo), "=d"(hi) 20 | ); 21 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 22 | } 23 | 24 | 25 | 26 | __global__ void myKernel( u_int64_t* d_data_dev,u_int64_t* d_data,uint64_t* dev_addr) { 27 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 28 | for(int i=0;(256*i+idx)<1000000UL;i++) 29 | d_data[256*i+idx]=d_data_dev[256*i+idx]; 30 | for(int i=0;i<10;i++){ 31 | prefetch(1000000UL,dev_addr); 32 | prefetch_syncronize(); 33 | } 34 | } 35 | 36 | 37 | int main(int argc, char** argv) { 38 | 39 | cam_init(4096); 40 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 41 | //launch_idle_kernel(); 42 | for (int64_t i = 0; i < 10000000UL; i++) { 43 | embed_id[i] = i; 44 | } 45 | std::random_shuffle(embed_id, embed_id + 10000000UL-1); 46 | 
u_int64_t* embed_id_dev; 47 | cudaMalloc(&embed_id_dev,10000000UL*sizeof(u_int64_t)); 48 | cudaMemcpy(embed_id_dev,embed_id,10000000UL*sizeof(u_int64_t),cudaMemcpyHostToDevice); 49 | cudaStream_t stream1,stream2; 50 | cudaError_t result; 51 | result = cudaStreamCreate(&stream1); 52 | result = cudaStreamCreate(&stream2); 53 | Init(4096,stream1); 54 | void* gem_memory = alloc_gpu(1000000UL*4096); 55 | u_int64_t* p_d = get_d_data(); 56 | std::thread th(polling_thread); 57 | double sum=0; 58 | uint64_t beg_tsc, end_tsc, middle_tsc; 59 | beg_tsc = get_tscp(); 60 | myKernel<<<1, 256,0,stream2>>>(embed_id_dev,p_d,(uint64_t*)gem_memory); 61 | cudaDeviceSynchronize(); 62 | end_tsc = get_tscp(); 63 | sum = 1.0*(end_tsc-beg_tsc)/ 2.2; 64 | printf("time cost : %lf ms\n",1.0*sum/1000000); 65 | std::cout<<"bandwidth: "<< 1000000UL*4096*10 / sum << "GB/s" < 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "gpu_transfer.cuh" 12 | #include "CAM_interface.h" 13 | 14 | __inline__ uint64_t get_tscp(void) 15 | { 16 | uint32_t lo, hi; 17 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 18 | __asm__ __volatile__ ( 19 | "rdtscp" : "=a"(lo), "=d"(hi) 20 | ); 21 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 22 | } 23 | 24 | 25 | 26 | __global__ void myKernel( u_int64_t* d_data_dev,u_int64_t* d_data,uint64_t* dev_addr) { 27 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 28 | for(int i=0;(256*i+idx)<1000000UL;i++) 29 | d_data[256*i+idx]=d_data_dev[256*i+idx]; 30 | for(int i=0;i<10;i++){ 31 | writeback(1000000UL,dev_addr); 32 | writeback_syncronize(); 33 | } 34 | } 35 | 36 | 37 | int main(int argc, char** argv) { 38 | 39 | cam_init(4096); 40 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 41 | //launch_idle_kernel(); 42 | for (int64_t i = 0; i < 10000000UL; i++) { 43 | embed_id[i] = i; 44 | } 45 | std::random_shuffle(embed_id, embed_id + 10000000UL-1); 46 | u_int64_t* embed_id_dev; 47 | cudaMalloc(&embed_id_dev,10000000UL*sizeof(u_int64_t)); 48 | cudaMemcpy(embed_id_dev,embed_id,10000000UL*sizeof(u_int64_t),cudaMemcpyHostToDevice); 49 | cudaStream_t stream1,stream2; 50 | cudaError_t result; 51 | result = cudaStreamCreate(&stream1); 52 | result = cudaStreamCreate(&stream2); 53 | Init(4096,stream1); 54 | void* gem_memory = alloc_gpu(1000000UL*4096); 55 | u_int64_t* p_d = get_d_data_write(); 56 | std::thread th(polling_thread_write); 57 | double sum=0; 58 | uint64_t beg_tsc, end_tsc, middle_tsc; 59 | beg_tsc = get_tscp(); 60 | myKernel<<<1, 256,0,stream2>>>(embed_id_dev,p_d,(uint64_t*)gem_memory); 61 | cudaDeviceSynchronize(); 62 | end_tsc = get_tscp(); 63 | sum = 1.0*(end_tsc-beg_tsc)/ 2.2; 64 | printf("time cost : %lf ms\n",1.0*sum/1000000); 65 | std::cout<<"bandwidth: "<< 1000000UL*4096*10 / sum << "GB/s" < 12 | // #include "cuda_runtime.h" 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "threadPool.h" 22 | 23 | #include "spdk/stdinc.h" 24 | 25 | #include "spdk/env.h" 26 | #include "spdk/log.h" 27 | #include "spdk/nvme.h" 28 | #include "spdk/nvme_zns.h" 29 | #include "spdk/string.h" 30 | #include "spdk/vmd.h" 31 | 32 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 33 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 34 | 35 | #define MAX_EMBED_NUM 1000000 36 | struct ctrlr_entry { 37 | struct spdk_nvme_ctrlr* ctrlr; 38 | char name[1024]; 39 | }; 40 | 
41 | struct ns_entry { 42 | int32_t id; 43 | struct spdk_nvme_ctrlr* ctrlr; 44 | struct spdk_nvme_ns* ns; 45 | struct spdk_nvme_qpair* qpair; 46 | }; 47 | 48 | 49 | 50 | 51 | 52 | static void read_complete(void* arg, const struct spdk_nvme_cpl* completion) ; 53 | //static int thread_runner(int32_t dev_index); 54 | static int thread_runner2(int32_t dev_index) ; 55 | static bool probe_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr_opts* opts); 56 | static void attach_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr* ctrlr, const struct spdk_nvme_ctrlr_opts* opts); 57 | static void register_ns(struct spdk_nvme_ctrlr* ctrlr, struct spdk_nvme_ns* ns) ; 58 | void task_submit(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 59 | // void task_submit(int64_t embed_num, int32_t *embed_id, void *dev_addr); 60 | int rc4ml_spdk_init(u_int32_t emb_width); 61 | static void alloc_qpair() ; 62 | inline std::pair getEmbedAddr(int32_t embed_id) ; 63 | void spdkmap(void * map_ptr,size_t pool_size,uint64_t phy_addr); 64 | void clear_wait_flag(); 65 | 66 | //* new function 67 | 68 | static void write_complete(void* arg, const struct spdk_nvme_cpl* completion); 69 | static int thread_runner3(int32_t dev_index); 70 | void task_submit_write(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 71 | void clear_wait_flag_write(); 72 | void cam_init(u_int32_t emb_width); 73 | void* alloc_gpu(int64_t size); 74 | void free_gpu(void* p); 75 | void cam_clean_up(void); 76 | 77 | void seq_read_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 78 | void seq_write_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 79 | 80 | 81 | void cam_gemm_read(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 82 | void cam_gemm_write(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 83 | 84 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/threadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_POOL_H 2 | #define THREAD_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | class ThreadPool { 15 | public: 16 | ThreadPool(size_t); 17 | template 18 | auto enqueue(F&& f, Args&&... args) 19 | -> std::future::type>; 20 | ~ThreadPool(); 21 | private: 22 | // need to keep track of threads so we can join them 23 | std::vector< std::thread > workers; 24 | // the task queue 25 | std::queue< std::function > tasks; 26 | 27 | // synchronization 28 | std::mutex queue_mutex; 29 | std::condition_variable condition; 30 | bool stop; 31 | }; 32 | 33 | // the constructor just launches some amount of workers 34 | inline ThreadPool::ThreadPool(size_t threads) 35 | : stop(false) 36 | { 37 | for(size_t i = 0;i task; 44 | 45 | { 46 | std::unique_lock lock(this->queue_mutex); 47 | this->condition.wait(lock, 48 | [this]{ return this->stop || !this->tasks.empty(); }); 49 | if(this->stop && this->tasks.empty()) 50 | return; 51 | task = std::move(this->tasks.front()); 52 | this->tasks.pop(); 53 | } 54 | 55 | task(); 56 | } 57 | } 58 | ); 59 | } 60 | 61 | // add new work item to the pool 62 | template 63 | auto ThreadPool::enqueue(F&& f, Args&&... 
args) 64 | -> std::future::type> 65 | { 66 | using return_type = typename std::result_of::type; 67 | 68 | auto task = std::make_shared< std::packaged_task >( 69 | std::bind(std::forward(f), std::forward(args)...) 70 | ); 71 | 72 | std::future res = task->get_future(); 73 | { 74 | std::unique_lock lock(queue_mutex); 75 | 76 | // don't allow enqueueing after stopping the pool 77 | if(stop) 78 | throw std::runtime_error("enqueue on stopped ThreadPool"); 79 | 80 | tasks.emplace([task](){ (*task)(); }); 81 | } 82 | condition.notify_one(); 83 | return res; 84 | } 85 | 86 | // the destructor joins all threads 87 | inline ThreadPool::~ThreadPool() 88 | { 89 | { 90 | std::unique_lock lock(queue_mutex); 91 | stop = true; 92 | } 93 | condition.notify_all(); 94 | for(std::thread &worker: workers) 95 | worker.join(); 96 | } 97 | 98 | #endif -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/threadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_POOL_H 2 | #define THREAD_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | class ThreadPool { 15 | public: 16 | ThreadPool(size_t); 17 | template 18 | auto enqueue(F&& f, Args&&... args) 19 | -> std::future::type>; 20 | ~ThreadPool(); 21 | private: 22 | // need to keep track of threads so we can join them 23 | std::vector< std::thread > workers; 24 | // the task queue 25 | std::queue< std::function > tasks; 26 | 27 | // synchronization 28 | std::mutex queue_mutex; 29 | std::condition_variable condition; 30 | bool stop; 31 | }; 32 | 33 | // the constructor just launches some amount of workers 34 | inline ThreadPool::ThreadPool(size_t threads) 35 | : stop(false) 36 | { 37 | for(size_t i = 0;i task; 44 | 45 | { 46 | std::unique_lock lock(this->queue_mutex); 47 | this->condition.wait(lock, 48 | [this]{ return this->stop || !this->tasks.empty(); }); 49 | if(this->stop && this->tasks.empty()) 50 | return; 51 | task = std::move(this->tasks.front()); 52 | this->tasks.pop(); 53 | } 54 | 55 | task(); 56 | } 57 | } 58 | ); 59 | } 60 | 61 | // add new work item to the pool 62 | template 63 | auto ThreadPool::enqueue(F&& f, Args&&... args) 64 | -> std::future::type> 65 | { 66 | using return_type = typename std::result_of::type; 67 | 68 | auto task = std::make_shared< std::packaged_task >( 69 | std::bind(std::forward(f), std::forward(args)...) 70 | ); 71 | 72 | std::future res = task->get_future(); 73 | { 74 | std::unique_lock lock(queue_mutex); 75 | 76 | // don't allow enqueueing after stopping the pool 77 | if(stop) 78 | throw std::runtime_error("enqueue on stopped ThreadPool"); 79 | 80 | tasks.emplace([task](){ (*task)(); }); 81 | } 82 | condition.notify_one(); 83 | return res; 84 | } 85 | 86 | // the destructor joins all threads 87 | inline ThreadPool::~ThreadPool() 88 | { 89 | { 90 | std::unique_lock lock(queue_mutex); 91 | stop = true; 92 | } 93 | condition.notify_all(); 94 | for(std::thread &worker: workers) 95 | worker.join(); 96 | } 97 | 98 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/log.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_LOG_CUH 2 | #define __AEOLUS_LOG_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define AEOLUS_LOG_DEBUG(...) 
AEOLUS_LOG(AEOLUS_LOG_LEVEL_DEBUG, __FILE__, __LINE__, __VA_ARGS__) 10 | #define AEOLUS_LOG_INFO(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_INFO, __FILE__, __LINE__, __VA_ARGS__) 11 | #define AEOLUS_LOG_WARNING(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_WARNING, __FILE__, __LINE__, __VA_ARGS__) 12 | #define AEOLUS_LOG_ERROR(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_ERROR, __FILE__, __LINE__, __VA_ARGS__) 13 | 14 | #ifndef __CUDA_ARCH__ 15 | #define AEOLUS_LOG(level, filename, lineno, ...) aeolus_log(level, filename, lineno, __VA_ARGS__); 16 | #else 17 | #define AEOLUS_LOG(level, filename, lineno, ...) \ 18 | printf("%s:%d ", filename, lineno); \ 19 | printf(__VA_ARGS__); 20 | #endif 21 | 22 | enum aeolus_log_level 23 | { 24 | AEOLUS_LOG_LEVEL_NULL = 0, 25 | AEOLUS_LOG_LEVEL_DEBUG = 1, 26 | AEOLUS_LOG_LEVEL_INFO = 2, 27 | AEOLUS_LOG_LEVEL_WARNING = 3, 28 | AEOLUS_LOG_LEVEL_ERROR = 4, 29 | }; 30 | 31 | static aeolus_log_level log_level = AEOLUS_LOG_LEVEL_NULL; 32 | 33 | inline aeolus_log_level get_log_level() 34 | { 35 | if (log_level != AEOLUS_LOG_LEVEL_NULL) 36 | { 37 | return log_level; 38 | } 39 | 40 | char *log_level_env = getenv("AEOLUS_LOG_LEVEL"); 41 | if (log_level_env == NULL) 42 | { 43 | #ifdef __DEBUG__ 44 | log_level = AEOLUS_LOG_LEVEL_INFO; 45 | #else 46 | log_level = AEOLUS_LOG_LEVEL_WARNING; 47 | #endif 48 | } else 49 | { 50 | if (strcmp(log_level_env, "DEBUG") == 0) 51 | { 52 | log_level = AEOLUS_LOG_LEVEL_DEBUG; 53 | } else if (strcmp(log_level_env, "INFO") == 0) 54 | { 55 | log_level = AEOLUS_LOG_LEVEL_INFO; 56 | } else if (strcmp(log_level_env, "WARNING") == 0) 57 | { 58 | log_level = AEOLUS_LOG_LEVEL_WARNING; 59 | } else if (strcmp(log_level_env, "ERROR") == 0) 60 | { 61 | log_level = AEOLUS_LOG_LEVEL_ERROR; 62 | } else 63 | { 64 | log_level = AEOLUS_LOG_LEVEL_INFO; 65 | } 66 | } 67 | 68 | return log_level; 69 | } 70 | 71 | __host__ inline void aeolus_log(aeolus_log_level level, const char *filename, int lineno, const char *format, ...) 72 | { 73 | if (level < get_log_level()) 74 | { 75 | return; 76 | } 77 | 78 | char *level_str; 79 | switch (level) 80 | { 81 | case AEOLUS_LOG_LEVEL_DEBUG: 82 | level_str = (char *)"DEBUG"; 83 | break; 84 | case AEOLUS_LOG_LEVEL_INFO: 85 | level_str = (char *)"INFO"; 86 | break; 87 | case AEOLUS_LOG_LEVEL_WARNING: 88 | level_str = (char *)"WARNING"; 89 | break; 90 | case AEOLUS_LOG_LEVEL_ERROR: 91 | level_str = (char *)"ERROR"; 92 | break; 93 | default: 94 | level_str = (char *)"UNKNOWN"; 95 | break; 96 | } 97 | 98 | va_list args; 99 | va_start(args, format); 100 | fprintf(stderr, "[%s] %s:%d ", level_str, filename, lineno); 101 | vfprintf(stderr, format, args); 102 | fprintf(stderr, "\n"); 103 | va_end(args); 104 | } 105 | 106 | #endif -------------------------------------------------------------------------------- /doc/EXP.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | **Important:** Please see [INSTALL.md](./INSTALL.md) for install dependencies and build CAM on a single machine. 4 | 5 | ## 1. CAM Throughput Microbenchmark 6 | 7 | This is the evalution for fig.8 in the paper. 8 | 9 | 10 | 11 | ### 1.1 Run the Random Read Benchmark 12 | 13 | 14 | ~~~bash 15 | cd build/benchmarks/CAM_benchmark 16 | sudo ./test_random_read 17 | ~~~ 18 | 19 | 20 | 21 | 22 | Firstly, the otput will be like this, which means SSDs are initialized. 
23 | ~~~ 24 | Initializing NVMe Controllers 25 | Attaching to 0000:50:00.0 26 | Attaching to 0000:57:00.0 27 | Attaching to 0000:66:00.0 28 | Attaching to 0000:68:00.0 29 | Attaching to 0000:9c:00.0 30 | Attaching to 0000:9d:00.0 31 | Attaching to 0000:e3:00.0 32 | Attaching to 0000:e4:00.0 33 | Attaching to 0000:e5:00.0 34 | attach_cb 35 | Attached to 0000:68:00.0 36 | Namespace ID: 1 size: 3840GB 37 | attach_cb 38 | Attached to 0000:66:00.0 39 | Namespace ID: 1 size: 3840GB 40 | attach_cb 41 | Attached to 0000:50:00.0 42 | Namespace ID: 1 size: 3840GB 43 | attach_cb 44 | Attached to 0000:57:00.0 45 | Namespace ID: 1 size: 3840GB 46 | attach_cb 47 | Attached to 0000:e5:00.0 48 | Namespace ID: 1 size: 3840GB 49 | attach_cb 50 | Attached to 0000:e4:00.0 51 | Namespace ID: 1 size: 3840GB 52 | attach_cb 53 | Attached to 0000:9c:00.0 54 | Namespace ID: 1 size: 3840GB 55 | attach_cb 56 | Attached to 0000:9d:00.0 57 | Namespace ID: 1 size: 3840GB 58 | attach_cb 59 | Attached to 0000:e3:00.0 60 | Namespace ID: 1 size: 3840GB 61 | Initialization complete. 62 | ~~~ 63 | 64 | 65 | 66 | After initialization, the output will be like this: 67 | 68 | ~~~bash 69 | time cost : 617.335267 ms 70 | bandwidth: 19.9049GB/s 71 | ~~~ 72 | 73 | ### 1.2 Run Other Microbenchmarks 74 | 75 | The random write, sequential read, and sequential write microbenchmarks produce output similar to the random read benchmark. 76 | ~~~bash 77 | cd build/benchmarks/CAM_benchmark 78 | sudo ./test_random_write 79 | sudo ./test_seq_read 80 | sudo ./test_seq_write 81 | ~~~ 82 | 83 | 84 | ## 2. CAM Throughput Microbenchmark Using One Thread to Manage Multiple SSDs 85 | 86 | This is the evaluation for Fig. 7 in the paper. 87 | 88 | ### 2.1 Run the Benchmark 89 | 90 | 91 | ~~~bash 92 | cd build/benchmarks/CAM_variable_core_benchmark 93 | sudo ./variable_core_test_read 94 | sudo ./variable_core_test_write 95 | ~~~ 96 | 97 | The output is similar to the `CAM Throughput Microbenchmark` above. 98 | 99 | 100 | 101 | ### 2.2 Run the Benchmark Using One Thread to Manage Different Numbers of SSDs 102 | 103 | 104 | 105 | To run with a different number of SSDs controlled by one thread, please edit `/src/benchmarks/CAM_variable_core_benchmark/variable_core_test_read.cu` and `/src/benchmarks/CAM_variable_core_benchmark/variable_core_test_write.cu`, change `thread_num`, and recompile the program. 106 | 107 | ## 3. Run CAM Sample Code 108 | 109 | ~~~bash 110 | cd build/lib 111 | sudo ./sample_read 112 | sudo ./sample_write 113 | ~~~ 114 | 115 | ## 4. Run GEMM with CAM 116 | 117 | ~~~bash 118 | sudo bash run_GEMM.sh 119 | ~~~ 120 | 121 | This script runs a series of tests using the gemm-test application located in the `build` directory. It iterates over a set of matrix sizes 'N', ranging from 32k to 256k in roughly even steps. For each matrix size 'N', the gemm-test command is executed three times. 122 | 123 | Each execution outputs the execution time and TFLOPS, as shown in the figure below. 124 | ![alt text](pic/GEMM_pic.png "GEMM output") 125 | 126 | ## 5. Extra Attention 127 | - The initialization operations are required after each reboot. 128 | 129 | If you have any questions, please contact us. 
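## 6. Quick Reference: End-to-End Evaluation Flow

As a quick reference, the steps above can be chained into one helper script. This is only a minimal sketch and is not a script shipped with the repository: it assumes it is run from the repository root, that SPDK and CAM have already been built as described in [INSTALL.md](./INSTALL.md), and that the hugepage count and SSD setup match your machine.

~~~bash
#!/bin/bash
# Hypothetical helper script chaining the documented steps; adjust the
# hugepage count and paths to your machine before using it.
set -e

# Allocate hugepages (see INSTALL.md).
sudo sh -c "echo 32768 > /proc/sys/vm/nr_hugepages"

# Bind the NVMe SSDs to the SPDK driver (see INSTALL.md).
(cd spdk/scripts && sudo ./setup.sh)

# Microbenchmarks (sections 1 and 2 above).
(cd build/benchmarks/CAM_benchmark && sudo ./test_random_read && sudo ./test_random_write && sudo ./test_seq_read && sudo ./test_seq_write)
(cd build/benchmarks/CAM_variable_core_benchmark && sudo ./variable_core_test_read && sudo ./variable_core_test_write)

# Sample code and the GEMM application (sections 3 and 4 above).
(cd build/lib && sudo ./sample_read && sudo ./sample_write)
sudo bash run_GEMM.sh

# Return the SSDs to the kernel NVMe driver after the experiments (see INSTALL.md).
(cd spdk/scripts && sudo ./setup.sh reset)
~~~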
-------------------------------------------------------------------------------- /src/CAM_lib/gpu_transfer.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TRANSFER_CUH__ 2 | #define __GPU_TRANSFER_CUH__ 3 | #include 4 | #include 5 | #include "CAM_interface.h" 6 | 7 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 8 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 9 | 10 | #define MAX_EMBED_NUM 1000000 11 | 12 | 13 | 14 | //* A semaphore for sending signals from the host to the device. 15 | class Host2DeviceSemaphore { 16 | private: 17 | void* InboundSemaphore; 18 | u_int64_t expectedInboundSemaphore; 19 | void* outboundSemaphore; 20 | u_int64_t* p_outboundSemaphoreValue; 21 | cudaStream_t stream; 22 | u_int64_t* total_num; 23 | public: 24 | Host2DeviceSemaphore(void){ 25 | cudaHostAlloc( (void**)&p_outboundSemaphoreValue,sizeof(u_int64_t),cudaHostAllocDefault ) ; 26 | expectedInboundSemaphore =1; 27 | cudaMalloc(&outboundSemaphore,sizeof(u_int64_t)); 28 | *p_outboundSemaphoreValue =0; 29 | 30 | } 31 | void ConnectToDeviceSemaphore(void* InboundSemaphore_,u_int64_t* total_num_ ) {InboundSemaphore=InboundSemaphore_;total_num=total_num_;} 32 | void* GetoutboundSemaphore(void) { return outboundSemaphore;} 33 | u_int64_t GetTotalNumber(void) { return *total_num;} 34 | void ConnectToStream(cudaStream_t stream1){ stream= stream1;} 35 | void signal() { 36 | //printf("signal\n"); 37 | (*p_outboundSemaphoreValue)+=1; 38 | cudaError_t return_value=cudaMemcpyAsync(outboundSemaphore, p_outboundSemaphoreValue, sizeof(u_int64_t), cudaMemcpyHostToDevice,stream); 39 | if (return_value != cudaSuccess) { 40 | std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorName(return_value) << " - " << cudaGetErrorString(return_value) << std::endl; 41 | // 处理错误 42 | } 43 | 44 | //printf("p_outboundSemaphoreValue: %ld\n",*(u_int64_t*)p_outboundSemaphoreValue); 45 | } 46 | 47 | void wait() { 48 | // printf("wait\n"); 49 | // printf("InboundSemaphore: %ld\n",*(u_int64_t*)InboundSemaphore); 50 | // printf("expectedInboundSemaphore: %ld\n",expectedInboundSemaphore); 51 | uint64_t start = 0; 52 | while((*(u_int64_t*)InboundSemaphore) < expectedInboundSemaphore){ 53 | start = 0; 54 | while (start++ < 100000); 55 | 56 | } 57 | // printf("wait end\n"); 58 | // printf("InboundSemaphore: %ld\n",*(u_int64_t*)InboundSemaphore); 59 | // printf("expectedInboundSemaphore: %ld\n",expectedInboundSemaphore); 60 | expectedInboundSemaphore ++; 61 | } 62 | }; 63 | 64 | // struct SmDevice2DeviceSemaphoreDeviceHandle { 65 | 66 | 67 | 68 | 69 | 70 | // MSCCLPP_DEVICE_INLINE void signal(uint64_t num) { 71 | // *total_num = num; 72 | // semaphoreIncrement(); 73 | // *outboundSemaphoreId = *outboundSemaphoreValue; 74 | // } 75 | 76 | // /// Increase the counter of the local semaphore. 77 | // MSCCLPP_DEVICE_INLINE void semaphoreIncrement() { *outboundSemaphoreValue += 1; } 78 | 79 | // /// Get the value of the local semaphore. 
80 | // MSCCLPP_DEVICE_INLINE uint64_t semaphoreGetLocal() const { return *outboundSemaphoreValue; } 81 | 82 | 83 | // }; 84 | 85 | 86 | 87 | 88 | void SemaphoreInit(cudaStream_t stream1); 89 | void Init(u_int32_t access_size,cudaStream_t stream1); 90 | extern "C" __global__ void init_myKernel(void); 91 | 92 | //* read functions 93 | void polling_thread(void); 94 | __device__ void prefetch(int64_t embed_num,uintptr_t *dev_addr); 95 | __device__ void prefetch_syncronize(void); 96 | uint64_t* get_d_data(void); 97 | //*wrtie functions 98 | void polling_thread_write(void); 99 | __device__ void writeback(int64_t embed_num,uintptr_t *dev_addr); 100 | __device__ void writeback_syncronize(void); 101 | uint64_t* get_d_data_write(void); 102 | 103 | void polling_thread_seq(void); 104 | void polling_thread_seq_write(void); 105 | __device__ void prefetch_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr); 106 | __device__ void writeback_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr); 107 | 108 | 109 | 110 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/gemm.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "lightbam.cuh" 6 | uint64_t parse_offset(char *str) 7 | { 8 | int len = strlen(str); 9 | char unit = ' '; 10 | if (!isdigit(str[len - 1])) 11 | { 12 | unit = str[len - 1]; 13 | str[len - 1] = '\0'; 14 | } 15 | uint64_t offset = atoll(str); 16 | if (unit == 'K' || unit == 'k') 17 | { 18 | offset = offset * 1024; 19 | } 20 | if (unit == 'M' || unit == 'm') 21 | { 22 | offset = offset * 1024 * 1024; 23 | } 24 | if (unit == 'G' || unit == 'g') 25 | { 26 | offset = offset * 1024 * 1024 * 1024; 27 | } 28 | if (unit == 'T' || unit == 't') 29 | { 30 | offset = offset * 1024 * 1024 * 1024 * 1024; 31 | } 32 | return offset; 33 | } 34 | 35 | class PinnedBuffer 36 | { 37 | private: 38 | void *iobuf; 39 | aeolus_dev_mem_context *iobuf_ctx; 40 | uint64_t *h_iobuf_phys; 41 | uint64_t *d_iobuf_phys; 42 | uint64_t *prp_list; 43 | uint64_t *h_prp_phys; 44 | uint64_t *d_prp_phys; 45 | uint64_t max_io_size; 46 | Device *dev; 47 | 48 | public: 49 | PinnedBuffer(Device* dev, uint64_t size, uint64_t max_io_size = 0) 50 | { 51 | int ret = dev->alloc_device_memory( 52 | &iobuf, &iobuf_ctx, size, &h_iobuf_phys 53 | ); 54 | if (ret != 0) 55 | { 56 | AEOLUS_LOG_ERROR("Failed to allocate device memory for IO buffer: %s", strerror(ret)); 57 | exit(-1); 58 | } 59 | size_t iobuf_phys_size = size / AEOLUS_DEVICE_PGSIZE * sizeof(uint64_t); 60 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_iobuf_phys, iobuf_phys_size)); 61 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_iobuf_phys, h_iobuf_phys, iobuf_phys_size, cudaMemcpyHostToDevice)); 62 | 63 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) 64 | { 65 | uint64_t prp_list_size = size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); 66 | AEOLUS_LOG_INFO("Allocating PRP buffer."); 67 | dev->alloc_host_memory((void **)&prp_list, prp_list_size, &h_prp_phys); 68 | 69 | // Fill in PRP table. 70 | for (int i = 0; i < size / AEOLUS_DEVICE_PGSIZE; i++) 71 | { 72 | for (int j = 0; j < AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE; j++) 73 | { 74 | if (i == 0 && j == 0) 75 | { 76 | continue; 77 | } 78 | prp_list[i * AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE + j - 1] = 79 | h_iobuf_phys[i] + j * AEOLUS_HOST_PGSIZE; 80 | } 81 | } 82 | 83 | // Move PRP physical address to GPU. 
84 | size_t prp_phys_size = CEIL(prp_list_size, AEOLUS_HOST_PGSIZE) * sizeof(uint64_t); 85 | AEOLUS_CUDA_CHECK(cudaMalloc((void **)&d_prp_phys, prp_phys_size)); 86 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_prp_phys, h_prp_phys, prp_phys_size, cudaMemcpyHostToDevice)); 87 | } 88 | this->max_io_size = max_io_size; 89 | this->dev = dev; 90 | } 91 | 92 | Request create_request(uint64_t offset, uint64_t start_lb = 0, int num_items = 0) 93 | { 94 | if (num_items == 0) 95 | num_items = max_io_size / AEOLUS_LB_SIZE; 96 | Request req(start_lb, num_items); 97 | req.dest_addr = h_iobuf_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 98 | req.next_addr = offset / max_io_size; 99 | if (max_io_size <= AEOLUS_HOST_PGSIZE * 2) 100 | { 101 | offset += AEOLUS_HOST_PGSIZE; 102 | req.next_addr = h_iobuf_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 103 | } 104 | return req; 105 | } 106 | 107 | uint64_t *get_iobuf_phys() 108 | { 109 | return h_iobuf_phys; 110 | } 111 | 112 | uint64_t *get_d_iobuf_phys() 113 | { 114 | return d_iobuf_phys; 115 | } 116 | 117 | uint64_t *get_d_prp_phys() 118 | { 119 | return d_prp_phys; 120 | } 121 | 122 | operator void *() 123 | { 124 | return iobuf; 125 | } 126 | 127 | ~PinnedBuffer() 128 | { 129 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) 130 | { 131 | dev->free_host_memory(prp_list, h_prp_phys); 132 | AEOLUS_CUDA_CHECK(cudaFree(d_prp_phys)); 133 | } 134 | dev->free_device_memory(iobuf_ctx); 135 | } 136 | }; -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_seq_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 300000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 
78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_seq_write -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_seq_write.cu 137 | */ -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_random_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 100000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_random_read -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_random_read.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_random_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 
#include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 100000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_random_write -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_random_write.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_seq_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 300000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t 
get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 123 | // printf("Time: %f\n", time_span.count()); 124 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 125 | } 126 | 127 | int main(int argc, char** argv) { 128 | 129 | cam_init(4096); 130 | run_task_function_test(); 131 | cam_clean_up(); 132 | 133 | return 0; 134 | } 135 | 136 | /* 137 | nvcc -o test_seq_read -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_seq_read.cu 138 | */ -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/variable_core_test_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_variable_core.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 1000000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = 
std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | int thread_num =2; 128 | cam_init(4096,thread_num); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o variable_core_test -I /home/szy/application/spdk_variable_core -L /home/szy/application/spdk_variable_core -lspdk_variable_core variable_core_test.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/variable_core_test_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_variable_core.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 1000000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | int thread_num =2; 128 | cam_init(4096,thread_num); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o variable_core_test_write -I /home/szy/application/spdk_variable_core -L /home/szy/application/spdk_variable_core -lspdk_variable_core variable_core_test_write.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/applications/gemm/include/queue.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_QUEUE_CUH 2 | #define 
__AEOLUS_QUEUE_CUH 3 | 4 | #include 5 | #include 6 | #include "util.cuh" 7 | #include "log.cuh" 8 | 9 | /** 10 | * @brief Abstraction of an SSD SQ-CQ pair. 11 | * 12 | */ 13 | class QueuePair 14 | { 15 | public: 16 | uint32_t *cmd_id_to_req_id; 17 | uint32_t *cmd_id_to_sq_pos; 18 | bool *sq_entry_busy; 19 | uint32_t sq_tail; 20 | uint32_t cq_head; 21 | uint32_t cmd_id; // also number of commands submitted 22 | uint32_t *sqtdbl, *cqhdbl; 23 | uint32_t num_completed; 24 | volatile uint32_t *sq; 25 | volatile uint32_t *cq; 26 | protected: 27 | uint32_t namespace_id; 28 | uint32_t queue_depth; 29 | 30 | public: 31 | inline __host__ __device__ QueuePair() 32 | { 33 | } 34 | 35 | inline __host__ __device__ QueuePair(volatile uint32_t *sq, volatile uint32_t *cq, uint32_t namespace_id, uint32_t *sqtdbl, uint32_t *cqhdbl, uint32_t queue_depth, uint32_t *cmd_id_to_req_id = nullptr, uint32_t *cmd_id_to_sq_pos = nullptr, bool *sq_entry_busy = nullptr) 36 | : sq(sq), cq(cq), sq_tail(0), cq_head(0), cmd_id(0), namespace_id(namespace_id), sqtdbl(sqtdbl), cqhdbl(cqhdbl), cmd_id_to_req_id(cmd_id_to_req_id), cmd_id_to_sq_pos(cmd_id_to_sq_pos), sq_entry_busy(sq_entry_busy), queue_depth(queue_depth), num_completed(0) 37 | { 38 | } 39 | 40 | __host__ __device__ void submit(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 41 | 42 | __device__ void submit_fence(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 43 | 44 | __host__ __device__ void fill_sq(uint32_t cid, uint32_t pos, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0, uint32_t req_id = 0xffffffff); 45 | 46 | __host__ __device__ void poll(uint32_t &code, uint32_t cid); 47 | 48 | __host__ __device__ void poll_with_dw0(uint32_t &code, uint32_t cid, uint32_t &dw0); 49 | }; 50 | 51 | /** 52 | * @brief Abstraction of an SSD IO queue pair. 53 | * 54 | */ 55 | class IoQueuePair : public QueuePair { 56 | public: 57 | inline __host__ IoQueuePair( 58 | volatile uint32_t *sq, 59 | volatile uint32_t *cq, 60 | uint32_t namespace_id, 61 | uint32_t *sqtdbl, 62 | uint32_t *cqhdbl, 63 | uint32_t queue_depth, 64 | uint32_t *cmd_id_to_req_id = nullptr, 65 | uint32_t *cmd_id_to_sq_pos = nullptr, 66 | bool *sq_entry_busy = nullptr 67 | ) : QueuePair( 68 | sq, cq, namespace_id, sqtdbl, cqhdbl, queue_depth, 69 | cmd_id_to_req_id, cmd_id_to_sq_pos, sq_entry_busy 70 | ) 71 | { 72 | // AEOLUS_LOG_INFO("IoQueuePair sqtdbl %p cqhdbl %p", sqtdbl, cqhdbl); 73 | } 74 | 75 | __device__ void poll_range(uint32_t &code, int expected_sq_head, bool should_break); 76 | __device__ void poll_multiple(uint32_t &code, int cnt); 77 | __device__ void poll_until_sq_entry_free(uint32_t &code, int expected_sq_pos); 78 | }; 79 | 80 | /** 81 | * @brief Abstraction of an SSD admin queue pair. 
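 * The admin queue is host-driven: it is used to configure the controller rather than to
 * move data, covering identify (controller and namespace), setting/querying the number of
 * I/O queues, and creating or deleting I/O submission/completion queues via the helpers
 * declared below.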
82 | * 83 | */ 84 | class AdminQueuePair : public QueuePair { 85 | public: 86 | inline __host__ AdminQueuePair( 87 | volatile uint32_t *sq, 88 | volatile uint32_t *cq, 89 | uint32_t namespace_id, 90 | uint32_t *sqtdbl, 91 | uint32_t *cqhdbl, 92 | uint32_t queue_depth, 93 | uint32_t *cmd_id_to_req_id = nullptr, 94 | uint32_t *cmd_id_to_sq_pos = nullptr, 95 | bool *sq_entry_busy = nullptr 96 | ) : QueuePair( 97 | sq, cq, namespace_id, sqtdbl, cqhdbl, queue_depth, 98 | cmd_id_to_req_id, cmd_id_to_sq_pos, sq_entry_busy 99 | ) 100 | { 101 | // AEOLUS_LOG_INFO("AdminQueuePair sqtdbl %p cqhdbl %p", sqtdbl, cqhdbl); 102 | } 103 | __host__ __device__ void submit_with_ns(uint32_t &cid, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 104 | __host__ __device__ void fill_sq_with_ns(uint32_t cid, uint32_t pos, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0, uint32_t req_id = 0xffffffff); 105 | __host__ uint32_t set_num_queues(uint16_t nsqr, uint16_t ncqr); 106 | __host__ uint32_t get_num_queues(uint16_t &nsqa, uint16_t &ncqa); 107 | __host__ uint32_t identify(uint8_t cns, uint16_t cntid, uint32_t nsid, uint64_t prp1); 108 | __host__ uint32_t create_cq_cont(uint16_t cqid, uint64_t cq_phys, uint16_t queue_depth); 109 | __host__ uint32_t create_sq_cont(uint16_t sqid, uint16_t cqid, uint64_t sq_phys, uint16_t queue_depth); 110 | __host__ uint32_t delete_sq(uint16_t sqid); 111 | __host__ uint32_t delete_cq(uint16_t cqid); 112 | }; 113 | 114 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_UTIL_CUH 2 | #define __AEOLUS_UTIL_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "cufile.h" 9 | 10 | // NVMe BAR0 register sizes and offsets. 11 | 12 | #define NVME_BAR0_SIZE 0x4000 13 | #define NVME_REG_CC 0x14 // addr: controller configuration 14 | #define NVME_REG_CC_EN 0x1 // mask: enable controller 15 | #define NVME_REG_CSTS 0x1c // addr: controller status 16 | #define NVME_REG_CSTS_RDY 0x1 // mask: controller ready 17 | #define NVME_REG_AQA 0x24 // addr: admin queue attributes 18 | #define NVME_REG_ASQ 0x28 // addr: admin submission queue base addr 19 | #define NVME_REG_ACQ 0x30 // addr: admin completion queue base addr 20 | #define NVME_REG_SQTDBL 0x1000 // addr: submission queue 0 tail doorbell 21 | #define NVME_REG_CQHDBL 0x1004 // addr: completion queue 0 sq_tail doorbell 22 | 23 | // NVMe admin opcode 24 | #define NVME_ADMIN_OPCODE_DELETE_SQ 0x00 25 | #define NVME_ADMIN_OPCODE_CREATE_SQ 0x01 26 | #define NVME_ADMIN_OPCODE_DELETE_CQ 0x04 27 | #define NVME_ADMIN_OPCODE_CREATE_CQ 0x05 28 | #define NVME_ADMIN_OPCODE_IDENTIFY 0x06 29 | #define NVME_ADMIN_OPCODE_SET_FEATURES 0x09 30 | #define NVME_ADMIN_OPCODE_GET_FEATURES 0x0a 31 | 32 | // NVMe opcode 33 | #define NVME_OPCODE_READ 0x02 34 | #define NVME_OPCODE_WRITE 0x01 35 | 36 | // NVMe feature ID. 37 | #define NVME_FEATURE_ID_NUM_QUEUES 0x07 38 | 39 | // NVMe field masks. 
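// Completion queue entries are 16 bytes (4 dwords): DW2 carries the SQ head pointer in its
// low 16 bits, and DW3 carries the command ID in bits 15:0, the phase tag in bit 16 (it
// flips on every pass through the queue), and the status field in bits 31:17 (status code
// in its low byte). The masks below extract these fields from the raw dwords read off the CQ.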
40 | #define NVME_ENTRY_PHASE_MASK 0x10000 41 | #define NVME_ENTRY_CID_MASK 0xffff // mask: command id 42 | #define NVME_ENTRY_SC_MASK 0xff // mask: status code 43 | #define NVME_ENTRY_SQ_HEAD_MASK 0xffff 44 | #define NVME_RW_LIMITED_RETRY_MASK 0x80000000 45 | 46 | // NVMe misc 47 | #define NVME_BROADCAST_NSID 0xffffffff 48 | #define NVME_SQ_ENTRY_SIZE 64 49 | #define NVME_CQ_ENTRY_SIZE 16 50 | #define NVME_DBLSTRIDE 8 51 | 52 | // Other constants. 53 | 54 | #define AEOLUS_HOST_PGSIZE 4096 55 | #define AEOLUS_DEVICE_PGSIZE 0x10000 56 | #define AEOLUS_ADMIN_QUEUE_DEPTH 64 57 | #define AEOLUS_WARP_SIZE 32 58 | #define AEOLUS_LB_SIZE 512 59 | #define AEOLUS_NUM_THREADS_PER_BLOCK 512 60 | #define AEOLUS_MAX_NUM_REQUESTS 4000000 61 | 62 | #define AEOLUS_MAX_NUM_QUEUES -1 63 | #define AEOLUS_MAX_DATA_TRANSFER -1 64 | 65 | // Check cuda errors. 66 | 67 | #define AEOLUS_CUDA_CHECK(ans) gpuAssert((ans), __FILE__, __LINE__) 68 | 69 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) 70 | { 71 | if (code != cudaSuccess) 72 | { 73 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 74 | if (abort) 75 | exit(1); 76 | } 77 | } 78 | 79 | inline bool isPowerOfTwo(int num) { 80 | return (num > 0) && ((num & (num - 1)) == 0); 81 | } 82 | 83 | inline uint64_t longrand(uint64_t max, uint64_t min = 0) { 84 | return min + (((unsigned long)rand() << 31 | rand()) % (max - min)); 85 | } 86 | 87 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 88 | #define MAX(a, b) ((b) < (a) ? (a) : (b)) 89 | #define CEIL(a, b) (((a)+(b)-1) / (b)) 90 | 91 | // 92 | // cuda driver error description 93 | // 94 | static inline const char *GetCuErrorString(CUresult curesult) { 95 | const char *descp; 96 | if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) 97 | descp = "unknown cuda error"; 98 | return descp; 99 | } 100 | 101 | // 102 | // cuFile APIs return both cuFile specific error codes as well as POSIX error codes 103 | // for ease, the below template can be used for getting the error description depending 104 | // on its type. 105 | 106 | // POSIX 107 | template::value, std::nullptr_t>::type = nullptr> 109 | std::string cuFileGetErrorString(T status) { 110 | status = std::abs(status); 111 | return IS_CUFILE_ERR(status) ? 
112 | std::string(CUFILE_ERRSTR(status)) : std::string(strerror(status)); 113 | } 114 | 115 | // CUfileError_t 116 | template::value, std::nullptr_t>::type = nullptr> 118 | std::string cuFileGetErrorString(T status) { 119 | std::string errStr = cuFileGetErrorString(static_cast(status.err)); 120 | if (IS_CUDA_ERR(status)) 121 | errStr.append(".").append(GetCuErrorString(status.cu_err)); 122 | return errStr; 123 | } 124 | 125 | #define AEOLUS_CUFILE_CHECK(ans) cufileAssert((ans), __FILE__, __LINE__) 126 | 127 | inline void cufileAssert(CUfileError_t status, const char *file, int line, bool abort = true) 128 | { 129 | if (status.err != CU_FILE_SUCCESS) 130 | { 131 | fprintf(stderr, "CUfileAssert: %s %s %d\n", cuFileGetErrorString(status).c_str(), file, line); 132 | if (abort) 133 | exit(1); 134 | } 135 | } 136 | 137 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/gemm/cam_gemm.cu: -------------------------------------------------------------------------------- 1 | #include "lightbam.cuh" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "gemm.cuh" 9 | #include 10 | #include "CAM_interface.h" 11 | 12 | typedef float fp_t; 13 | int main(int argc, char *argv[]) 14 | { 15 | if (argc != 10) 16 | { 17 | printf("Usage: %s m n k a_offset b_offset c_offset block_size max_io_size num_ssds\n", argv[0]); 18 | return 1; 19 | } 20 | int m = parse_offset(argv[1]); 21 | int n = parse_offset(argv[2]); 22 | int k = parse_offset(argv[3]); 23 | uint64_t a_offset = parse_offset(argv[4]); 24 | uint64_t b_offset = parse_offset(argv[5]); 25 | uint64_t c_offset = parse_offset(argv[6]); 26 | uint64_t block_size = parse_offset(argv[7]); 27 | uint64_t max_io_size = parse_offset(argv[8]); 28 | int num_ssds = atoi(argv[9]); 29 | if (m % block_size != 0 || n % block_size != 0) 30 | { 31 | std::cout<<"m and n must be a multiple of block_size"<= 0 && i < m_blocks) 82 | { 83 | //clear_wait_flag(); 84 | std::swap(a0, a1); 85 | } 86 | if (i + 1 < m_blocks) 87 | { 88 | for (int l = 0; l < num_reqs; l++) 89 | { 90 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 91 | h_reqs[l] = (a_offset + ((i + 1) * block_size * k + offset) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 92 | } 93 | cam_gemm_read(h_reqs,num_reqs,(uintptr_t)a1); 94 | clear_wait_flag(); 95 | } 96 | if (i - 1 >= 0) 97 | { 98 | // if (i - 2 >= 0) 99 | // { 100 | // clear_wait_flag_write(); 101 | // } 102 | std::swap(c0, c1); 103 | int num_reqs = CEIL(block_size * block_size * sizeof(fp_t), max_io_size); 104 | for (int l = 0; l < num_reqs; l++) 105 | { 106 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 107 | int row = (i - 1) * block_size + offset / block_size; 108 | int col = j * block_size + offset % block_size; 109 | h_reqs2[l] = (c_offset + (1ll * row * n + col) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 110 | } 111 | cam_gemm_write(h_reqs2,num_reqs,(uintptr_t)c1); 112 | clear_wait_flag_write(); 113 | } 114 | if (i == 0) 115 | { 116 | std::swap(b0, b1); 117 | } 118 | if (i >= 0 && i < m_blocks) 119 | { 120 | cudaEventRecord(gemm_start, 0); 121 | cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, block_size, block_size, k, &alpha, b0, CUDA_R_32F, block_size, a0, CUDA_R_32F, k, &beta, c0, CUDA_R_32F, block_size, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); 122 | cudaEventRecord(gemm_stop, 0); 123 | cudaEventSynchronize(gemm_stop); 124 | float ms; 125 | cudaEventElapsedTime(&ms, gemm_start, gemm_stop); 126 | gemm_ms += ms; 127 | } 128 | } 129 | // 
clear_wait_flag_write(); 130 | } 131 | cudaEventRecord(stop, 0); 132 | cudaEventSynchronize(stop); 133 | float ms; 134 | cudaEventElapsedTime(&ms, start, stop); 135 | printf("m = %d, n = %d, k = %d, block_size = %ld, time = %f ms, tflops = %f\n", m, n, k, block_size, ms, 2.0 * m * n * k / ms / 1e9); 136 | printf("gemm time = %f ms, num_ssds = %d, max_io_size = %ld, num_queues = %d\n", gemm_ms, num_ssds, max_io_size, num_queues_per_ssd); 137 | printf("%d %ld %d %ld %f %f %d\n", n, block_size, num_ssds, max_io_size, gemm_ms, ms, num_queues_per_ssd); 138 | cublasDestroy(handle); 139 | free_gpu(a0); 140 | free_gpu(a1); 141 | free_gpu(b0); 142 | free_gpu(b1); 143 | free_gpu(c0); 144 | free_gpu(c1); 145 | free(h_reqs); 146 | cam_clean_up(); 147 | return 0; 148 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/device.cu: -------------------------------------------------------------------------------- 1 | #include "device.cuh" 2 | 3 | Device::Device(int ssd_id) 4 | { 5 | 6 | // Open file and map BAR0 of SSD 7 | 8 | this->ssd_id = ssd_id; 9 | AEOLUS_LOG_INFO("Setting up device %d", ssd_id); 10 | char device_path[64]; 11 | sprintf(device_path, "/dev/libnvm%d", ssd_id); 12 | device_fd = open(device_path, O_RDWR); 13 | if (device_fd < 0) 14 | { 15 | AEOLUS_LOG_ERROR("Failed to open: %s", strerror(errno)); 16 | exit(1); 17 | } 18 | reg_ptr = mmap(NULL, NVME_BAR0_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, device_fd, 0); 19 | if (reg_ptr == MAP_FAILED) 20 | { 21 | AEOLUS_LOG_ERROR("Failed to mmap: %s\n", strerror(errno)); 22 | exit(1); 23 | } 24 | AEOLUS_CUDA_CHECK(cudaHostRegister(reg_ptr, NVME_BAR0_SIZE, cudaHostRegisterIoMemory)); 25 | 26 | // Reset controller. 27 | 28 | uint64_t reg_ptr_uint = (uint64_t)reg_ptr; 29 | *(uint32_t *)(reg_ptr_uint + NVME_REG_CC) &= ~NVME_REG_CC_EN; 30 | while (*(uint32_t volatile *)(reg_ptr_uint + NVME_REG_CSTS) & NVME_REG_CSTS_RDY) 31 | ; 32 | AEOLUS_LOG_INFO("Reset done."); 33 | 34 | // Set admin queue attributes. 35 | 36 | int ret = alloc_host_memory(&admin_queue_ptr, 2*AEOLUS_HOST_PGSIZE, &admin_queue_phys_addr); 37 | if (ret != 0) 38 | { 39 | AEOLUS_LOG_ERROR("Allocate admin queue memory failed: %s", strerror(ret)); 40 | exit(1); 41 | } 42 | 43 | uint64_t asq = (uint64_t)admin_queue_ptr; 44 | uint64_t acq = (uint64_t)admin_queue_ptr + AEOLUS_HOST_PGSIZE; 45 | *(uint32_t *)(reg_ptr_uint + NVME_REG_AQA) = ((AEOLUS_ADMIN_QUEUE_DEPTH - 1) << 16) | (AEOLUS_ADMIN_QUEUE_DEPTH - 1); 46 | *(uint64_t *)(reg_ptr_uint + NVME_REG_ASQ) = admin_queue_phys_addr[0]; 47 | *(uint64_t *)(reg_ptr_uint + NVME_REG_ACQ) = admin_queue_phys_addr[1]; 48 | // AEOLUS_LOG_INFO("Admin queue phy addr: 0x%lx, 0x%lx", admin_queue_phys_addr[0], admin_queue_phys_addr[1]); 49 | 50 | admin_qp = new AdminQueuePair( 51 | (volatile uint32_t *)asq, 52 | (volatile uint32_t *)acq, 53 | NVME_BROADCAST_NSID, 54 | (uint32_t *)(reg_ptr_uint + NVME_REG_SQTDBL), 55 | (uint32_t *)(reg_ptr_uint + NVME_REG_CQHDBL), 56 | AEOLUS_ADMIN_QUEUE_DEPTH 57 | ); 58 | AEOLUS_LOG_INFO("Set admin_qp queue attributes done."); 59 | 60 | // Enable controller. 61 | *(uint32_t *)(reg_ptr_uint + NVME_REG_CC) |= NVME_REG_CC_EN; 62 | while (!(*(uint32_t volatile *)(reg_ptr_uint + NVME_REG_CSTS) & NVME_REG_CSTS_RDY)) 63 | ; 64 | AEOLUS_LOG_INFO("Enable controller done."); 65 | 66 | // Set number of I/O queues. We will tentatively set a large number to the queue number 67 | // and then run the get-feature command so as to get the largest queue number supported. 
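    // Set Features with feature ID 0x07 (Number of Queues) takes zero-based requested
    // counts in CDW11 (NCQR in bits 31:16, NSQR in bits 15:0), so 0xfffe asks for the
    // 65535-queue maximum; the controller reports how many it actually allocated in
    // completion dword 0, which the matching Get Features call below reads back.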
68 | 69 | uint32_t status = admin_qp->set_num_queues(0xfffe, 0xfffe); // Maximum queue pairs supported by NVMe. 70 | if (status != 0) 71 | { 72 | AEOLUS_LOG_ERROR("Set number of queues failed with status 0x%x", status); 73 | exit(1); 74 | } 75 | AEOLUS_LOG_INFO("Set number of queues done."); 76 | 77 | uint16_t max_sq_num, max_cq_num; 78 | status = admin_qp->get_num_queues(max_sq_num, max_cq_num); 79 | if (status != 0) 80 | { 81 | AEOLUS_LOG_ERROR("Get number of queues failed with status 0x%x", status); 82 | exit(1); 83 | } 84 | max_queue_num = MIN(max_sq_num, max_cq_num); 85 | AEOLUS_LOG_INFO("Maximum queue number supported: %d.", max_queue_num); 86 | 87 | // Decide the namespace to use. The namespace with the lowest number will be chosen. 88 | 89 | void *temp_buffer; 90 | uint64_t *temp_buffer_phys_addr; 91 | alloc_host_memory(&temp_buffer, AEOLUS_HOST_PGSIZE, &temp_buffer_phys_addr); 92 | status = admin_qp->identify(0x02, 0x0, 0, temp_buffer_phys_addr[0]); 93 | if (status != 0) 94 | { 95 | AEOLUS_LOG_ERROR("Get namespace list failed with status 0x%x", status); 96 | exit(1); 97 | } 98 | active_ns = *((uint32_t *)temp_buffer); 99 | 100 | // Get device capacity. 101 | 102 | status = admin_qp->identify(0x00, 0x0, active_ns, temp_buffer_phys_addr[0]); 103 | if (status != 0) 104 | { 105 | AEOLUS_LOG_ERROR("Get namespace structure 0x%x", status); 106 | exit(1); 107 | } 108 | max_lb_num = *((uint64_t *)temp_buffer); 109 | AEOLUS_LOG_INFO("Active ns: %d, Maximum logical block number supported: %lu.", active_ns, max_lb_num); 110 | 111 | // Get maximum IO size. 112 | 113 | status = admin_qp->identify(0x01, 0x0, 0x0, temp_buffer_phys_addr[0]); 114 | if (status != 0) 115 | { 116 | AEOLUS_LOG_ERROR("Get controller structure failed with status 0x%x", status); 117 | exit(1); 118 | } 119 | max_io_size = *((uint8_t *)((uint64_t)temp_buffer + 77)); 120 | max_io_size = AEOLUS_HOST_PGSIZE * (1 << max_io_size); 121 | AEOLUS_LOG_INFO("Maximum IO size supported: %d B.", max_io_size); 122 | free_host_memory(temp_buffer, temp_buffer_phys_addr); 123 | 124 | // Get free queue pair IDs. 
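    // I/O queue pair IDs from 1 up to the maximum reported above are treated as free and
    // handed out later when I/O queues are created (ID 0 is reserved for the admin queue pair).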
125 | for (int i=1; iptr), size + AEOLUS_DEVICE_PGSIZE)); 166 | *ptr = (void *)((uint64_t)((*context)->ptr) / AEOLUS_DEVICE_PGSIZE * AEOLUS_DEVICE_PGSIZE + AEOLUS_DEVICE_PGSIZE); 167 | int flag = 0; 168 | if ((uint64_t)*ptr != (uint64_t)((*context)->ptr)) 169 | { 170 | flag = 1; 171 | } 172 | (*context)->ioaddrs = malloc(sizeof(uint64_t) * (size / AEOLUS_DEVICE_PGSIZE + flag)); 173 | *phys_addr = (uint64_t*)(*context)->ioaddrs; 174 | nvm_ioctl_map req; 175 | req.vaddr_start = (uint64_t)((*context)->ptr); 176 | req.n_pages = size / AEOLUS_DEVICE_PGSIZE + flag; 177 | req.ioaddrs = *phys_addr; 178 | *phys_addr += flag; 179 | 180 | return ioctl(device_fd, NVM_MAP_DEVICE_MEMORY, &req); 181 | } 182 | 183 | void Device::free_device_memory(aeolus_dev_mem_context* context) 184 | { 185 | ioctl(device_fd, NVM_UNMAP_MEMORY, (uint64_t)(context->ptr)); 186 | free(context->ioaddrs); 187 | AEOLUS_CUDA_CHECK(cudaFree(context->ptr)); 188 | free(context); 189 | } -------------------------------------------------------------------------------- /src/applications/gemm/gemm/spdk_gemm.cu: -------------------------------------------------------------------------------- 1 | #include "lightbam.cuh" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "gemm.cuh" 9 | #include 10 | #include "spdk_read.h" 11 | 12 | typedef float fp_t; 13 | int main(int argc, char *argv[]) 14 | { 15 | if (argc != 10) 16 | { 17 | printf("Usage: %s m n k a_offset b_offset c_offset block_size max_io_size num_ssds\n", argv[0]); 18 | return 1; 19 | } 20 | int m = parse_offset(argv[1]); 21 | int n = parse_offset(argv[2]); 22 | int k = parse_offset(argv[3]); 23 | uint64_t a_offset = parse_offset(argv[4]); 24 | uint64_t b_offset = parse_offset(argv[5]); 25 | uint64_t c_offset = parse_offset(argv[6]); 26 | uint64_t block_size = parse_offset(argv[7]); 27 | uint64_t max_io_size = parse_offset(argv[8]); 28 | int num_ssds = atoi(argv[9]); 29 | if (m % block_size != 0 || n % block_size != 0) 30 | { 31 | std::cout<<"m and n must be a multiple of block_size"<= 0 && i < m_blocks) 98 | { 99 | // clear_wait_flag(); 100 | cudaMemcpyAsync(a0, a1, block_size * k * sizeof(fp_t), cudaMemcpyHostToDevice, streama); 101 | //std::swap(a0, a1); 102 | } 103 | if (i < m_blocks -1) //read phase 104 | { 105 | for (int l = 0; l < num_reqs; l++) 106 | { 107 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 108 | h_reqs[l] = (a_offset + ((i + 1) * block_size * k + offset) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 109 | } 110 | cam_gemm_read(h_reqs,num_reqs,(uintptr_t)a1); 111 | clear_wait_flag(); 112 | } 113 | if(i>=2 && i<= m_blocks+1){ 114 | // std::swap(c0, c1); 115 | cudaMemcpyAsync(c0, c1, block_size * block_size * sizeof(fp_t), cudaMemcpyHostToDevice, streamc); 116 | } 117 | if (i >= 3) //write phase 118 | { 119 | // if (i >= 4) 120 | // { 121 | // clear_wait_flag_write(); 122 | // } 123 | 124 | int num_reqs = CEIL(block_size * block_size * sizeof(fp_t), max_io_size); 125 | for (int l = 0; l < num_reqs; l++) 126 | { 127 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 128 | int row = (i - 1) * block_size + offset / block_size; 129 | int col = j * block_size + offset % block_size; 130 | h_reqs2[l] = (c_offset + (1ll * row * n + col) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 131 | // h_reqs2[l] = l; 132 | } 133 | cudaStreamSynchronize(streamc); 134 | cam_gemm_write(h_reqs2,num_reqs,(uintptr_t)c1); 135 | clear_wait_flag_write(); 136 | 137 | } 138 | 139 | if (i >= 1 && i <= m_blocks) //gemm compute phase 140 | { 141 | 
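                // Compute phase of the software pipeline: wait for the asynchronous
                // host-to-device copies of the A and B tiles on streama/streamb to finish,
                // then run the tile GEMM with cuBLAS, so SSD reads, H2D copies, write-back
                // and compute from different iterations overlap via the a0/a1, b0/b1, c0/c1
                // double buffers.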
cudaEventRecord(gemm_start, 0); 142 | cudaStreamSynchronize(streama); 143 | cudaStreamSynchronize(streamb); 144 | cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, block_size, block_size, k, &alpha, b0, CUDA_R_32F, block_size, a0, CUDA_R_32F, k, &beta, c0, CUDA_R_32F, block_size, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); 145 | cudaEventRecord(gemm_stop, 0); 146 | cudaEventSynchronize(gemm_stop); 147 | float ms; 148 | cudaEventElapsedTime(&ms, gemm_start, gemm_stop); 149 | gemm_ms += ms; 150 | } 151 | } 152 | // clear_wait_flag_write(); 153 | } 154 | cudaEventRecord(stop, 0); 155 | cudaEventSynchronize(stop); 156 | float ms; 157 | cudaEventElapsedTime(&ms, start, stop); 158 | printf("m = %d, n = %d, k = %d, block_size = %ld, time = %f ms, tflops = %f\n", m, n, k, block_size, ms, 2.0 * m * n * k / ms / 1e9); 159 | printf("gemm time = %f ms, num_ssds = %d, max_io_size = %ld, num_queues = %d\n", gemm_ms, num_ssds, max_io_size, num_queues_per_ssd); 160 | printf("%d %ld %d %ld %f %f %d\n", n, block_size, num_ssds, max_io_size, gemm_ms, ms, num_queues_per_ssd); 161 | cublasDestroy(handle); 162 | free_pinmemory(a1); 163 | free_pinmemory(b1); 164 | free_pinmemory(c1); 165 | cudaFree(a0); 166 | cudaFree(b0); 167 | cudaFree(c0); 168 | cudaStreamDestroy(streama); 169 | cudaStreamDestroy(streamb); 170 | cudaStreamDestroy(streamc); 171 | free(h_reqs); 172 | cam_clean_up(); 173 | return 0; 174 | } -------------------------------------------------------------------------------- /src/applications/gemm/include/controller.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_CONTROLLER_CUH 2 | #define __AEOLUS_CONTROLLER_CUH 3 | 4 | #include 5 | #include "device.cuh" 6 | #include "request.cuh" 7 | 8 | enum aeolus_access_type 9 | { 10 | AEOLUS_ACCESS_SEQUENTIAL = 0, 11 | AEOLUS_ACCESS_RANDOM = 1 12 | }; 13 | 14 | enum aeolus_dist_type 15 | { 16 | AEOLUS_DIST_STRIPE = 0, 17 | AEOLUS_DIST_REPLICA = 1 18 | }; 19 | 20 | enum aeolus_buf_type 21 | { 22 | AEOLUS_BUF_USER = 0, 23 | AEOLUS_BUF_PINNED = 1 24 | }; 25 | 26 | /** 27 | * @brief A controller manages multiple IO queues of multiple SSDs and provides a simple interface for user. 28 | * 29 | */ 30 | class Controller 31 | { 32 | protected: 33 | std::vectorssd_list; 34 | int ssd_count; 35 | int gpu_id; 36 | int32_t max_io_size; 37 | int32_t num_queue_per_ssd; 38 | int32_t queue_depth; 39 | aeolus_dist_type dist_type; 40 | aeolus_buf_type buf_type; 41 | 42 | int32_t max_queue_num; 43 | int32_t max_trans_size; 44 | 45 | int32_t **qpid_list; 46 | IoQueuePair *d_ssdqp; 47 | void *d_iobuf_ptr; 48 | uint64_t *d_iobuf_phys; 49 | uint64_t *prp_list; 50 | uint64_t *h_prp_phys; 51 | uint64_t *d_prp_phys; 52 | uint64_t *h_ssd_num_lbs; 53 | uint64_t *d_ssd_num_lbs; 54 | 55 | aeolus_dev_mem_context *qp_ctx; 56 | aeolus_dev_mem_context *iobuf_ctx; 57 | 58 | int* ssd_num_reqs; 59 | Request *distributed_reqs; 60 | int *req_ids; 61 | public: 62 | uint64_t max_lb_number; 63 | uint64_t *qp_phys; 64 | 65 | /** 66 | * @brief Construct a new Controller object. A controller manages multiple IO queues of multiple SSDs and provides a simple interface for user. 67 | * 68 | * @param ssd_list List of SSD devices to be managed by the controller. 69 | * @param num_queue_per_ssd Number of IO queues allocated to each SSD. 70 | * @param max_io_size Maximum IO size in bytes in a single NVMe command. 71 | * @param queue_depth Depth of each IO queue. 72 | * @param dist_type Pattern for data distribution. 
AEOLUS_DIST_STRIPE means data is striped across SSDs, AEOLUS_DIST_REPLICA means data is replicated across SSDs. 73 | * @param buf_type Type of data buffer for IO. AEOLUS_BUF_USER means the buffer can be arbitrary buffer provided by user, AEOLUS_BUF_PINNED means the buffer is pinned by user beforehand. 74 | * @param pinned_buf_phys Physical addresses of the pinned buffer. Only valid when buf_type is AEOLUS_BUF_PINNED. 75 | * @param pinned_buf_size Size of the pinned buffer. Only valid when buf_type is AEOLUS_BUF_PINNED. 76 | */ 77 | Controller( 78 | std::vector ssd_list, 79 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 80 | int32_t max_io_size = 4096, 81 | int32_t queue_depth = 4096, 82 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 83 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 84 | uint64_t *pinned_buf_phys = nullptr, 85 | uint64_t pinned_buf_size = 0 86 | ); 87 | 88 | /** 89 | * @brief Construct a new Controller object. This interface hides the details of queue depth and IO size and provides pre-defined configurations to user. 90 | * 91 | * @param ssd_list List of SSD devices to be managed by the controller. 92 | * @param access_type Preset of IO pattern. AEOLUS_ACCESS_SEQUENTIAL means the user prefers sequential access, AEOLUS_ACCESS_RANDOM means random access. 93 | * @param dist_type Pattern for data distribution. AEOLUS_DIST_STRIPE means data is striped across SSDs, AEOLUS_DIST_REPLICA means data is replicated across SSDs. 94 | * @param buf_type Type of data buffer for IO. AEOLUS_BUF_USER means the buffer can be arbitrary buffer provided by user, AEOLUS_BUF_PINNED means the buffer is pinned by user beforehand. 95 | */ 96 | inline Controller( 97 | std::vector ssd_list, 98 | aeolus_access_type access_type = AEOLUS_ACCESS_SEQUENTIAL, 99 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 100 | aeolus_buf_type buf_type = AEOLUS_BUF_USER 101 | ) : Controller( 102 | ssd_list, 103 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? 8 : AEOLUS_MAX_NUM_QUEUES, 104 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? AEOLUS_MAX_DATA_TRANSFER : 4096, 105 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? 256 : 4096, 106 | dist_type, 107 | buf_type 108 | ) 109 | {}; 110 | 111 | ~Controller(); 112 | void read_data(uint64_t start_lb, uint64_t num_lb, void *buf); 113 | void write_data(uint64_t start_lb, uint64_t num_lb, void *buf); 114 | IoQueuePair *get_io_queue_pair() { return d_ssdqp; } 115 | 116 | /** 117 | * @brief Submit a batch of IO requests to the controller and process them in a helper thread 118 | * until completion. 119 | * 120 | * @param req List of IO requests. 121 | * @param num_req Number of IO requests. 122 | * @param dir Direction of requests. All requests must have the same direction. 123 | * @param stream cuda stream used. 124 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 125 | */ 126 | virtual void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t *d_prp_phys = nullptr) = 0; 127 | 128 | private: 129 | void lb_to_ssd_id(uint64_t lb, int &ssd_id, uint64_t &local_lb); 130 | }; 131 | 132 | /** 133 | * @brief A controller instance with helper thread-based IO processing functions. 134 | * requests are processed in multiple batches. 
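 * Unlike ControllerDecoupled below, submit_io_req() here drives each batch to completion
 * before returning, so no separate poll() call is needed.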
135 | * 136 | */ 137 | class ControllerLegacy : public Controller 138 | { 139 | public: 140 | inline ControllerLegacy( 141 | std::vector ssd_list, 142 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 143 | int32_t max_io_size = 4096, 144 | int32_t queue_depth = 4096, 145 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 146 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 147 | uint64_t *pinned_buf_phys = nullptr, 148 | uint64_t pinned_buf_size = 0 149 | ) : Controller( 150 | ssd_list, 151 | num_queue_per_ssd, 152 | max_io_size, 153 | queue_depth, 154 | dist_type, 155 | buf_type, 156 | pinned_buf_phys, 157 | pinned_buf_size 158 | ) 159 | {}; 160 | 161 | /** 162 | * @brief Submit a batch of IO requests to the controller and process them in a helper thread 163 | * until completion. 164 | * 165 | * @param req List of IO requests. 166 | * @param num_req Number of IO requests. 167 | * @param dir Direction of requests. All requests must have the same direction. 168 | * @param stream cuda stream used. 169 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 170 | */ 171 | void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t *d_prp_phys = nullptr) override; 172 | }; 173 | 174 | /** 175 | * @brief A controller instance with submit-poll processing interface. 176 | * @warning Each instance can only process one batch of request at one time, 177 | * and that the number of requests in a batch should not exceed the queue number times queue depth. 178 | * 179 | */ 180 | class ControllerDecoupled : public Controller 181 | { 182 | private: 183 | int* ssd_num_reqs_prefix_sum; 184 | int num_reqs; 185 | cudaStream_t stream; 186 | aeolus_access_dir dir; 187 | public: 188 | inline ControllerDecoupled( 189 | std::vector ssd_list, 190 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 191 | int32_t max_io_size = 4096, 192 | int32_t queue_depth = 4096, 193 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 194 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 195 | uint64_t *pinned_buf_phys = nullptr, 196 | uint64_t pinned_buf_size = 0 197 | ) : Controller( 198 | ssd_list, 199 | num_queue_per_ssd, 200 | max_io_size, 201 | queue_depth, 202 | dist_type, 203 | buf_type, 204 | pinned_buf_phys, 205 | pinned_buf_size 206 | ) 207 | { 208 | AEOLUS_CUDA_CHECK(cudaMalloc(&ssd_num_reqs_prefix_sum, ssd_count * sizeof(int))); 209 | } 210 | 211 | /** 212 | * @brief Submit a batch of IO requests to the NVMe SSDs. User needs to ensure the completion 213 | * of the requests by `poll()` function. 214 | * 215 | * @param req List of IO requests. 216 | * @param num_req Number of IO requests. 217 | * @param dir Direction of requests. All requests must have the same direction. 218 | * @param stream cuda stream used. 219 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 220 | * 221 | * @warning User must make sure that number of requests is no greater than the queue number times queue depth. 222 | */ 223 | void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys = nullptr) override; 224 | 225 | /** 226 | * @brief Poll the in-flight requests until completion. 227 | * 228 | * @warning User must not submit new requests before the completion of the previous batch. 
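     *
     * Minimal usage sketch (illustrative only; `ssds`, `reqs`, `n`, `dir` and `stream` are
     * caller-provided placeholders, not part of this header):
     * @code
     *   ControllerDecoupled ctrl(ssds);              // ssds: std::vector<Device*>
     *   ctrl.submit_io_req(reqs, n, dir, stream);    // dir: an aeolus_access_dir value
     *   ctrl.poll();                                 // block until the whole batch completes
     * @endcode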
229 | */ 230 | void poll(); 231 | }; 232 | 233 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/gpu_transfer.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_transfer.cuh" 2 | 3 | //static struct SmDevice2DeviceSemaphoreDeviceHandle d_sm; 4 | u_int64_t read_block_num = 1000000UL; 5 | //* read arguments 6 | static Host2DeviceSemaphore h_sm; 7 | static u_int64_t *h_data; 8 | __device__ static u_int64_t *d_data; 9 | std::vector a; 10 | 11 | static u_int64_t *h_submit_info; 12 | __device__ static u_int64_t *d_submit_info; 13 | 14 | 15 | //* D2H Semaphore arguments 16 | __device__ uint64_t* D2H_inboundSemaphoreId; 17 | __device__ uint64_t* D2H_expectedInboundSemaphore; 18 | __device__ uint64_t* D2H_outboundSemaphoreId; 19 | __device__ uint64_t* D2H_outboundSemaphoreValue; 20 | __device__ uint64_t* D2H_total_num; 21 | 22 | //* write arguments 23 | static Host2DeviceSemaphore h_sm_2; 24 | static u_int64_t *h_data_2; 25 | __device__ static u_int64_t *d_data_2; 26 | std::vector a_2; 27 | 28 | static u_int64_t *h_submit_info_2; 29 | __device__ static u_int64_t *d_submit_info_2; 30 | 31 | 32 | //* D2H Semaphore arguments 33 | __device__ uint64_t* D2H_inboundSemaphoreId_2; 34 | __device__ uint64_t* D2H_expectedInboundSemaphore_2; 35 | __device__ uint64_t* D2H_outboundSemaphoreId_2; 36 | __device__ uint64_t* D2H_outboundSemaphoreValue_2; 37 | __device__ uint64_t* D2H_total_num_2; 38 | 39 | void SemaphoreInit(cudaStream_t stream1) 40 | { 41 | void* tmp; 42 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 43 | cudaMemcpyToSymbol(D2H_outboundSemaphoreValue, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 44 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 45 | cudaMemcpyToSymbol(D2H_expectedInboundSemaphore, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 46 | void *tmp_h_data; 47 | cudaHostAlloc((void**)&tmp_h_data, sizeof(u_int64_t), cudaHostAllocMapped); 48 | cudaHostGetDevicePointer(&(tmp), (u_int64_t *)tmp_h_data, 0); 49 | cudaMemcpyToSymbol(D2H_outboundSemaphoreId, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 50 | 51 | void *tmp_num; 52 | cudaHostAlloc((void**)&tmp_num, sizeof(u_int64_t), cudaHostAllocMapped); 53 | cudaHostGetDevicePointer((void**)&(D2H_total_num), tmp_num, 0); 54 | 55 | 56 | h_sm.ConnectToStream(stream1); 57 | h_sm.ConnectToDeviceSemaphore(tmp_h_data,(u_int64_t *)tmp_num); 58 | tmp = (uint64_t*)h_sm.GetoutboundSemaphore(); 59 | cudaMemcpyToSymbol(D2H_inboundSemaphoreId, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 60 | 61 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 62 | cudaMemcpyToSymbol(D2H_outboundSemaphoreValue_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 63 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 64 | cudaMemcpyToSymbol(D2H_expectedInboundSemaphore_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 65 | void *tmp_h_data_2; 66 | cudaHostAlloc((void**)&tmp_h_data_2, sizeof(u_int64_t), cudaHostAllocMapped); 67 | cudaHostGetDevicePointer(&(tmp), (u_int64_t *)tmp_h_data_2, 0); 68 | cudaMemcpyToSymbol(D2H_outboundSemaphoreId_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 69 | 70 | 71 | cudaHostAlloc((void**)&tmp_num, sizeof(u_int64_t), cudaHostAllocMapped); 72 | cudaHostGetDevicePointer((void**)&(D2H_total_num_2), tmp_num, 0); 73 | 74 | 75 | h_sm_2.ConnectToStream(stream1); 76 | h_sm_2.ConnectToDeviceSemaphore(tmp_h_data_2,(u_int64_t *)tmp_num); 77 | tmp = (uint64_t*)h_sm_2.GetoutboundSemaphore(); 78 | cudaMemcpyToSymbol(D2H_inboundSemaphoreId_2, &(tmp), 
sizeof(uint64_t),0, cudaMemcpyHostToDevice); 79 | } 80 | 81 | __device__ void prefetch(int64_t embed_num,uintptr_t *dev_addr) 82 | { 83 | __syncthreads(); 84 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 85 | { 86 | 87 | d_submit_info[0] = 1; 88 | d_submit_info[1] = embed_num; 89 | d_submit_info[2] = (uint64_t)dev_addr; 90 | 91 | *D2H_outboundSemaphoreValue += 1; 92 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 93 | *D2H_outboundSemaphoreId = *D2H_outboundSemaphoreValue; 94 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 95 | } 96 | } 97 | 98 | __device__ void prefetch_syncronize(void) 99 | { 100 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 101 | { 102 | //printf("leadind thread wait\n"); 103 | (*D2H_expectedInboundSemaphore) += 1; 104 | uint64_t value; 105 | uint64_t value2= (*D2H_expectedInboundSemaphore); 106 | //printf("value2: %ld\n",value2); 107 | while(true){ 108 | value= atomicMin((unsigned long long int*)D2H_inboundSemaphoreId,(unsigned long long int)value2); 109 | if(value >= value2) 110 | break; 111 | uint64_t start = 0; 112 | while (start++ < 100000); 113 | } 114 | 115 | //printf("leadind thread wait done\n"); 116 | } 117 | __syncthreads(); 118 | } 119 | 120 | 121 | void polling_thread(void) 122 | { 123 | while(1){ 124 | h_sm.wait(); 125 | uint64_t embed_num = h_submit_info[1]; 126 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 127 | cam_gemm_read(h_data, embed_num,(uintptr_t)gem_memory); 128 | clear_wait_flag(); 129 | h_sm.signal(); 130 | } 131 | } 132 | 133 | 134 | void polling_thread_seq(void) 135 | { 136 | while(1){ 137 | h_sm.wait(); 138 | u_int64_t start_lba = h_submit_info[0]; 139 | uint64_t embed_num = h_submit_info[1]; 140 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 141 | seq_read_submit(start_lba,embed_num,(uintptr_t)gem_memory); 142 | clear_wait_flag(); 143 | h_sm.signal(); 144 | } 145 | } 146 | 147 | void polling_thread_seq_write(void) 148 | { 149 | while(1){ 150 | h_sm.wait(); 151 | u_int64_t start_lba = h_submit_info[0]; 152 | uint64_t embed_num = h_submit_info[1]; 153 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 154 | seq_write_submit(start_lba,embed_num,(uintptr_t)gem_memory); 155 | clear_wait_flag_write(); 156 | h_sm.signal(); 157 | } 158 | } 159 | 160 | __device__ void prefetch_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr) 161 | { 162 | __syncthreads(); 163 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 164 | { 165 | 166 | d_submit_info[0] = start_lba; 167 | d_submit_info[1] = embed_num; 168 | d_submit_info[2] = (uint64_t)dev_addr; 169 | 170 | *D2H_outboundSemaphoreValue += 1; 171 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 172 | *D2H_outboundSemaphoreId = *D2H_outboundSemaphoreValue; 173 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 174 | } 175 | } 176 | 177 | __device__ void writeback_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr) 178 | { 179 | __syncthreads(); 180 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 181 | { 182 | 183 | d_submit_info_2[0] = start_lba; 184 | d_submit_info_2[1] = embed_num; 185 | d_submit_info_2[2] = (uint64_t)dev_addr; 186 | 187 | *D2H_outboundSemaphoreValue_2 += 1; 188 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 189 | *D2H_outboundSemaphoreId_2 = *D2H_outboundSemaphoreValue_2; 190 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 191 | } 192 | } 193 | 194 | 195 | void 
Init(u_int32_t access_size,cudaStream_t stream1) 196 | { 197 | SemaphoreInit(stream1); 198 | // 在主机端分配页锁内存(零拷贝内存) 199 | cudaHostAlloc((void**)&h_data, read_block_num * sizeof(u_int64_t), cudaHostAllocMapped); 200 | // 获取对应的设备指针 201 | cudaHostGetDevicePointer((void**)&d_data, h_data, 0); 202 | 203 | cudaHostAlloc((void**)&h_submit_info, 3 * sizeof(u_int64_t), cudaHostAllocMapped); 204 | 205 | void* tmp; 206 | cudaHostGetDevicePointer((void**)&tmp, h_submit_info, 0); 207 | cudaMemcpyToSymbol(d_submit_info, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 208 | 209 | cudaHostAlloc((void**)&h_data_2, read_block_num * sizeof(u_int64_t), cudaHostAllocMapped); 210 | // 获取对应的设备指针 211 | cudaHostGetDevicePointer((void**)&d_data_2, h_data_2, 0); 212 | 213 | cudaHostAlloc((void**)&h_submit_info_2, 3 * sizeof(u_int64_t), cudaHostAllocMapped); 214 | 215 | 216 | cudaHostGetDevicePointer((void**)&tmp, h_submit_info_2, 0); 217 | cudaMemcpyToSymbol(d_submit_info_2, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 218 | //cam_init(access_size); 219 | init_myKernel<<<1, 1,0,stream1>>>(); 220 | 221 | std::cout<<"init done"<= value2) 299 | break; 300 | // uint64_t start = 0; 301 | // while (start++ < 10000000000); 302 | } 303 | 304 | //printf("leadind thread wait done\n"); 305 | } 306 | __syncthreads(); 307 | } -------------------------------------------------------------------------------- /src/GPU_memory_lib/GPU_memory_management.cpp: -------------------------------------------------------------------------------- 1 | #include "GPU_memory_management.hpp" 2 | 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // #include 10 | // #include 11 | // #include 12 | // #include 13 | // #include 14 | // #include 15 | // #include 16 | // #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | 27 | 28 | #include 29 | #include 30 | 31 | #define ASSERT(x) \ 32 | do \ 33 | { \ 34 | if (!(x)) \ 35 | { \ 36 | fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__); \ 37 | exit(EXIT_FAILURE); \ 38 | } \ 39 | } while (0) 40 | 41 | #define ASSERTDRV(stmt) \ 42 | do \ 43 | { \ 44 | CUresult result = (stmt); \ 45 | if (result != CUDA_SUCCESS) { \ 46 | const char *_err_name; \ 47 | cuGetErrorName(result, &_err_name); \ 48 | fprintf(stderr, "CUDA error: %s\n", _err_name); \ 49 | } \ 50 | ASSERT(CUDA_SUCCESS == result); \ 51 | } while (0) 52 | 53 | #define ASSERT_EQ(P, V) ASSERT((P) == (V)) 54 | #define ASSERT_NEQ(P, V) ASSERT(!((P) == (V))) 55 | 56 | 57 | 58 | [[maybe_unused]] static bool debug_flag = false; 59 | 60 | 61 | 62 | // [[maybe_unused]] static void errorPrint(std::string_view str) { 63 | // fmt::print(fg(fmt::color::red), "{}\n", str); 64 | // } 65 | 66 | // [[maybe_unused]] static void passPrint(std::string_view str) { 67 | // fmt::print(fg(fmt::color::green), "{}\n", str); 68 | // } 69 | 70 | // [[maybe_unused]] static void warnPrint(std::string_view str) { 71 | // fmt::print(fg(fmt::color::yellow), "{}\n", str); 72 | // } 73 | 74 | // [[maybe_unused]] static void infoPrint(std::string_view str) { 75 | // fmt::print(fg(fmt::color::cyan), "{}\n", str); 76 | // } 77 | 78 | static const size_t config_region_size = 256*1024; 79 | static const size_t lite_region_size = 4*1024; 80 | static const size_t bridge_region_size = 1024*1024*1024; 81 | 82 | std::pair findFreeChunk(const std::map &freeChunk, uint64_t mSize) { 83 | for (auto const &it: freeChunk) { 84 | if (it.second >= mSize) { 85 | return {it.first, it.second}; 86 | } 
87 | } 88 | return {0, 0}; 89 | } 90 | 91 | static bool contains(const std::map &mp, uint64_t addr) { 92 | auto it = mp.find(addr); 93 | if (it == mp.end()) { 94 | return false; 95 | } else { 96 | return true; 97 | } 98 | } 99 | 100 | void *MemCtl::alloc(size_t size) { 101 | size = (size + 64UL - 1) & ~(64UL - 1); 102 | std::lock_guard lock(allocMutex); 103 | /*查找大小大于申请空间大小的空闲内存块*/ 104 | auto ck = findFreeChunk(free_chunk, size); 105 | auto &free_addr = ck.first; 106 | auto &free_size = ck.second; 107 | /*如果找到的块为空则报告申请失败*/ 108 | if (free_addr == 0) { 109 | // warnPrint(fmt::format("No Free CPU Chunk. Alloc failed!")); 110 | return nullptr; 111 | } 112 | /*如果内存块分配后仍存在剩余空间, 从内存块高地址部分分配*/ 113 | if (free_size > size) { 114 | free_chunk[free_addr] = free_size - size; 115 | used_chunk[free_addr + free_size - size] = size; 116 | return (void *) (free_addr + free_size - size); 117 | } else { 118 | free_chunk.erase(free_addr); 119 | used_chunk[free_addr] = size; 120 | return (void *) (free_addr); 121 | } 122 | } 123 | 124 | void MemCtl::free(void *ptr) { 125 | std::lock_guard lock(allocMutex); 126 | /*检查释放的内存块的合法性*/ 127 | if (!contains(used_chunk, (uint64_t) ptr)) { 128 | // errorPrint(fmt::format("Pointer to free is not in Alloc Log")); 129 | exit(1); 130 | } 131 | auto it = used_chunk.find((uint64_t) ptr); 132 | uint64_t free_size = it->second; 133 | used_chunk.erase(it); 134 | /*寻找第一个首地址大于ptr的空闲块, 返回map结构的迭代器*/ 135 | auto nextIt = free_chunk.upper_bound((uint64_t) ptr); 136 | if (!free_chunk.empty()) { 137 | auto prevIt = std::prev(nextIt); 138 | /*检查前置空闲块 首地址+块大小 与 释放块首地址 是否连续, 连续则将释放块合并到前置空闲块中*/ 139 | if (prevIt->first + prevIt->second == (uint64_t) ptr) { 140 | free_size += prevIt->second; 141 | ptr = (void *) prevIt->first; 142 | } 143 | } 144 | /*合并后置块*/ 145 | if (nextIt != free_chunk.end() && (uint64_t) ptr + free_size == nextIt->first) { 146 | free_size += nextIt->second; 147 | free_chunk.erase(nextIt); 148 | } 149 | free_chunk[(int64_t) ptr] = free_size; 150 | } 151 | 152 | 153 | 154 | class gdrMemAllocator { 155 | public: 156 | ~gdrMemAllocator(); 157 | 158 | CUresult gpuMemAlloc(CUdeviceptr *pptr, size_t psize, bool align_to_gpu_page = true, bool set_sync_memops = true); 159 | 160 | CUresult gpuMemFree(CUdeviceptr pptr); 161 | 162 | private: 163 | std::map _allocations; 164 | }; 165 | 166 | gdrMemAllocator::~gdrMemAllocator() { 167 | for (auto &it: _allocations) { 168 | CUresult ret; 169 | ret = cuMemFree(it.second); 170 | if (ret != CUDA_SUCCESS) { 171 | // warnPrint(fmt::format("Fail to free cuMemAlloc GPU Memory")); 172 | } 173 | } 174 | } 175 | 176 | CUresult gdrMemAllocator::gpuMemAlloc(CUdeviceptr *pptr, size_t psize, bool align_to_gpu_page, bool set_sync_memops) { 177 | CUresult ret = CUDA_SUCCESS; 178 | CUdeviceptr ptr; 179 | size_t size; 180 | 181 | if (align_to_gpu_page) { 182 | size = psize + GPU_PAGE_SIZE - 1; 183 | } else { 184 | size = psize; 185 | } 186 | 187 | ret = cuMemAlloc(&ptr, size); 188 | if (ret != CUDA_SUCCESS) 189 | return ret; 190 | 191 | if (set_sync_memops) { 192 | unsigned int flag = 1; 193 | ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr); 194 | if (ret != CUDA_SUCCESS) { 195 | cuMemFree(ptr); 196 | return ret; 197 | } 198 | } 199 | 200 | if (align_to_gpu_page) { 201 | *pptr = (ptr + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; 202 | } else { 203 | *pptr = ptr; 204 | } 205 | // Record the actual pointer for doing gpuMemFree later. 
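    // *pptr may have been rounded up to the next GPU page boundary above, so the map keeps
    // the original cuMemAlloc() pointer; gpuMemFree() must release that one, not the aligned
    // alias handed back to the caller.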
206 | _allocations[*pptr] = ptr; 207 | 208 | return CUDA_SUCCESS; 209 | } 210 | 211 | CUresult gdrMemAllocator::gpuMemFree(CUdeviceptr pptr) { 212 | CUresult ret = CUDA_SUCCESS; 213 | CUdeviceptr ptr; 214 | 215 | if (_allocations.count(pptr) > 0) { 216 | ptr = _allocations[pptr]; 217 | ret = cuMemFree(ptr); 218 | if (ret == CUDA_SUCCESS) 219 | _allocations.erase(pptr); 220 | return ret; 221 | } else { 222 | return CUDA_ERROR_INVALID_VALUE; 223 | } 224 | } 225 | 226 | static gdrMemAllocator allocator; 227 | 228 | static int32_t devID{-1}; 229 | 230 | static const gdr_mh_t null_mh = {0}; 231 | 232 | static gdr_t gdrDev{}; 233 | static gdr_mh_t gdrUserMapHandler{null_mh}; 234 | static gpu_tlb_t gdrPageTable{}; 235 | static gdr_info_t info{}; 236 | 237 | static CUdeviceptr devAddr{}; 238 | static void *mapDevPtr{}; 239 | 240 | static inline bool operator==(const gdr_mh_t &a, const gdr_mh_t &b) { 241 | return a.h == b.h; 242 | } 243 | 244 | 245 | static std::vector> gpu_mem_ctl_list; 246 | 247 | GPUMemCtl::GPUMemCtl([[maybe_unused]]uint64_t size) { 248 | 249 | pool_size = size; 250 | auto page_size = 64UL * 1024; 251 | 252 | CUdevice dev; 253 | CUcontext devCtx; 254 | ASSERTDRV(cuInit(0)); 255 | ASSERTDRV(cuDeviceGet(&dev, devID)); 256 | ASSERTDRV(cuDevicePrimaryCtxRetain(&devCtx, dev)); 257 | ASSERTDRV(cuCtxSetCurrent(devCtx)); 258 | 259 | ASSERTDRV(allocator.gpuMemAlloc(&devAddr, size)); 260 | 261 | gdrDev = gdr_open(); 262 | ASSERT_NEQ(gdrDev, nullptr); 263 | 264 | // 64KB * 64K = 4GB 265 | // 4GB * 20 = 80GB 266 | gdrPageTable.pages = new uint64_t[65536 * 20]; 267 | 268 | ASSERT_EQ(rc4ml_pin_buffer(gdrDev, devAddr, size, 0, 0, &gdrUserMapHandler, &gdrPageTable), 0); 269 | ASSERT_NEQ(gdrUserMapHandler, null_mh); 270 | 271 | ASSERT_EQ(gdr_map(gdrDev, gdrUserMapHandler, &mapDevPtr, size), 0); 272 | 273 | ASSERT_EQ(gdr_get_info(gdrDev, gdrUserMapHandler, &info), 0); 274 | 275 | ASSERT_EQ((info.va - devAddr), 0); 276 | ASSERT_EQ((devAddr & (page_size - 1)), 0); 277 | 278 | page_table = {gdrPageTable.page_entries, (uint64_t) (devAddr), gdrPageTable.pages}; 279 | free_chunk.emplace((uint64_t) devAddr, size); 280 | 281 | } 282 | 283 | GPUMemCtl::~GPUMemCtl() { 284 | 285 | const auto size = std::get<0>(page_table) * 64UL * 1024; 286 | delete[] std::get<2>(page_table); 287 | ASSERT_EQ(gdr_unmap(gdrDev, gdrUserMapHandler, mapDevPtr, size), 0); 288 | ASSERT_EQ(gdr_unpin_buffer(gdrDev, gdrUserMapHandler), 0); 289 | ASSERT_EQ(gdr_close(gdrDev), 0); 290 | ASSERTDRV(allocator.gpuMemFree(devAddr)); 291 | 292 | } 293 | 294 | GPUMemCtl *GPUMemCtl::getInstance([[maybe_unused]]int32_t dev_id, [[maybe_unused]]size_t pool_size) { 295 | 296 | if (devID >= 0 && devID != dev_id) { 297 | // errorPrint(fmt::format("This QDMA library now only support one GPU Memory Pool")); 298 | // errorPrint(fmt::format("New device id {} is not equal to previous device id {}", dev_id, devID)); 299 | exit(1); 300 | } 301 | // up round to 64KB 302 | pool_size = (pool_size + 64UL * 1024 - 1) & ~(64UL * 1024 - 1); 303 | 304 | if (pool_size % (2UL * 1024 * 1024) != 0) { 305 | // warnPrint(fmt::format("Suggest GPU Memory Pool Size to be multiple of 2MB for Page Aggregation")); 306 | // errorPrint(fmt::format("For correctness safety, the program will exit. 
Please change the pool size")); 307 | exit(1); 308 | } 309 | 310 | if (gpu_mem_ctl_list.empty()) { 311 | devID = dev_id; 312 | auto tmp = new GPUMemCtl(pool_size); 313 | gpu_mem_ctl_list.push_back(std::shared_ptr(tmp)); 314 | return tmp; 315 | } else { 316 | static bool warn_flag = false; 317 | if (!warn_flag) { 318 | warn_flag = true; 319 | // warnPrint(fmt::format("This QDMA library now only support one GPU Memory Pool")); 320 | // warnPrint(fmt::format("Request pool size will be ignored")); 321 | // warnPrint(fmt::format("The previous GPU Memory Pool with size {} will be returned", 322 | // gpu_mem_ctl_list[0]->getPoolSize())); 323 | } 324 | return gpu_mem_ctl_list[0].get(); 325 | } 326 | 327 | } 328 | 329 | void GPUMemCtl::cleanCtx() { 330 | 331 | gpu_mem_ctl_list.clear(); 332 | 333 | 334 | } 335 | 336 | void GPUMemCtl::writeTLB([[maybe_unused]]const std::function &func, [[maybe_unused]]bool aggr_flag) { 337 | 338 | const auto &[n_pages, vaddr, parray] = page_table; 339 | 340 | if (aggr_flag) { 341 | const auto page_size = 2UL * 1024 * 1024; 342 | auto aggr_n_pages = n_pages / 32; 343 | for (uint32_t i = 0; i < aggr_n_pages; ++i) { 344 | for (uint32_t j = 1; j < 32; ++j) { 345 | ASSERT_EQ((parray[i * 32 + j] - parray[i * 32 + j - 1]), 65536); 346 | } 347 | func(i, page_size, vaddr + i * page_size, parray[i * 32]); 348 | } 349 | } else { 350 | const auto page_size = 64UL * 1024; 351 | for (int i = 0; i < n_pages; i++) { 352 | func(i, page_size, vaddr + i * page_size, parray[i]); 353 | } 354 | } 355 | 356 | } 357 | 358 | uint64_t GPUMemCtl::mapV2P(void *ptr) { 359 | const auto &[n_pages, vaddr, parray] = page_table; 360 | const auto page_size = 64UL * 1024; 361 | uint64_t offset = (uint64_t) ptr - vaddr; 362 | return parray[offset / page_size] + (offset & (page_size - 1)); 363 | } 364 | 365 | void *GPUMemCtl::getDevPtr() const { 366 | 367 | return (void *)devAddr; 368 | 369 | } 370 | 371 | void *GPUMemCtl::getMapDevPtr() const { 372 | 373 | return mapDevPtr; 374 | 375 | } 376 | 377 | bool GPUMemCtl::chechPhyContiguous() const { 378 | 379 | const auto &[n_pages, vaddr, parray] = page_table; 380 | const auto page_size = 64UL * 1024; 381 | for (int i = 1; i < n_pages; i++) { 382 | if (parray[i] - parray[i - 1] != page_size) { 383 | return false; 384 | } 385 | } 386 | return true; 387 | 388 | } 389 | -------------------------------------------------------------------------------- /src/applications/gemm/src/queue.cu: -------------------------------------------------------------------------------- 1 | #include "queue.cuh" 2 | 3 | __host__ __device__ void QueuePair::submit(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 4 | { 5 | fill_sq(cmd_id, sq_tail, opcode, prp1, prp2, dw10, dw11, dw12); 6 | sq_tail = (sq_tail + 1) % queue_depth; 7 | *sqtdbl = sq_tail; 8 | cid = cmd_id; 9 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 10 | } 11 | 12 | __device__ void QueuePair::submit_fence(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 13 | { 14 | fill_sq(cmd_id, sq_tail, opcode, prp1, prp2, dw10, dw11, dw12); 15 | __threadfence_system(); 16 | sq_tail = (sq_tail + 1) % queue_depth; 17 | *sqtdbl = sq_tail; 18 | cid = cmd_id; 19 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 20 | } 21 | 22 | __host__ __device__ void QueuePair::fill_sq(uint32_t cid, uint32_t pos, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12, uint32_t req_id) 23 | { 24 | // if 
(req_id == 1152) 25 | // printf("%lx %lx %x %x %x %x %x %x\n", prp1, prp2, dw10, dw11, dw12, opcode, cid, namespace_id); 26 | sq[pos * 16 + 0] = opcode | (cid << 16); 27 | sq[pos * 16 + 1] = namespace_id; 28 | sq[pos * 16 + 6] = prp1 & 0xffffffff; 29 | sq[pos * 16 + 7] = prp1 >> 32; 30 | sq[pos * 16 + 8] = prp2 & 0xffffffff; 31 | sq[pos * 16 + 9] = prp2 >> 32; 32 | sq[pos * 16 + 10] = dw10; 33 | sq[pos * 16 + 11] = dw11; 34 | sq[pos * 16 + 12] = dw12; 35 | if (cmd_id_to_req_id) 36 | cmd_id_to_req_id[cid % queue_depth] = req_id; 37 | if (cmd_id_to_sq_pos) 38 | cmd_id_to_sq_pos[cid % queue_depth] = pos; 39 | if (sq_entry_busy) 40 | sq_entry_busy[pos] = true; 41 | } 42 | 43 | __host__ __device__ void QueuePair::poll(uint32_t &code, uint32_t cid) 44 | { 45 | uint32_t current_phase = ((cmd_id - 1) / queue_depth) & 1; 46 | uint32_t status = cq[cq_head * 4 + 3]; 47 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 48 | status = cq[cq_head * 4 + 3]; 49 | if ((status & NVME_ENTRY_CID_MASK) != cid) 50 | { 51 | AEOLUS_LOG_ERROR("expected cid: %d, actual cid: %d", cid, status & NVME_ENTRY_CID_MASK); 52 | assert(0); 53 | } 54 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 55 | num_completed++; 56 | cq_head = (cq_head + 1) % queue_depth; 57 | *cqhdbl = cq_head; 58 | } 59 | 60 | __host__ __device__ void QueuePair::poll_with_dw0(uint32_t &code, uint32_t cid, uint32_t &dw0) 61 | { 62 | uint32_t current_phase = ((cmd_id - 1) / queue_depth) & 1; 63 | uint32_t status = cq[cq_head * 4 + 3]; 64 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 65 | status = cq[cq_head * 4 + 3]; 66 | if ((status & NVME_ENTRY_CID_MASK) != cid) 67 | { 68 | AEOLUS_LOG_ERROR("expected cid: %d, actual cid: %d", cid, status & NVME_ENTRY_CID_MASK); 69 | assert(0); 70 | } 71 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 72 | dw0 = cq[cq_head * 4]; 73 | num_completed++; 74 | cq_head = (cq_head + 1) % queue_depth; 75 | *cqhdbl = cq_head; 76 | } 77 | 78 | __device__ void IoQueuePair::poll_range(uint32_t &code, int expected_sq_head, bool should_break) 79 | { 80 | // printf("cmd_id: %d, size: %d, current_phase: %d\n", cmd_id, size, current_phase); 81 | int i; 82 | uint32_t last_sq_head = ~0U; 83 | // int last_num_completed = num_completed; 84 | // int thread_id = threadIdx.x + blockIdx.x * blockDim.x; 85 | for (i = cq_head; (num_completed & NVME_ENTRY_CID_MASK) != (cmd_id & NVME_ENTRY_CID_MASK); i = (i + 1) % queue_depth) 86 | { 87 | uint32_t current_phase = (num_completed / queue_depth) & 1; 88 | uint32_t status = cq[i * 4 + 3]; 89 | uint64_t start = clock64(); 90 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 91 | { 92 | status = cq[i * 4 + 3]; 93 | if (clock64() - start > 1000000000) 94 | { 95 | AEOLUS_LOG_ERROR("timeout sq_tail=%d, cq_head=%d, i=%d, num_completed=%d, cmd_id=%d\n", sq_tail, cq_head, i, num_completed, cmd_id); 96 | AEOLUS_LOG_ERROR("last_sq_head: %d, expected_sq_head: %d\n", last_sq_head, expected_sq_head); 97 | // int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 98 | // if (thread_id) 99 | // return 0; 100 | // for (int m = 0; m < queue_depth; m++) 101 | // { 102 | // printf("SQE %d\n", m); 103 | // for (int n = 0; n < 16; n++) 104 | // printf("DW%2d, %08x\n", n, sq[m * 16 + n]); 105 | // } 106 | // for (int m = 0; m < queue_depth; m++) 107 | // { 108 | // printf("CQE %d\n", m); 109 | // for (int n = 0; n < 4; n++) 110 | // printf("DW%2d, %08x\n", n, cq[m * 4 + n]); 111 | // } 112 | code = 1; 113 | } 114 | } 115 | int cmd_id = status & NVME_ENTRY_CID_MASK; 116 
| int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 117 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 118 | { 119 | printf("cq[%d] status: 0x%x, cid: %d\n", i, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 120 | int req_id = cmd_id_to_req_id[cmd_id % queue_depth]; 121 | printf("req_id: %d, sq_pos: %d\n", req_id, sq_pos); 122 | // for (int i = 0; i < 16; i++) 123 | // printf("%08x ", sq[sq_pos * 16 + i]); 124 | // printf("\n"); 125 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 126 | } 127 | last_sq_head = cq[i * 4 + 2] & NVME_ENTRY_SQ_HEAD_MASK; 128 | sq_entry_busy[sq_pos] = false; 129 | // printf("thread %d freed sq_pos %d\n", thread_id, sq_pos); 130 | num_completed++; 131 | if (should_break && ((cq[i * 4 + 2] & NVME_ENTRY_SQ_HEAD_MASK) - expected_sq_head + queue_depth) % queue_depth <= AEOLUS_WARP_SIZE) 132 | { 133 | // printf("cq[%d] sq_head: %d, expected_sq_head: %d\n", i, cq[i * 4 + 2] & SQ_HEAD_MASK, expected_sq_head); 134 | i = (i + 1) % queue_depth; 135 | // if (num_completed - last_num_completed > 64) 136 | // printf("%d: %d completed\n", thread_id, num_completed - last_num_completed); 137 | break; 138 | } 139 | } 140 | if (i != cq_head) 141 | { 142 | cq_head = i; 143 | // printf("cq_head is %p, set cqhdbl to %d\n", cqhdbl, cq_head); 144 | *cqhdbl = cq_head; 145 | } 146 | code = 0; 147 | } 148 | 149 | __device__ void IoQueuePair::poll_multiple(uint32_t &code, int cnt) 150 | { 151 | for (int i = 0; i < cnt; i++) 152 | { 153 | uint32_t current_phase = (num_completed / queue_depth) & 1; 154 | int pos = (cq_head + i) % queue_depth; 155 | uint32_t status = cq[pos * 4 + 3]; 156 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 157 | status = cq[pos * 4 + 3]; 158 | int cmd_id = status & NVME_ENTRY_CID_MASK; 159 | int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 160 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 161 | { 162 | printf("cq[%d] status: 0x%x, cid: %d\n", pos, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 163 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 164 | } 165 | sq_entry_busy[sq_pos] = false; 166 | num_completed++; 167 | } 168 | cq_head = (cq_head + cnt) % queue_depth; 169 | *cqhdbl = cq_head; 170 | code = 0; 171 | } 172 | 173 | __device__ void IoQueuePair::poll_until_sq_entry_free(uint32_t &code, int expected_sq_pos) { 174 | // int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 175 | // int last_num_completed = num_completed; 176 | // printf("thread %d want to free sq_pos: %d num_completed %d cmd_id %d\n", thread_id, expected_sq_pos, num_completed, cmd_id); 177 | int i; 178 | for (i = cq_head; (num_completed & NVME_ENTRY_CID_MASK) != (cmd_id & NVME_ENTRY_CID_MASK); i = (i + 1) % queue_depth) 179 | { 180 | uint32_t current_phase = (num_completed / queue_depth) & 1; 181 | uint32_t status = cq[i * 4 + 3]; 182 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 183 | status = cq[i * 4 + 3]; 184 | int cmd_id = status & NVME_ENTRY_CID_MASK; 185 | int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 186 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 187 | { 188 | printf("cq[%d] status: 0x%x, cid: %d\n", i, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 189 | int req_id = cmd_id_to_req_id[cmd_id % queue_depth]; 190 | printf("req_id: %d, sq_pos: %d\n", req_id, sq_pos); 191 | // for (int i = 0; i < 16; i++) 192 | // printf("%08x ", sq[sq_pos * 16 + i]); 193 | // printf("\n"); 194 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 195 | } 196 | sq_entry_busy[sq_pos] = false; 197 | // 
printf("thread %d manually freed sq_pos %d\n", thread_id, sq_pos); 198 | num_completed++; 199 | if (sq_pos == expected_sq_pos) 200 | { 201 | cq_head = (i + 1) % queue_depth; 202 | // printf("cq_head is %p, set cqhdbl to %d\n", cqhdbl, cq_head); 203 | *cqhdbl = cq_head; 204 | // if (num_completed - last_num_completed > 64) 205 | // printf("%d: %d completed\n", thread_id, num_completed - last_num_completed); 206 | code = 0; 207 | } 208 | } 209 | // printf("thread %d failed to free sq_pos %d\n", thread_id, expected_sq_pos); 210 | code = 1; 211 | } 212 | 213 | __host__ __device__ void AdminQueuePair::submit_with_ns(uint32_t &cid, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 214 | { 215 | fill_sq_with_ns(cmd_id, sq_tail, opcode, nsid, prp1, prp2, dw10, dw11, dw12); 216 | sq_tail = (sq_tail + 1) % queue_depth; 217 | *sqtdbl = sq_tail; 218 | cid = cmd_id; 219 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 220 | } 221 | __host__ __device__ void AdminQueuePair::fill_sq_with_ns(uint32_t cid, uint32_t pos, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12, uint32_t req_id) 222 | { 223 | sq[pos * 16 + 0] = opcode | (cid << 16); 224 | sq[pos * 16 + 1] = nsid; 225 | sq[pos * 16 + 6] = prp1 & 0xffffffff; 226 | sq[pos * 16 + 7] = prp1 >> 32; 227 | sq[pos * 16 + 8] = prp2 & 0xffffffff; 228 | sq[pos * 16 + 9] = prp2 >> 32; 229 | sq[pos * 16 + 10] = dw10; 230 | sq[pos * 16 + 11] = dw11; 231 | sq[pos * 16 + 12] = dw12; 232 | if (cmd_id_to_req_id) 233 | cmd_id_to_req_id[cid % queue_depth] = req_id; 234 | if (cmd_id_to_sq_pos) 235 | cmd_id_to_sq_pos[cid % queue_depth] = pos; 236 | if (sq_entry_busy) 237 | sq_entry_busy[pos] = true; 238 | } 239 | 240 | __host__ uint32_t AdminQueuePair::set_num_queues(uint16_t nsqr, uint16_t ncqr) 241 | { 242 | uint32_t cid; 243 | submit( 244 | cid, NVME_ADMIN_OPCODE_SET_FEATURES, 0x0, 0x0, NVME_FEATURE_ID_NUM_QUEUES, 245 | ((ncqr-1) << 16) | (nsqr-1) 246 | ); 247 | uint32_t status; 248 | poll(status, cid); 249 | return status; 250 | } 251 | 252 | __host__ uint32_t AdminQueuePair::get_num_queues(uint16_t &nsqa, uint16_t &ncqa) 253 | { 254 | uint32_t cid; 255 | submit( 256 | cid, NVME_ADMIN_OPCODE_GET_FEATURES, 0x0, 0x0, NVME_FEATURE_ID_NUM_QUEUES, 0x0 257 | ); 258 | uint32_t dw0; 259 | uint32_t status; 260 | poll_with_dw0(status, cid, dw0); 261 | nsqa = (dw0 & 0xffff) + 1; 262 | ncqa = ((dw0 >> 16) & 0xffff) + 1; 263 | return status; 264 | } 265 | 266 | __host__ uint32_t AdminQueuePair::identify(uint8_t cns, uint16_t cntid, uint32_t nsid, uint64_t prp1) 267 | { 268 | uint32_t cid; 269 | submit_with_ns( 270 | cid, NVME_ADMIN_OPCODE_IDENTIFY, nsid, prp1, 0x0, cns | (cntid << 16), 0x0 271 | ); 272 | uint32_t status; 273 | poll(status, cid); 274 | return status; 275 | } 276 | 277 | __host__ uint32_t AdminQueuePair::create_cq_cont(uint16_t cqid, uint64_t cq_phys, uint16_t queue_depth) 278 | { 279 | uint32_t cid; 280 | submit( 281 | cid, NVME_ADMIN_OPCODE_CREATE_CQ, cq_phys, 0x0, cqid | ((queue_depth-1) << 16), 0x1 282 | ); 283 | uint32_t status; 284 | poll(status, cid); 285 | return status; 286 | } 287 | 288 | __host__ uint32_t AdminQueuePair::create_sq_cont(uint16_t sqid, uint16_t cqid, uint64_t sq_phys, uint16_t queue_depth) 289 | { 290 | uint32_t cid; 291 | submit( 292 | cid, NVME_ADMIN_OPCODE_CREATE_SQ, sq_phys, 0x0, sqid | ((queue_depth-1) << 16), (cqid << 16) | 0x1 293 | ); 294 | uint32_t status; 295 | poll(status, cid); 296 | return status; 297 | } 298 | 299 | 
__host__ uint32_t AdminQueuePair::delete_sq(uint16_t sqid) 300 | { 301 | uint32_t cid; 302 | submit( 303 | cid, NVME_ADMIN_OPCODE_DELETE_SQ, 0x0, 0x0, sqid, 0x0 304 | ); 305 | uint32_t status; 306 | poll(status, cid); 307 | return status; 308 | } 309 | 310 | __host__ uint32_t AdminQueuePair::delete_cq(uint16_t cqid) 311 | { 312 | uint32_t cid; 313 | submit( 314 | cid, NVME_ADMIN_OPCODE_DELETE_CQ, 0x0, 0x0, cqid, 0x0 315 | ); 316 | uint32_t status; 317 | poll(status, cid); 318 | return status; 319 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | #include 3 | 4 | Controller::Controller( 5 | std::vector ssd_list, int32_t num_queue_per_ssd, int32_t max_io_size, 6 | int32_t queue_depth, aeolus_dist_type dist_type, aeolus_buf_type buf_type, 7 | uint64_t *pinned_buf_phys, uint64_t pinned_buf_size 8 | ) 9 | { 10 | // Check if the input parameters are valid. 11 | 12 | AEOLUS_CUDA_CHECK(cudaGetDevice(&gpu_id)); 13 | ssd_count = ssd_list.size(); 14 | if (ssd_count <= 0) 15 | { 16 | AEOLUS_LOG_ERROR("Empty SSD list delivered to Controller."); 17 | exit(-1); 18 | } 19 | // Get maximum queue number and IO size of SSDs. 20 | max_queue_num = INT_MAX; 21 | max_trans_size = INT_MAX; 22 | for (auto ssd : ssd_list) 23 | { 24 | max_queue_num = MIN(max_queue_num, ssd->free_qps.size()); 25 | max_trans_size = MIN(max_trans_size, ssd->max_io_size); 26 | } 27 | if (num_queue_per_ssd < 0) 28 | { 29 | num_queue_per_ssd = max_queue_num + 1 + num_queue_per_ssd; 30 | } 31 | if (num_queue_per_ssd <= 0 || num_queue_per_ssd > max_queue_num) 32 | { 33 | AEOLUS_LOG_ERROR( 34 | "Invalid queue number per SSD delivered to Controller. " 35 | "The range should be between 1 and %d.", max_queue_num 36 | ); 37 | exit(-1); 38 | } 39 | if (max_io_size == AEOLUS_MAX_DATA_TRANSFER) 40 | { 41 | // Less than 2 MiB IO size is to ensure the PRP list of a request won't exceed a page. 42 | max_io_size = MIN(max_trans_size, 2*1024*1024); 43 | } 44 | if (max_io_size < 512 || max_io_size > max_trans_size || max_io_size > 2*1024*1024) 45 | { 46 | AEOLUS_LOG_ERROR( 47 | "Invalid max io size delivered to Controller. " 48 | "The range should be between 512 and %d.", MIN(max_trans_size, 2*1024*1024) 49 | ); 50 | exit(-1); 51 | } 52 | if (!isPowerOfTwo(max_io_size)) 53 | { 54 | AEOLUS_LOG_ERROR( 55 | "Invalid max io size delivered to Controller. " 56 | "The value should be a power of 2." 57 | ); 58 | exit(-1); 59 | } 60 | 61 | this->ssd_list = ssd_list; 62 | this->num_queue_per_ssd = num_queue_per_ssd; 63 | this->max_io_size = max_io_size; 64 | this->queue_depth = queue_depth; 65 | this->dist_type = dist_type; 66 | this->buf_type = buf_type; 67 | 68 | // Compute SSD LB prefix sum. 69 | 70 | h_ssd_num_lbs = new uint64_t[ssd_count]; 71 | for (int i = 0; i < ssd_count; i++) 72 | h_ssd_num_lbs[i] = ssd_list[i]->max_lb_num; 73 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_ssd_num_lbs, ssd_count * sizeof(uint64_t))); 74 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_ssd_num_lbs, h_ssd_num_lbs, ssd_count * sizeof(uint64_t), cudaMemcpyHostToDevice)); 75 | 76 | // Alloc shared buffers.
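// Shared bookkeeping used by the request-distribution kernels in controller_decouple.cu and controller_legacy.cu: ssd_num_reqs counts how many requests target each SSD, distributed_reqs holds the requests regrouped by target SSD, and req_ids provides per-SSD cursors while scattering requests into that array.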
77 | 78 | AEOLUS_CUDA_CHECK(cudaMalloc(&ssd_num_reqs, ssd_count * sizeof(int))); 79 | if (dist_type != AEOLUS_DIST_STRIPE) 80 | { 81 | AEOLUS_LOG_ERROR("Controller only supports AEOLUS_DIST_STRIPE distribution type for now\n"); 82 | } 83 | AEOLUS_CUDA_CHECK(cudaMalloc(&distributed_reqs, AEOLUS_MAX_NUM_REQUESTS * sizeof(Request))); 84 | AEOLUS_CUDA_CHECK(cudaMalloc(&req_ids, ssd_count * sizeof(int))); 85 | 86 | // Create SSD IO queue pairs. 87 | 88 | qpid_list = new int32_t *[ssd_count]; 89 | for (int i=0; ifree_qps[0]; 95 | ssd_list[i]->free_qps.erase(ssd_list[i]->free_qps.begin()); 96 | } 97 | } 98 | 99 | int sq_size = MAX(AEOLUS_HOST_PGSIZE, queue_depth*NVME_SQ_ENTRY_SIZE); 100 | assert((sq_size % AEOLUS_HOST_PGSIZE) == 0); 101 | 102 | void *d_qp_ptr; 103 | int ret = ssd_list[0]->alloc_device_memory(&d_qp_ptr, &qp_ctx, 2*sq_size*ssd_count*num_queue_per_ssd, &qp_phys); 104 | if (ret != 0) 105 | { 106 | AEOLUS_LOG_ERROR("Failed to allocate device memory for SSD IO queues: %s", strerror(ret)); 107 | exit(-1); 108 | } 109 | AEOLUS_CUDA_CHECK(cudaMemset(d_qp_ptr, 0, 2*sq_size*ssd_count*num_queue_per_ssd)); 110 | 111 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_ssdqp, ssd_count*num_queue_per_ssd*sizeof(IoQueuePair))); 112 | 113 | for (int i=0; iadmin_qp->create_cq_cont(qid, cq_phys, queue_depth); 126 | if (ret != 0) 127 | { 128 | AEOLUS_LOG_ERROR( 129 | "Failed to create CQ %d for SSD %d with status 0x%x", 130 | qid, i, ret 131 | ); 132 | exit(-1); 133 | } 134 | 135 | // Create SQ. 136 | offset = sq_size * (2*i*num_queue_per_ssd+2*j); 137 | uint64_t sq_phys = qp_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 138 | ret = ssd_list[i]->admin_qp->create_sq_cont(qid, qid, sq_phys, queue_depth); 139 | if (ret != 0) 140 | { 141 | AEOLUS_LOG_ERROR( 142 | "Failed to create SQ %d for SSD %d with status 0x%x", 143 | qid, i, ret 144 | ); 145 | exit(-1); 146 | } 147 | // AEOLUS_LOG_INFO("CQ phy addr: 0x%lx, SQ phy addr: 0x%lx", cq_phys, sq_phys); 148 | 149 | // Create auxiliary data structures. 150 | uint32_t *d_cmd_id_to_req_id; 151 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_cmd_id_to_req_id, sizeof(uint32_t)*queue_depth)); 152 | uint32_t *d_cmd_id_to_sq_pos; 153 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_cmd_id_to_sq_pos, sizeof(uint32_t)*queue_depth)); 154 | bool *d_sq_entry_busy; 155 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_sq_entry_busy, 1*queue_depth)); 156 | AEOLUS_CUDA_CHECK(cudaMemset(d_sq_entry_busy, 0, 1*queue_depth)); 157 | IoQueuePair h_ssdqp( 158 | (volatile uint32_t *)sq_virt, (volatile uint32_t *)cq_virt, 159 | ssd_list[i]->active_ns, 160 | (uint32_t *)((uint64_t)ssd_list[i]->reg_ptr + NVME_REG_SQTDBL + qid * NVME_DBLSTRIDE), 161 | (uint32_t *)((uint64_t)ssd_list[i]->reg_ptr + NVME_REG_CQHDBL + qid * NVME_DBLSTRIDE), 162 | queue_depth, d_cmd_id_to_req_id, d_cmd_id_to_sq_pos, d_sq_entry_busy 163 | ); 164 | // AEOLUS_LOG_INFO("Created SSD IO queue pair %d for SSD %d.", qid, i); 165 | AEOLUS_CUDA_CHECK(cudaMemcpy( 166 | d_ssdqp + i*num_queue_per_ssd+j, &h_ssdqp, 167 | sizeof(IoQueuePair), cudaMemcpyHostToDevice 168 | )); 169 | } 170 | } 171 | 172 | uint64_t io_buf_size = (uint64_t)max_io_size*ssd_count*num_queue_per_ssd*queue_depth; 173 | uint64_t *h_iobuf_phys; 174 | if (buf_type == AEOLUS_BUF_USER) { 175 | 176 | // Allocate IO buffer. 
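// The bounce buffer reserves one max_io_size slot for every possible outstanding command: queue_depth slots per queue, num_queue_per_ssd queues per SSD, across ssd_count SSDs, matching io_buf_size computed above.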
177 | 178 | AEOLUS_LOG_INFO("Allocating IO buffer."); 179 | int ret = ssd_list[0]->alloc_device_memory( 180 | &d_iobuf_ptr, &iobuf_ctx, io_buf_size, &h_iobuf_phys 181 | ); 182 | if (ret != 0) 183 | { 184 | AEOLUS_LOG_ERROR("Failed to allocate device memory for IO buffer: %s", strerror(ret)); 185 | exit(-1); 186 | } 187 | } else 188 | { 189 | h_iobuf_phys = pinned_buf_phys; 190 | io_buf_size = pinned_buf_size; 191 | } 192 | if (h_iobuf_phys != nullptr) 193 | { 194 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_iobuf_phys, sizeof(uint64_t))); 195 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_iobuf_phys, h_iobuf_phys, sizeof(uint64_t), cudaMemcpyHostToDevice)); 196 | } 197 | 198 | // Allocate PRP list. 199 | 200 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) { 201 | uint64_t prp_list_size = io_buf_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); 202 | if (io_buf_size > 0) 203 | { 204 | AEOLUS_LOG_INFO("Allocating PRP buffer."); 205 | ssd_list[0]->alloc_host_memory((void **)&prp_list, prp_list_size, &h_prp_phys); 206 | 207 | // Fill in PRP table. 208 | for (int i = 0; i < io_buf_size / AEOLUS_DEVICE_PGSIZE; i++) 209 | { 210 | for (int j = 0; j < AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE; j++) 211 | { 212 | if (i == 0 && j == 0) 213 | { 214 | continue; 215 | } 216 | prp_list[i * AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE + j - 1] = 217 | h_iobuf_phys[i] + j * AEOLUS_HOST_PGSIZE; 218 | } 219 | } 220 | } 221 | 222 | // Move PRP physical address to GPU. 223 | size_t prp_phys_size = CEIL(prp_list_size, AEOLUS_HOST_PGSIZE) * sizeof(uint64_t); 224 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_prp_phys, prp_phys_size)); 225 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_prp_phys, h_prp_phys, prp_phys_size, cudaMemcpyHostToDevice)); 226 | } 227 | } 228 | 229 | Controller::~Controller() 230 | { 231 | AEOLUS_LOG_INFO("Cleaning up controller."); 232 | if (buf_type == AEOLUS_BUF_USER) { 233 | if (max_io_size > 8192) { 234 | AEOLUS_CUDA_CHECK(cudaFree(d_prp_phys)); 235 | ssd_list[0]->free_host_memory(prp_list, h_prp_phys); 236 | } 237 | ssd_list[0]->free_device_memory(iobuf_ctx); 238 | AEOLUS_CUDA_CHECK(cudaFree(d_iobuf_phys)); 239 | } 240 | 241 | IoQueuePair *h_ssdqp = (IoQueuePair *)malloc(sizeof(IoQueuePair)); 242 | for (int i=0; icmd_id_to_req_id)); 251 | AEOLUS_CUDA_CHECK(cudaFree(h_ssdqp->cmd_id_to_sq_pos)); 252 | AEOLUS_CUDA_CHECK(cudaFree(h_ssdqp->sq_entry_busy)); 253 | ssd_list[i]->admin_qp->delete_sq(qpid_list[i][j]); 254 | ssd_list[i]->admin_qp->delete_cq(qpid_list[i][j]); 255 | ssd_list[i]->free_qps.push_back(qpid_list[i][j]); 256 | } 257 | delete [] qpid_list[i]; 258 | } 259 | delete [] qpid_list; 260 | 261 | AEOLUS_CUDA_CHECK(cudaFree(d_ssdqp)); 262 | ssd_list[0]->free_device_memory(qp_ctx); 263 | free(h_ssdqp); 264 | delete [] h_ssd_num_lbs; 265 | AEOLUS_CUDA_CHECK(cudaFree(d_ssd_num_lbs)); 266 | AEOLUS_LOG_INFO("Cleaning up controller done."); 267 | } 268 | 269 | __global__ static void rw_data_kernel(uint32_t opcode, int ssd_id, uint64_t start_lb, uint64_t num_lb, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 270 | { 271 | uint32_t cid; 272 | int global_queue_id = ssd_id * num_queues_per_ssd; 273 | uint64_t global_pos = (uint64_t)global_queue_id * queue_depth; 274 | uint64_t io_addr; 275 | if (buf_type == AEOLUS_BUF_USER) 276 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
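// With AEOLUS_BUF_USER the IO buffer is physically contiguous, so PRP1 is just a fixed offset into it; PRP2 is set below to either the second host page of the transfer or, when the transfer exceeds two host pages, the physical address of this request's PRP list.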
277 | else 278 | { 279 | io_addr = prp1[0]; 280 | global_pos = 0; 281 | } 282 | uint64_t io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 283 | if (num_lb * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 284 | { 285 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 286 | uint64_t offset = global_pos * prp_size; 287 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 288 | } 289 | ssdqp[global_queue_id].submit(cid, opcode, io_addr, io_addr2, start_lb & 0xffffffff, (start_lb >> 32) & 0xffffffff, NVME_RW_LIMITED_RETRY_MASK | (num_lb - 1)); 290 | uint32_t status; 291 | ssdqp[global_queue_id].poll(status, cid); 292 | // printf("ssd_id: %d, start_lb: %lu, cmd_id: %u\n", ssd_id, start_lb, ssdqp[global_queue_id].cmd_id); 293 | if (status != 0) 294 | { 295 | AEOLUS_LOG_ERROR("read/write failed with status 0x%x\n", status); 296 | assert(0); 297 | } 298 | } 299 | 300 | void Controller::lb_to_ssd_id(uint64_t lb, int &ssd_id, uint64_t &local_lb) 301 | { 302 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 303 | if (lb % lbs_per_max_io_size != 0) 304 | { 305 | AEOLUS_LOG_ERROR("Unaligned start LB %lu is unsupported now", lb); 306 | exit(-1); 307 | } 308 | ssd_id = lb / lbs_per_max_io_size % ssd_count; 309 | local_lb = lb / lbs_per_max_io_size / ssd_count * lbs_per_max_io_size; 310 | if (local_lb >= h_ssd_num_lbs[ssd_id]) 311 | { 312 | AEOLUS_LOG_ERROR("Out of bound start LB %lu", lb); 313 | exit(-1); 314 | } 315 | } 316 | 317 | void Controller::read_data(uint64_t start_lb, uint64_t num_lb, void *buf) 318 | { 319 | int ssd_id; 320 | uint64_t local_lb; 321 | lb_to_ssd_id(start_lb, ssd_id, local_lb); 322 | rw_data_kernel<<<1, 1>>>( 323 | NVME_OPCODE_READ, ssd_id, local_lb, num_lb, num_queue_per_ssd, 324 | d_ssdqp, d_iobuf_phys, d_prp_phys, queue_depth, max_io_size, buf_type 325 | ); 326 | if (buf_type == AEOLUS_BUF_USER) { 327 | AEOLUS_CUDA_CHECK(cudaMemcpy( 328 | buf, (uint8_t *)d_iobuf_ptr + (uint64_t)ssd_id * num_queue_per_ssd * 329 | queue_depth * max_io_size, 330 | num_lb * AEOLUS_LB_SIZE, cudaMemcpyDeviceToHost 331 | )); 332 | } else { 333 | // TODO! 334 | } 335 | } 336 | 337 | void Controller::write_data(uint64_t start_lb, uint64_t num_lb, void *buf) 338 | { 339 | int ssd_id; 340 | uint64_t local_lb; 341 | lb_to_ssd_id(start_lb, ssd_id, local_lb); 342 | if (buf_type == AEOLUS_BUF_USER) { 343 | AEOLUS_CUDA_CHECK(cudaMemcpy( 344 | (uint8_t *)d_iobuf_ptr + (uint64_t)ssd_id * num_queue_per_ssd * 345 | queue_depth * max_io_size, 346 | buf, num_lb * AEOLUS_LB_SIZE, cudaMemcpyHostToDevice 347 | )); 348 | } else { 349 | // TODO! 350 | } 351 | rw_data_kernel<<<1, 1>>>( 352 | NVME_OPCODE_WRITE, ssd_id, local_lb, num_lb, num_queue_per_ssd, 353 | d_ssdqp, d_iobuf_phys, d_prp_phys, queue_depth, max_io_size, buf_type 354 | ); 355 | // AEOLUS_CUDA_CHECK(cudaDeviceSynchronize()); 356 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller_decouple.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | 3 | __device__ static int req_id_to_ssd_id(int req_id, int num_ssds, int *ssd_num_reqs_prefix_sum) 4 | { 5 | int ssd_id = 0; 6 | for (; ssd_id < num_ssds; ssd_id++) 7 | if (ssd_num_reqs_prefix_sum[ssd_id] > req_id) 8 | break; 9 | return ssd_id; 10 | } 11 | 12 | // Do NOT use std::pair in device function! 
Though this can be bypassed by --expt-relaxed-constexpr flag, 13 | // it may contain bugs. 14 | __device__ static void lb_to_ssd_id(uint64_t lb, int num_ssds, uint64_t *ssd_num_lbs, int max_io_size, int &ssd_id, uint64_t &start_lb) 15 | { 16 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 17 | assert(lb % lbs_per_max_io_size == 0); 18 | ssd_id = lb / lbs_per_max_io_size % num_ssds; 19 | start_lb = lb / lbs_per_max_io_size / num_ssds * lbs_per_max_io_size; 20 | assert(start_lb < ssd_num_lbs[ssd_id]); 21 | } 22 | 23 | __global__ static void submit_io_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *prp2, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size, uint32_t opcode, aeolus_buf_type buf_type) 24 | { 25 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 26 | int num_threads = blockDim.x * gridDim.x; 27 | for (int i = thread_id; i < num_reqs; i += num_threads) 28 | { 29 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 30 | if (ssd_id >= num_ssds) 31 | break; 32 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 33 | int queue_id = req_offset / (queue_depth - 1); 34 | if (queue_id >= num_queues_per_ssd) 35 | printf("%d %d\n", queue_id, num_queues_per_ssd); 36 | assert(queue_id < num_queues_per_ssd); 37 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 38 | int id_in_queue = req_offset % (queue_depth - 1); 39 | int queue_pos = (ssdqp[global_queue_id].sq_tail + id_in_queue) % queue_depth; 40 | 41 | uint64_t global_pos = (uint64_t)global_queue_id * queue_depth + queue_pos; 42 | uint64_t io_addr; 43 | uint64_t io_addr2; 44 | if (buf_type == AEOLUS_BUF_USER) 45 | { 46 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 47 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 48 | } 49 | else 50 | { 51 | io_addr = reqs[i].dest_addr; 52 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 53 | global_pos = reqs[i].next_addr; // io_size > 8KB 54 | } 55 | if (reqs[i].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 56 | { 57 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 58 | uint64_t offset = global_pos * prp_size; 59 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 60 | } 61 | ssdqp[global_queue_id].fill_sq( 62 | ssdqp[global_queue_id].cmd_id + id_in_queue, // command id 63 | queue_pos, // position in SQ 64 | opcode, // opcode 65 | io_addr, // prp1 66 | io_addr2, // prp2 67 | reqs[i].start_lb & 0xffffffff, // start lb low 68 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 69 | NVME_RW_LIMITED_RETRY_MASK | (reqs[i].num_items - 1), // number of LBs 70 | i // req id 71 | ); 72 | } 73 | } 74 | 75 | __global__ static void ring_sq_doorbell_kernel(int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int num_reqs, int queue_depth) 76 | { 77 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 78 | int num_threads = blockDim.x * gridDim.x; 79 | for (int i = thread_id; i < num_reqs; i += num_threads) 80 | { 81 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 82 | if (ssd_id >= num_ssds) 83 | break; 84 | int req_offset = i - (ssd_id == 0 ? 
0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 85 | int queue_id = req_offset / (queue_depth - 1); 86 | assert(queue_id < num_queues_per_ssd); 87 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 88 | int id_in_queue = req_offset % (queue_depth - 1); 89 | 90 | if (id_in_queue == 0) 91 | { 92 | int cnt = ssd_num_reqs[ssd_id] - queue_id * (queue_depth - 1); 93 | if (cnt > queue_depth - 1) 94 | cnt = queue_depth - 1; 95 | ssdqp[global_queue_id].cmd_id += cnt; 96 | ssdqp[global_queue_id].sq_tail = (ssdqp[global_queue_id].sq_tail + cnt) % queue_depth; 97 | // printf("thread %d ssd %d queue %d end req %d cnt %d\n", thread_id, ssd_id, queue_id, ssd_num_reqs_prefix_sum[ssd_id], cnt); 98 | *ssdqp[global_queue_id].sqtdbl = ssdqp[global_queue_id].sq_tail; 99 | } 100 | } 101 | } 102 | 103 | __global__ static void copy_io_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 104 | { 105 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 106 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 107 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 108 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 109 | for (int i = warp_id; i < num_reqs; i += num_warps) 110 | { 111 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 112 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 113 | int queue_id = req_offset / (queue_depth - 1); 114 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 115 | int id_in_queue = req_offset % (queue_depth - 1); 116 | int complete_id = ssdqp[global_queue_id].num_completed + id_in_queue; 117 | int queue_pos = complete_id % queue_depth; 118 | 119 | if (lane_id == 0) 120 | { 121 | // printf("polling req %d ssd %d queue %d complete_id %d queue_pos %d num_completed %d\n", i, ssd_id, queue_id, complete_id, queue_pos, ssdqp[global_queue_id].num_completed); 122 | uint32_t current_phase = (complete_id / queue_depth) & 1; 123 | while (((ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 124 | ; 125 | uint32_t status = ssdqp[global_queue_id].cq[queue_pos * 4 + 3]; 126 | uint32_t cmd_id = status & NVME_ENTRY_CID_MASK; 127 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 128 | { 129 | AEOLUS_LOG_ERROR("thread %d cq[%d] status: 0x%x, cid: %d\n", thread_id, queue_pos, (status >> 17) & NVME_ENTRY_SC_MASK, cmd_id); 130 | assert(0); 131 | } 132 | } 133 | 134 | if (buf_type == AEOLUS_BUF_USER) 135 | { 136 | int cmd_id = ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_CID_MASK; 137 | int req_id = ssdqp[global_queue_id].cmd_id_to_req_id[cmd_id % queue_depth]; 138 | int sq_pos = ssdqp[global_queue_id].cmd_id_to_sq_pos[cmd_id % queue_depth]; 139 | for (int j = lane_id; j < reqs[req_id].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 140 | ((uint64_t *)reqs[req_id].dest_addr)[j] = IO_buf_base[(uint64_t)global_queue_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + j]; 141 | } 142 | } 143 | } 144 | 145 | __global__ static void ring_cq_doorbell_kernel(int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int num_reqs, int queue_depth) 146 | { 147 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 148 | int num_threads = blockDim.x * gridDim.x; 149 | for (int i = thread_id; i < num_reqs; i += num_threads) 150 | { 151 | int ssd_id = req_id_to_ssd_id(i, 
num_ssds, ssd_num_reqs_prefix_sum); 152 | if (ssd_id >= num_ssds) 153 | break; 154 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 155 | int queue_id = req_offset / (queue_depth - 1); 156 | assert(queue_id < num_queues_per_ssd); 157 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 158 | int id_in_queue = req_offset % (queue_depth - 1); 159 | 160 | if (id_in_queue == 0) 161 | { 162 | int cnt = ssd_num_reqs[ssd_id] - queue_id * (queue_depth - 1); 163 | if (cnt > queue_depth - 1) 164 | cnt = queue_depth - 1; 165 | ssdqp[global_queue_id].num_completed += cnt; 166 | ssdqp[global_queue_id].cq_head = (ssdqp[global_queue_id].cq_head + cnt) % queue_depth; 167 | *ssdqp[global_queue_id].cqhdbl = ssdqp[global_queue_id].cq_head; 168 | // printf("queue %d num_completed %d cq_head %d\n", global_queue_id, ssdqp[global_queue_id].num_completed, ssdqp[global_queue_id].cq_head); 169 | } 170 | } 171 | } 172 | 173 | __global__ static void copy_write_data_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size) 174 | { 175 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 176 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 177 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 178 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 179 | for (int i = warp_id; i < num_reqs; i += num_warps) 180 | { 181 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 182 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 183 | int queue_id = req_offset / (queue_depth - 1); 184 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 185 | int id_in_queue = req_offset % (queue_depth - 1); 186 | int queue_pos = (ssdqp[global_queue_id].sq_tail + id_in_queue) % queue_depth; 187 | 188 | for (int j = lane_id; j < reqs[i].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 189 | IO_buf_base[(uint64_t)global_queue_id * queue_depth * max_io_size / 8 + queue_pos * max_io_size / 8 + j] = ((uint64_t *)reqs[i].dest_addr)[j]; 190 | } 191 | } 192 | 193 | __global__ static void poll_write_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size) 194 | { 195 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 196 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 197 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 198 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 199 | for (int i = warp_id; i < num_reqs; i += num_warps) 200 | { 201 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 202 | int req_offset = i - (ssd_id == 0 ? 
0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 203 | int queue_id = req_offset / (queue_depth - 1); 204 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 205 | int id_in_queue = req_offset % (queue_depth - 1); 206 | int complete_id = ssdqp[global_queue_id].num_completed + id_in_queue; 207 | int queue_pos = complete_id % queue_depth; 208 | 209 | if (lane_id == 0) 210 | { 211 | // printf("polling req %d ssd %d queue %d complete_id %d queue_pos %d num_completed %d\n", i, ssd_id, queue_id, complete_id, queue_pos, ssdqp[global_queue_id].num_completed); 212 | uint32_t current_phase = (complete_id / queue_depth) & 1; 213 | while (((ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 214 | ; 215 | uint32_t status = ssdqp[global_queue_id].cq[queue_pos * 4 + 3]; 216 | uint32_t cmd_id = status & NVME_ENTRY_CID_MASK; 217 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 218 | { 219 | AEOLUS_LOG_ERROR("thread %d cq[%d] status: 0x%x, cid: %d\n", thread_id, queue_pos, (status >> 17) & NVME_ENTRY_SC_MASK, cmd_id); 220 | assert(0); 221 | } 222 | } 223 | } 224 | } 225 | 226 | __global__ static void preprocess_io_req_1(Request *reqs, int num_reqs, int num_ssds, int *ssd_num_reqs, uint64_t *ssd_num_lbs, int max_io_size) 227 | { 228 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 229 | int num_threads = blockDim.x * gridDim.x; 230 | for (int i = tid; i < num_reqs; i += num_threads) 231 | { 232 | int ssd_id; 233 | uint64_t start_lb; // Not used. 234 | lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb); 235 | // assert(ssd_id < num_ssds); 236 | if (ssd_id < num_ssds && ssd_id >= 0) 237 | { 238 | atomicAdd(&ssd_num_reqs[ssd_id], 1); 239 | } 240 | else 241 | { 242 | AEOLUS_LOG_ERROR("ssd_id out of bound: %d\n", ssd_id); 243 | } 244 | } 245 | } 246 | 247 | __global__ static void preprocess_io_req_2(int num_ssds, int num_queues_per_ssd, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int queue_depth) 248 | { 249 | for (int i = 0; i < num_ssds; i++) 250 | { 251 | // assert(ssd_num_reqs[i] <= num_queues_per_ssd * (queue_depth - 1)); 252 | if (ssd_num_reqs[i] > num_queues_per_ssd * (queue_depth - 1)) 253 | { 254 | AEOLUS_LOG_ERROR("ssd_num_reqs[%d]: %d\n", i, ssd_num_reqs[i]); 255 | } 256 | ssd_num_reqs_prefix_sum[i] = ssd_num_reqs[i]; 257 | if (i > 0) 258 | ssd_num_reqs_prefix_sum[i] += ssd_num_reqs_prefix_sum[i - 1]; 259 | } 260 | } 261 | 262 | __global__ static void distribute_io_req_1(int num_ssds, int *ssd_num_reqs_prefix_sum, int *req_ids) 263 | { 264 | for (int i = 0; i < num_ssds; i++) 265 | req_ids[i] = i ? 
ssd_num_reqs_prefix_sum[i - 1] : 0; 266 | } 267 | 268 | __global__ static void distribute_io_req_2(Request *reqs, int num_reqs, int num_ssds, Request *distributed_reqs, int *req_ids, uint64_t *ssd_num_lbs, int max_io_size) 269 | { 270 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 271 | int num_threads = blockDim.x * gridDim.x; 272 | for (int i = tid; i < num_reqs; i += num_threads) 273 | { 274 | int ssd_id; 275 | uint64_t start_lb; 276 | lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb); 277 | // assert(ssd_id < num_ssds); 278 | if (ssd_id < num_ssds && ssd_id >= 0) 279 | { 280 | int req_id = atomicAdd(&req_ids[ssd_id], 1); 281 | distributed_reqs[req_id] = reqs[i]; 282 | distributed_reqs[req_id].start_lb = start_lb; 283 | } 284 | } 285 | } 286 | 287 | __global__ static void distribute_io_req_3(int num_ssds, int *ssd_num_reqs_prefix_sum, int *req_ids) 288 | { 289 | for (int i = 0; i < num_ssds; i++) 290 | { 291 | if (req_ids[i] != ssd_num_reqs_prefix_sum[i]) 292 | { 293 | AEOLUS_LOG_ERROR("req id %d %d\n", req_ids[i], ssd_num_reqs_prefix_sum[i]); 294 | } 295 | // assert(req_ids[i] == ssd_num_reqs_prefix_sum[i]); 296 | } 297 | } 298 | 299 | void ControllerDecoupled::submit_io_req(Request *reqs, int num_reqs, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys) 300 | { 301 | if (num_reqs > AEOLUS_MAX_NUM_REQUESTS) 302 | { 303 | AEOLUS_LOG_ERROR("num_reqs %d > AEOLUS_MAX_NUM_REQUESTS %d", num_reqs, AEOLUS_MAX_NUM_REQUESTS); 304 | exit(1); 305 | } 306 | if (num_reqs > ssd_count * num_queue_per_ssd * queue_depth) 307 | { 308 | AEOLUS_LOG_ERROR("num_reqs %d > ssd_count %d * num_queue_per_ssd %d * queue_depth %d", num_reqs, ssd_count, num_queue_per_ssd, queue_depth); 309 | exit(1); 310 | } 311 | AEOLUS_CUDA_CHECK(cudaMemsetAsync(ssd_num_reqs, 0, sizeof(int) * ssd_count, stream)); 312 | int num_blocks = 32; 313 | preprocess_io_req_1<<>>(reqs, num_reqs, ssd_count, ssd_num_reqs, d_ssd_num_lbs, max_io_size); 314 | preprocess_io_req_2<<<1, 1, 0, stream>>>(ssd_count, num_queue_per_ssd, ssd_num_reqs, ssd_num_reqs_prefix_sum, queue_depth); 315 | distribute_io_req_1<<<1, 1, 0, stream>>>(ssd_count, ssd_num_reqs_prefix_sum, req_ids); 316 | distribute_io_req_2<<>>(reqs, num_reqs, ssd_count, distributed_reqs, req_ids, d_ssd_num_lbs, max_io_size); 317 | distribute_io_req_3<<<1, 1, 0, stream>>>(ssd_count, ssd_num_reqs_prefix_sum, req_ids); 318 | uint32_t opcode = NVME_OPCODE_READ; 319 | if (dir == AEOLUS_DIR_WRITE) 320 | { 321 | opcode = NVME_OPCODE_WRITE; 322 | if (buf_type == AEOLUS_BUF_USER) 323 | copy_write_data_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size); 324 | } 325 | if (d_prp_phys == nullptr) 326 | d_prp_phys = this->d_prp_phys; 327 | submit_io_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, d_prp_phys, ssd_num_reqs_prefix_sum, queue_depth, max_io_size, opcode, buf_type); 328 | ring_sq_doorbell_kernel<<>>(ssd_count, num_queue_per_ssd, d_ssdqp, ssd_num_reqs, ssd_num_reqs_prefix_sum, num_reqs, queue_depth); 329 | this->num_reqs = num_reqs; 330 | this->stream = stream; 331 | this->dir = dir; 332 | } 333 | 334 | void ControllerDecoupled::poll() 335 | { 336 | int num_blocks = 32; 337 | if (dir == AEOLUS_DIR_READ) 338 | copy_io_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size, buf_type); 339 | 
else 340 | poll_write_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size); 341 | ring_cq_doorbell_kernel<<>>(ssd_count, num_queue_per_ssd, d_ssdqp, ssd_num_reqs, ssd_num_reqs_prefix_sum, num_reqs, queue_depth); 342 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller_legacy.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | 3 | __device__ static void lb_to_ssd_id(uint64_t lb, int num_ssds, uint64_t *ssd_num_lbs, int max_io_size, int &ssd_id, uint64_t &start_lb) 4 | { 5 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 6 | assert(lb % lbs_per_max_io_size == 0); 7 | ssd_id = lb / lbs_per_max_io_size % num_ssds; 8 | start_lb = lb / lbs_per_max_io_size / num_ssds * lbs_per_max_io_size; 9 | assert(start_lb < ssd_num_lbs[ssd_id]); 10 | } 11 | 12 | __global__ static void do_read_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *IO_buf_base, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 13 | { 14 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 15 | int warp_id = thread_id / AEOLUS_WARP_SIZE; // global queue id 16 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 17 | int ssd_id = warp_id / num_warps_per_ssd; 18 | if (ssd_id >= num_ssds) 19 | return; 20 | 21 | // submit first page of double buffer 22 | assert(thread_id < num_reqs); 23 | int base_req_id = thread_id - lane_id; 24 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 25 | 26 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 27 | uint64_t io_addr; 28 | uint64_t io_addr2; 29 | if (buf_type == AEOLUS_BUF_USER) 30 | { 31 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 32 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 33 | } 34 | else 35 | { 36 | io_addr = reqs[thread_id].dest_addr; 37 | io_addr2 = reqs[thread_id].next_addr; // io_size <= 8KB 38 | global_pos = reqs[thread_id].next_addr; // io_size > 8KB 39 | } 40 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 41 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 42 | { 43 | uint64_t offset = global_pos * prp_size; 44 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 45 | } 46 | if (lane_id == 0) 47 | { 48 | // ssdqp[warp_id].cmd_id = 0; 49 | // printf("queue %d cmd_id %d\n", warp_id, ssdqp[warp_id].cmd_id); 50 | for (int i = 0; i < queue_depth; i++) 51 | ssdqp[warp_id].sq_entry_busy[i] = false; 52 | } 53 | int num_lbs = reqs[thread_id].num_items ? 
reqs[thread_id].num_items - 1 : 0; 54 | ssdqp[warp_id].fill_sq( 55 | ssdqp[warp_id].cmd_id + lane_id, // command id 56 | sq_pos, // position in SQ 57 | NVME_OPCODE_READ, // opcode 58 | io_addr, // prp1 59 | io_addr2, // prp2 60 | reqs[thread_id].start_lb & 0xffffffff, // start lb low 61 | (reqs[thread_id].start_lb >> 32) & 0xffffffff, // start lb high 62 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 63 | thread_id // req id 64 | ); 65 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, thread_id, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 66 | 67 | __threadfence_system(); 68 | // __syncwarp(); 69 | if (lane_id == 0) 70 | { 71 | ssdqp[warp_id].cmd_id += AEOLUS_WARP_SIZE; 72 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + AEOLUS_WARP_SIZE) % queue_depth; 73 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 74 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 75 | } 76 | 77 | int stride = num_ssds * num_warps_per_ssd * AEOLUS_WARP_SIZE; 78 | for (int i = thread_id + stride; i < num_reqs + stride; i += stride) 79 | { 80 | int prev_sq_tail = ssdqp[warp_id].sq_tail; 81 | base_req_id = i - lane_id; // first req_id in warp 82 | if (i < num_reqs) 83 | { 84 | // submit second page of double buffer 85 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 86 | 87 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 88 | uint64_t io_addr; 89 | uint64_t io_addr2; 90 | if (buf_type == AEOLUS_BUF_USER) 91 | { 92 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 93 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 94 | } 95 | else 96 | { 97 | io_addr = reqs[i].dest_addr; 98 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 99 | global_pos = reqs[i].next_addr; // io_size > 8KB 100 | } 101 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 102 | { 103 | uint64_t offset = global_pos * prp_size; 104 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 105 | } 106 | assert(ssdqp[warp_id].sq_entry_busy[sq_pos] == false); 107 | // if (i >= stride * 4 && !req_processed[i - stride * 4]) 108 | // { 109 | // printf("thread %d req_id %d not processed\n", thread_id, i - stride * 4); 110 | // for (int i = 0; i < ssdqp[warp_id].cmd_id; i++) 111 | // { 112 | // int req_id = ssdqp[warp_id].cmd_id_to_req_id[i]; 113 | // int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[i]; 114 | // if (req_id != 0xffffffff) 115 | // printf("thread %d cmd_id %d req_id %d processed %d sq_pos %d busy %d\n", thread_id, i, req_id, req_processed[req_id], sq_pos, ssdqp[warp_id].sq_entry_busy[sq_pos]); 116 | // } 117 | // assert(0); 118 | // } 119 | int num_lbs = reqs[i].num_items ? 
reqs[i].num_items - 1 : 0; 120 | ssdqp[warp_id].fill_sq( 121 | ssdqp[warp_id].cmd_id + lane_id, // command id 122 | sq_pos, // position in SQ 123 | NVME_OPCODE_READ, // opcode 124 | io_addr, // prp1 125 | io_addr2, // prp2 126 | reqs[i].start_lb & 0xffffffff, // start lb low 127 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 128 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 129 | i // req id 130 | ); 131 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, i, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 132 | 133 | __threadfence_system(); 134 | // __syncwarp(); 135 | if (lane_id == 0) 136 | { 137 | int cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 138 | assert(cnt == AEOLUS_WARP_SIZE); 139 | ssdqp[warp_id].cmd_id += cnt; 140 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + cnt) % queue_depth; 141 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 142 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 143 | } 144 | } 145 | 146 | // poll and copy the *previous* page of double buffer 147 | int prev_cq_head = ssdqp[warp_id].cq_head; 148 | if (lane_id == 0) 149 | { 150 | uint32_t code; 151 | ssdqp[warp_id].poll_range(code, prev_sq_tail, i < num_reqs); 152 | assert(code == 0); 153 | if (i + stride < num_reqs) 154 | { 155 | base_req_id += stride; 156 | int next_cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 157 | for (int j = 0; j < next_cnt; j++) 158 | { 159 | int sq_pos = (ssdqp[warp_id].sq_tail + j) % queue_depth; 160 | if (ssdqp[warp_id].sq_entry_busy[sq_pos]) 161 | { 162 | ssdqp[warp_id].poll_until_sq_entry_free(code, sq_pos); 163 | assert(code == 0); 164 | } 165 | } 166 | } 167 | } 168 | 169 | if (buf_type == AEOLUS_BUF_USER) 170 | { 171 | // copy data from IO buffer to app buffer 172 | for (int j = prev_cq_head; j != ssdqp[warp_id].cq_head; j = (j + 1) % queue_depth) 173 | { 174 | int cmd_id = (ssdqp[warp_id].cq[j * 4 + 3] & NVME_ENTRY_CID_MASK) % queue_depth; 175 | int req_id = ssdqp[warp_id].cmd_id_to_req_id[cmd_id]; 176 | int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[cmd_id]; 177 | for (int k = lane_id; k < reqs[req_id].num_items * AEOLUS_LB_SIZE / 8; k += AEOLUS_WARP_SIZE) 178 | ((uint64_t *)reqs[req_id].dest_addr)[k] = IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + k]; 179 | } 180 | } 181 | } 182 | } 183 | 184 | __global__ static void do_write_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *IO_buf_base, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 185 | { 186 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 187 | int warp_id = thread_id / AEOLUS_WARP_SIZE; // global queue id 188 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 189 | int ssd_id = warp_id / num_warps_per_ssd; 190 | if (ssd_id >= num_ssds) 191 | return; 192 | 193 | // submit first page of double buffer 194 | assert(thread_id < num_reqs); 195 | int base_req_id = thread_id - lane_id; 196 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 197 | 198 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 199 | uint64_t io_addr; 200 | uint64_t io_addr2; 201 | if (buf_type == AEOLUS_BUF_USER) 202 | { 203 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
204 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 205 | } 206 | else 207 | { 208 | io_addr = reqs[thread_id].dest_addr; 209 | io_addr2 = reqs[thread_id].next_addr; // io_size <= 8KB 210 | global_pos = reqs[thread_id].next_addr; // io_size > 8KB 211 | } 212 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 213 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 214 | { 215 | uint64_t offset = global_pos * prp_size; 216 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 217 | } 218 | if (lane_id == 0) 219 | { 220 | // ssdqp[warp_id].cmd_id = 0; 221 | // printf("queue %d cmd_id %d\n", warp_id, ssdqp[warp_id].cmd_id); 222 | for (int i = 0; i < queue_depth; i++) 223 | ssdqp[warp_id].sq_entry_busy[i] = false; 224 | } 225 | int num_lbs = reqs[thread_id].num_items ? reqs[thread_id].num_items - 1 : 0; 226 | ssdqp[warp_id].fill_sq( 227 | ssdqp[warp_id].cmd_id + lane_id, // command id 228 | sq_pos, // position in SQ 229 | NVME_OPCODE_WRITE, // opcode 230 | io_addr, // prp1 231 | io_addr2, // prp2 232 | reqs[thread_id].start_lb & 0xffffffff, // start lb low 233 | (reqs[thread_id].start_lb >> 32) & 0xffffffff, // start lb high 234 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 235 | thread_id // req id 236 | ); 237 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, thread_id, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 238 | 239 | if (buf_type == AEOLUS_BUF_USER) 240 | { 241 | for (int i = base_req_id; i < base_req_id + AEOLUS_WARP_SIZE; i++) 242 | for (int j = lane_id; j < reqs[i].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 243 | { 244 | int sq_pos = (ssdqp[warp_id].sq_tail + i - base_req_id) % queue_depth; 245 | IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + j] = ((uint64_t *)reqs[i].dest_addr)[j]; 246 | } 247 | } 248 | 249 | __threadfence_system(); 250 | // __syncwarp(); 251 | if (lane_id == 0) 252 | { 253 | ssdqp[warp_id].cmd_id += AEOLUS_WARP_SIZE; 254 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + AEOLUS_WARP_SIZE) % queue_depth; 255 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 256 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 257 | } 258 | 259 | int stride = num_ssds * num_warps_per_ssd * AEOLUS_WARP_SIZE; 260 | for (int i = thread_id + stride; i < num_reqs + stride; i += stride) 261 | { 262 | int prev_sq_tail = ssdqp[warp_id].sq_tail; 263 | base_req_id = i - lane_id; // first req_id in warp 264 | if (i < num_reqs) 265 | { 266 | // submit second page of double buffer 267 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 268 | 269 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 270 | uint64_t io_addr; 271 | uint64_t io_addr2; 272 | if (buf_type == AEOLUS_BUF_USER) 273 | { 274 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
275 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 276 | } 277 | else 278 | { 279 | io_addr = reqs[i].dest_addr; 280 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 281 | global_pos = reqs[i].next_addr; // io_size > 8KB 282 | } 283 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 284 | { 285 | uint64_t offset = global_pos * prp_size; 286 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 287 | } 288 | assert(ssdqp[warp_id].sq_entry_busy[sq_pos] == false); 289 | // if (i >= stride * 4 && !req_processed[i - stride * 4]) 290 | // { 291 | // printf("thread %d req_id %d not processed\n", thread_id, i - stride * 4); 292 | // for (int i = 0; i < ssdqp[warp_id].cmd_id; i++) 293 | // { 294 | // int req_id = ssdqp[warp_id].cmd_id_to_req_id[i]; 295 | // int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[i]; 296 | // if (req_id != 0xffffffff) 297 | // printf("thread %d cmd_id %d req_id %d processed %d sq_pos %d busy %d\n", thread_id, i, req_id, req_processed[req_id], sq_pos, ssdqp[warp_id].sq_entry_busy[sq_pos]); 298 | // } 299 | // assert(0); 300 | // } 301 | int num_lbs = reqs[i].num_items ? reqs[i].num_items - 1 : 0; 302 | ssdqp[warp_id].fill_sq( 303 | ssdqp[warp_id].cmd_id + lane_id, // command id 304 | sq_pos, // position in SQ 305 | NVME_OPCODE_WRITE, // opcode 306 | io_addr, // prp1 307 | io_addr2, // prp2 308 | reqs[i].start_lb & 0xffffffff, // start lb low 309 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 310 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 311 | i // req id 312 | ); 313 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, i, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 314 | 315 | if (buf_type == AEOLUS_BUF_USER) 316 | { 317 | for (int j = base_req_id; j < base_req_id + AEOLUS_WARP_SIZE; j++) 318 | for (int k = lane_id; k < reqs[j].num_items * AEOLUS_LB_SIZE / 8; k += AEOLUS_WARP_SIZE) 319 | { 320 | int sq_pos = (ssdqp[warp_id].sq_tail + j - base_req_id) % queue_depth; 321 | IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + k] = ((uint64_t *)reqs[j].dest_addr)[k]; 322 | } 323 | } 324 | 325 | __threadfence_system(); 326 | // __syncwarp(); 327 | if (lane_id == 0) 328 | { 329 | int cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 330 | assert(cnt == AEOLUS_WARP_SIZE); 331 | ssdqp[warp_id].cmd_id += cnt; 332 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + cnt) % queue_depth; 333 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 334 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 335 | } 336 | } 337 | 338 | // poll and copy the *previous* page of double buffer 339 | if (lane_id == 0) 340 | { 341 | uint32_t code; 342 | ssdqp[warp_id].poll_range(code, prev_sq_tail, i < num_reqs); 343 | assert(code == 0); 344 | if (i + stride < num_reqs) 345 | { 346 | base_req_id += stride; 347 | int next_cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? 
362 | __global__ static void preprocess_io_req_1(Request *reqs, int num_reqs, int num_ssds, int *ssd_num_reqs, uint64_t *ssd_num_lbs, int max_io_size)
363 | {
364 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
365 |     int num_threads = blockDim.x * gridDim.x;
366 |     for (int i = tid; i < num_reqs; i += num_threads)
367 |     {
368 |         int ssd_id;
369 |         uint64_t start_lb; // Not used.
370 |         lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb);
371 |         assert(ssd_id < num_ssds);
372 |         atomicAdd(&ssd_num_reqs[ssd_id], 1);
373 |     }
374 | }
375 |
376 | __global__ static void preprocess_io_req_2(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, int *ssd_num_reqs, int *num_distributed_reqs)
377 | {
378 |     int max_bucket = 0;
379 |     for (int i = 0; i < num_ssds; i++)
380 |         if (ssd_num_reqs[i] > max_bucket)
381 |             max_bucket = ssd_num_reqs[i];
382 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
383 |     max_bucket = (max_bucket + num_reqs_per_chunk - 1) / num_reqs_per_chunk * num_reqs_per_chunk;
384 |     *num_distributed_reqs = max_bucket * num_ssds;
385 | }
386 |
387 | __global__ static void distribute_io_req_1(int num_ssds, int num_warps_per_ssd, int *req_ids)
388 | {
389 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
390 |     for (int i = 0; i < num_ssds; i++)
391 |         req_ids[i] = i * num_reqs_per_chunk;
392 | }
393 |
394 | __global__ static void distribute_io_req_2(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, Request *distributed_reqs, int *req_ids, uint64_t *ssd_num_lbs, int max_io_size)
395 | {
396 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
397 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
398 |     int num_threads = blockDim.x * gridDim.x;
399 |     for (int i = tid; i < num_reqs; i += num_threads)
400 |     {
401 |         int ssd_id;
402 |         uint64_t start_lb;
403 |         lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb);
404 |         assert(ssd_id < num_ssds);
405 |         int req_id;
406 |         for (;;)
407 |         {
408 |             req_id = req_ids[ssd_id];
409 |             int next_req_id = req_id + 1;
410 |             if (next_req_id % num_reqs_per_chunk == 0)
411 |                 next_req_id += num_reqs_per_chunk * (num_ssds - 1);
412 |             if (atomicCAS(&req_ids[ssd_id], req_id, next_req_id) == req_id)
413 |                 break;
414 |         }
415 |         distributed_reqs[req_id] = reqs[i];
416 |         distributed_reqs[req_id].start_lb = start_lb;
417 |     }
418 | }
419 |
420 | __global__ static void distribute_io_req_3(int num_ssds, int num_warps_per_ssd, Request *distributed_reqs, int *req_ids, int *num_distributed_reqs)
421 | {
422 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
423 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
424 |     int num_threads = blockDim.x * gridDim.x;
425 |     for (int i = tid; i < num_ssds; i += num_threads)
426 |         for (int j = req_ids[i]; j < *num_distributed_reqs;)
427 |         {
428 |             distributed_reqs[j].num_items = 0;
429 |             distributed_reqs[j++].start_lb = 0;
430 |             if (j % num_reqs_per_chunk == 0)
431 |                 j += num_reqs_per_chunk * (num_ssds - 1);
432 |         }
433 | }
434 |
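// Host-side entry point for the legacy controller: resets the per-SSD request
// counters, runs the distribution kernels above to build the padded request
// array, then launches do_read_req_kernel or do_write_req_kernel with one warp
// per SSD queue. If the caller passes d_prp_phys == nullptr, the controller's
// own PRP table is used.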
435 | void ControllerLegacy::submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys) {
436 |     if (num_req > AEOLUS_MAX_NUM_REQUESTS)
437 |     {
438 |         AEOLUS_LOG_ERROR("num_reqs %d > AEOLUS_MAX_NUM_REQUESTS %d", num_req, AEOLUS_MAX_NUM_REQUESTS);
439 |         return;
440 |     }
441 |     AEOLUS_CUDA_CHECK(cudaMemsetAsync(ssd_num_reqs, 0, sizeof(int) * ssd_count, stream));
442 |     int num_blocks = 8;
443 |     preprocess_io_req_1<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(req, num_req, ssd_count, ssd_num_reqs, d_ssd_num_lbs, max_io_size);
444 |     int *num_distributed_reqs;
445 |     AEOLUS_CUDA_CHECK(cudaMalloc(&num_distributed_reqs, sizeof(int)));
446 |     preprocess_io_req_2<<<1, 1, 0, stream>>>(req, num_req, ssd_count, num_queue_per_ssd, ssd_num_reqs, num_distributed_reqs);
447 |     distribute_io_req_1<<<1, 1, 0, stream>>>(ssd_count, num_queue_per_ssd, req_ids);
448 |     distribute_io_req_2<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(req, num_req, ssd_count, num_queue_per_ssd, distributed_reqs, req_ids, d_ssd_num_lbs, max_io_size);
449 |     distribute_io_req_3<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(ssd_count, num_queue_per_ssd, distributed_reqs, req_ids, num_distributed_reqs);
450 |
451 |     int h_num_distributed_reqs;
452 |     AEOLUS_CUDA_CHECK(cudaMemcpy(&h_num_distributed_reqs, num_distributed_reqs, sizeof(int), cudaMemcpyDeviceToHost));
453 |     int num_threads = ssd_count * num_queue_per_ssd * AEOLUS_WARP_SIZE;
454 |     num_blocks = CEIL(num_threads, AEOLUS_NUM_THREADS_PER_BLOCK);
455 |     if (d_prp_phys == nullptr)
456 |         d_prp_phys = this->d_prp_phys;
457 |     if (dir == AEOLUS_DIR_READ)
458 |         do_read_req_kernel<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(distributed_reqs, h_num_distributed_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, (uint64_t *)d_iobuf_ptr, d_prp_phys, queue_depth, max_io_size, buf_type);
459 |     else
460 |         do_write_req_kernel<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(distributed_reqs, h_num_distributed_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, (uint64_t *)d_iobuf_ptr, d_prp_phys, queue_depth, max_io_size, buf_type);
461 |     AEOLUS_CUDA_CHECK(cudaFree(num_distributed_reqs));
462 | }
--------------------------------------------------------------------------------