├── .gitignore ├── doc ├── pic │ └── GEMM_pic.png ├── INSTALL.md └── EXP.md ├── .gitmodules ├── src ├── applications │ └── gemm │ │ ├── include │ │ ├── lightbam.cuh │ │ ├── ioctl.h │ │ ├── request.cuh │ │ ├── device.cuh │ │ ├── log.cuh │ │ ├── gemm.cuh │ │ ├── queue.cuh │ │ ├── util.cuh │ │ └── controller.cuh │ │ ├── Makefile │ │ ├── test.sh │ │ ├── gemm │ │ ├── cam_gemm.cu │ │ └── spdk_gemm.cu │ │ └── src │ │ ├── device.cu │ │ ├── queue.cu │ │ ├── controller.cu │ │ ├── controller_decouple.cu │ │ └── controller_legacy.cu ├── GPU_memory_lib │ ├── Makefile │ ├── GPU_memory_management.hpp │ └── GPU_memory_management.cpp ├── benchmarks │ ├── CAM_variable_core_benchmark │ │ ├── Makefile │ │ ├── variable_core_test_read.cu │ │ └── variable_core_test_write.cu │ └── CAM_benchmark │ │ ├── Makefile │ │ ├── test_seq_write.cu │ │ ├── test_random_read.cu │ │ ├── test_random_write.cu │ │ └── test_seq_read.cu ├── CAM_variable_core_lib │ ├── Makefile │ ├── CAM_variable_core.h │ └── threadPool.h └── CAM_lib │ ├── Makefile │ ├── sample_read.cu │ ├── sample_write.cu │ ├── CAM_interface.h │ ├── threadPool.h │ ├── gpu_transfer.cuh │ └── gpu_transfer.cu ├── run_GEMM.sh └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | -------------------------------------------------------------------------------- /doc/pic/GEMM_pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RC4ML/CAM/HEAD/doc/pic/GEMM_pic.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "spdk"] 2 | path = spdk 3 | url = https://github.com/RC4ML/spdk.git 4 | [submodule "gdrcopy"] 5 | path = gdrcopy 6 | url = https://github.com/RC4ML/gdrcopy.git 7 | -------------------------------------------------------------------------------- /src/applications/gemm/include/lightbam.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_CUH__ 2 | #define __AEOLUS_CUH__ 3 | 4 | #include "controller.cuh" 5 | #include "device.cuh" 6 | #include "log.cuh" 7 | #include "queue.cuh" 8 | #include "util.cuh" 9 | #include "request.cuh" 10 | 11 | #endif -------------------------------------------------------------------------------- /run_GEMM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export AEOLUS_LOG_LEVEL=ERROR 3 | B=16k 4 | ssd=6 5 | IO=32k 6 | 7 | for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 8 | do 9 | for ((i=0; i<3; i++)) 10 | do 11 | .build/application/gemm/gemm-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 12 | if [ $? 
-ne 0 ]; then 13 | echo "Failed at $N" 14 | # exit 1 15 | fi 16 | done 17 | done 18 | echo "All tests done" -------------------------------------------------------------------------------- /src/applications/gemm/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | INCLUDE = -I./include 3 | FLAGS = -D__DEBUG__ -rdc=true -g -lcublas 4 | OUTPUT_DIR = ../../../build/application/gemm 5 | LIB_PATH = ../../../build/lib 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | 10 | gemm-test: src/queue.cu src/device.cu src/controller.cu src/controller_decouple.cu src/controller_legacy.cu gemm/cam_gemm.cu 11 | $(NVCC) $(INCLUDE) $(FLAGS) -I../../GPU_memory_lib -I ../../../src/CAM_lib -L../../../spdk/build/lib -L ../../../build/lib -lCAM_interface $^ -o $(OUTPUT_DIR)/$@ 12 | 13 | 14 | 15 | 16 | clean: 17 | rm -f gemm-test -------------------------------------------------------------------------------- /src/GPU_memory_lib/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | 3 | CUDAFLAGS = -arch=sm_80 -Xcompiler=-fPIC,-mavx512f 4 | CUDALDFLAGS = -lgdrapi -lcuda -lcudart 5 | 6 | # Output directory 7 | OUTPUT_DIR = ../../build/lib 8 | 9 | # Source and object files 10 | SOURCE1 = GPU_memory_management.cpp 11 | OBJECT1 = $(OUTPUT_DIR)/$(SOURCE1:.cpp=.o) 12 | TARGET1 = $(OUTPUT_DIR)/libgpu_memory_management.so 13 | 14 | # Default rule 15 | all: $(TARGET1) 16 | 17 | # Rule to compile the source file into an object file 18 | $(OBJECT1): $(SOURCE1) 19 | $(NVCC) $(CUDAFLAGS) -c $< -o $@ 20 | 21 | # Rule to link the object file and create the shared library 22 | $(TARGET1): $(OBJECT1) 23 | $(NVCC) -shared $(OBJECT1) -o $@ $(CUDALDFLAGS) 24 | 25 | # Clean rule to remove generated files 26 | clean: 27 | rm -f $(OBJECT1) $(TARGET1) -------------------------------------------------------------------------------- /src/applications/gemm/include/ioctl.h: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_IOCTL_H 2 | #define __AEOLUS_IOCTL_H 3 | #ifdef __linux__ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define NVM_IOCTL_TYPE 0x80 11 | 12 | 13 | 14 | /* Memory map request */ 15 | struct nvm_ioctl_map 16 | { 17 | uint64_t vaddr_start; 18 | size_t n_pages; 19 | uint64_t* ioaddrs; 20 | }; 21 | 22 | 23 | 24 | /* Supported operations */ 25 | enum nvm_ioctl_type 26 | { 27 | NVM_MAP_HOST_MEMORY = _IOW(NVM_IOCTL_TYPE, 1, struct nvm_ioctl_map), 28 | #ifdef _CUDA 29 | NVM_MAP_DEVICE_MEMORY = _IOW(NVM_IOCTL_TYPE, 2, struct nvm_ioctl_map), 30 | #endif 31 | NVM_UNMAP_MEMORY = _IOW(NVM_IOCTL_TYPE, 3, uint64_t) 32 | }; 33 | 34 | 35 | #endif /* __linux__ */ 36 | #endif -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/Makefile: -------------------------------------------------------------------------------- 1 | # Define variables for include, library, and output paths 2 | INCLUDE_PATH = ../../CAM_variable_core_lib 3 | LIB_PATH = ../../../build/lib 4 | OUTPUT_DIR = ../../../build/benchmarks/CAM_variable_core_benchmark 5 | 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | # Compiler 10 | NVCC = nvcc 11 | 12 | # Compiler flags 13 | NVCC_FLAGS = -I $(INCLUDE_PATH) -I../../GPU_memory_lib -L $(LIB_PATH) -L../../../spdk/build/lib -lCAM_variable_core 14 | 15 | 16 | 17 | 18 | TARGET1 = variable_core_test_read 19 | SOURCE1 
= variable_core_test_read.cu 20 | 21 | TARGET2 = variable_core_test_write 22 | SOURCE2 = variable_core_test_write.cu 23 | 24 | 25 | 26 | # Default rule 27 | all: $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) 28 | 29 | $(OUTPUT_DIR)/$(TARGET1): $(SOURCE1) 30 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 31 | 32 | $(OUTPUT_DIR)/$(TARGET2): $(SOURCE2) 33 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 34 | 35 | 36 | 37 | # Clean rule 38 | clean: 39 | rm -f $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) -------------------------------------------------------------------------------- /src/applications/gemm/include/request.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_REQUEST_CUH 2 | #define __AEOLUS_REQUEST_CUH 3 | 4 | #include 5 | #include "util.cuh" 6 | 7 | enum aeolus_access_dir 8 | { 9 | AEOLUS_DIR_READ = 0, 10 | AEOLUS_DIR_WRITE = 1 11 | }; 12 | 13 | class Request 14 | { 15 | public: 16 | uint64_t start_lb; // starting logical block 17 | uint64_t dest_addr; // physical address (pinned buffer) / virtual address (non-pinned buffer) of destination 18 | uint64_t next_addr; // only valid for pinned buffer, next page of dest_addr (io_size <= 8KB) / prp list offset (io_size > 8KB) 19 | int num_items; // number of logical blocks 20 | 21 | inline __host__ __device__ Request(uint64_t start_lb, int num_items) 22 | { 23 | this->start_lb = start_lb; 24 | this->num_items = num_items; 25 | // You may need to call cudaLimitMallocHeapSize beforehand in this implementation. 26 | // this->dest_addr = (uint64_t*)malloc(sizeof(uint64_t)*num_items); 27 | } 28 | 29 | inline __host__ __device__ bool operator<(const Request& other) const 30 | { 31 | return this->start_lb < other.start_lb; 32 | } 33 | 34 | inline __host__ __device__ ~Request() 35 | { 36 | // free(this->dest_addr); 37 | } 38 | }; 39 | 40 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export AEOLUS_LOG_LEVEL=ERROR 3 | B=16k 4 | ssd=6 5 | IO=32k 6 | # dd if=/dev/zero of=/share/data6/a bs=1M count=64K 7 | # dd if=/dev/zero of=/share/data6/b bs=1M count=64K 8 | #for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 9 | # for N in 32768 49152 65536 81920 98304 114688 131072 163840 196608 229376 262144 10 | for N in 32k 48k 64k 80k 96k 112k 128k 160k 192k 224k 256k 11 | do 12 | for ((i=0; i<1; i++)) 13 | do 14 | # ./gemm-test-pinned $N $N $N 0 256g 512g $B $IO $ssd 15 | # ./gemm-cublasxt $N $N $N 0 256g 512g $IO $ssd 16 | # ./gemm-cublas-gds $N $N $N 17 | # ~/bam/build.new/bin/nvm-gemm-bench --m=$N --n=$N --k=$N --a_offset=0 --b_offset=274877906944 --c_offset=549755813888 --block_size=16384 --page_size=32768 --blk_size=512 --queue_depth=4096 --pages=524288 --num_queues=128 --threads=4194304 --n_ctrls=$ssd --ssd=1 | grep result 18 | #./gemm-gds-no-batch $N $N $N /share/data6/a /share/data6/b /share/data6/c $B 19 | #./gemm-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 20 | ./gemm-spdk-test $N $N $N 0 16UL*1024*1024*1024 32UL*1024*1024*1024 $B $IO 6 21 | if [ $? 
-ne 0 ]; then 22 | echo "Failed at $N" 23 | # exit 1 24 | fi 25 | done 26 | done 27 | echo "All tests done" -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/Makefile: -------------------------------------------------------------------------------- 1 | # Define variables for include, library, and output paths 2 | INCLUDE_PATH = ../../CAM_lib 3 | LIB_PATH = ../../../build/lib 4 | OUTPUT_DIR = ../../../build/benchmarks/CAM_benchmark 5 | 6 | # Ensure the output directory exists 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | # Compiler 10 | NVCC = nvcc 11 | 12 | # Compiler flags 13 | NVCC_FLAGS = -I $(INCLUDE_PATH) -I../../GPU_memory_lib -L $(LIB_PATH) -L../../../spdk/build/lib -lCAM_interface 14 | 15 | 16 | 17 | TARGET1 = test_seq_read 18 | SOURCE1 = test_seq_read.cu 19 | 20 | TARGET2 = test_seq_write 21 | SOURCE2 = test_seq_write.cu 22 | 23 | TARGET3 = test_random_read 24 | SOURCE3 = test_random_read.cu 25 | 26 | TARGET4 = test_random_write 27 | SOURCE4 = test_random_write.cu 28 | 29 | # Default rule 30 | all: $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) $(OUTPUT_DIR)/$(TARGET3) $(OUTPUT_DIR)/$(TARGET4) 31 | 32 | $(OUTPUT_DIR)/$(TARGET1): $(SOURCE1) 33 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 34 | 35 | $(OUTPUT_DIR)/$(TARGET2): $(SOURCE2) 36 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 37 | 38 | $(OUTPUT_DIR)/$(TARGET3): $(SOURCE3) 39 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 40 | 41 | $(OUTPUT_DIR)/$(TARGET4): $(SOURCE4) 42 | $(NVCC) -o $@ $(NVCC_FLAGS) $< 43 | 44 | # Clean rule 45 | clean: 46 | rm -f $(OUTPUT_DIR)/$(TARGET1) $(OUTPUT_DIR)/$(TARGET2) $(OUTPUT_DIR)/$(TARGET3) $(OUTPUT_DIR)/$(TARGET4) -------------------------------------------------------------------------------- /src/GPU_memory_lib/GPU_memory_management.hpp: -------------------------------------------------------------------------------- 1 | #ifndef GPU_MEMORY_MANAGEMENT 2 | #define GPU_MEMORY_MANAGEMENT 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | 13 | class MemCtl { 14 | public: 15 | virtual ~MemCtl() = default; 16 | 17 | [[nodiscard]] size_t getPoolSize() const { 18 | return pool_size; 19 | } 20 | 21 | void *alloc(size_t size); 22 | 23 | void free(void *ptr); 24 | 25 | protected: 26 | MemCtl() = default; 27 | 28 | size_t pool_size{}; 29 | 30 | std::mutex allocMutex; 31 | /*<首地址, 块大小>*/ 32 | std::map free_chunk, used_chunk; 33 | /* n_pages, virt_addr_base, phy_addr_array */ 34 | std::tuple page_table; 35 | }; 36 | 37 | class GPUMemCtl : public MemCtl { 38 | public: 39 | ~GPUMemCtl() override; 40 | 41 | static GPUMemCtl *getInstance(int32_t dev_id, size_t pool_size); 42 | [[maybe_unused]] static void cleanCtx(); 43 | 44 | protected: 45 | explicit GPUMemCtl(uint64_t size); 46 | 47 | public: 48 | /* 49 | * void(uint32_t, uint32_t, uint64_t, uint64_t) => (page_index, page_size, virt_addr, phy_addr) 50 | */ 51 | void writeTLB(const std::function &func, bool aggr_flag); 52 | 53 | uint64_t mapV2P(void *ptr); 54 | 55 | void *getDevPtr() const; 56 | void *getMapDevPtr() const; 57 | 58 | bool chechPhyContiguous() const; 59 | 60 | }; 61 | 62 | #endif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CAM: Asynchronous GPU-Initiated, CPU-Managed SSD Management for Batching Storage Access 2 | 3 | 4 | This is the source code for our paper. 
5 | 6 | 7 | 8 | ## Required hardware and software 9 | 10 | - NVIDIA 80GB-PCIe-A100 11 | - HugePage: At least 4096 huge pages 12 | - g++ >= 11.3.0 13 | 14 | 15 | 16 | ## Install Dependencies and Build 17 | See [INSTALL.md](./doc/INSTALL.md) for how to install dependencies and build CAM on a single machine. 18 | 19 | 20 | ## Run Test 21 | If the configuration check in the Run Experiments section of [EXP.md](./doc/EXP.md) passes, then everything is set up correctly. Please refer to [EXP.md](./doc/EXP.md) for more details. 22 | 23 | 24 | ### Directory Structure: 25 | 26 | ~~~ 27 | . 28 | ├── spdk (our modified SPDK driver) 29 | ├── gdrcopy (our modified gdrcopy driver) 30 | ├── build.sh (script for compiling the CAM code project) 31 | ├── doc (documentation on how to install and conduct experiments with CAM) 32 | ├── README.md 33 | └── src 34 | ├── benchmarks 35 | │ ├── CAM_benchmark (microbenchmark for CAM, one thread controls one SSD) 36 | │ │ 37 | │ └── CAM_variable_core_benchmark (microbenchmark for CAM, one thread controls a variable number of SSDs) 38 | │ 39 | ├── CAM_lib (source code of CAM) 40 | │ 41 | ├── CAM_variable_core_lib (source code of CAM with one thread controlling a variable number of SSDs) 42 | │ 43 | └── GPU_memory_lib (source code for GPU memory management used in CAM) 44 | │ 45 | └── applications 46 | └── gemm (end-to-end test in the GEMM application) 47 | 48 | ~~~ 49 | 50 | 51 | 52 | 53 | 54 | ### Getting help 55 | 56 | Work in progress... 57 | 58 | 59 | 60 | ### Contact 61 | 62 | Email: songziyu@zju.edu.cn 63 | 64 | 65 | -------------------------------------------------------------------------------- /doc/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | This document describes all of the essential software installation steps on the test machine. 4 | 5 | ## 1. Install gdrcopy Driver: 6 | 7 | 8 | ~~~bash 9 | cd gdrcopy 10 | make 11 | make install 12 | ~~~ 13 | 14 | 15 | ## 2. Config Hugepages 16 | 17 | Our system needs enough hugepages; before running any application, allocate hugepages first. 18 | 19 | ~~~bash 20 | sudo sh -c "echo 32768 > /proc/sys/vm/nr_hugepages" 21 | ~~~ 22 | 23 | 24 | ## 3. Build SPDK: 25 | 26 | ~~~bash 27 | cd spdk 28 | git submodule update --init 29 | ./configure --with-shared 30 | make 31 | ~~~ 32 | 33 | ## 4. Install SPDK Driver on NVMe SSDs 34 | ~~~bash 35 | cd spdk/scripts 36 | sudo ./setup.sh 37 | ~~~ 38 | 39 | ## 5. Build all CAM software 40 | 41 | Now we create a `build` directory and build all the software in it. 42 | ~~~bash 43 | mkdir build 44 | bash build.sh 45 | ~~~ 46 | 47 | It should report no errors, and the output binaries will be placed in the `build` directory. 48 | 49 | 50 | ## 6. Uninstall SPDK Driver on NVMe SSDs 51 | 52 | After the experiments, you need to uninstall the SPDK driver: 53 | ~~~bash 54 | cd spdk/scripts 55 | sudo ./setup.sh reset 56 | ~~~ 57 | 58 | ## 7. Extra Attention 59 | 60 | Before installing the SPDK driver, please ensure that there is no data present on the SSD. 61 | 62 | During the installation of the SPDK driver on NVMe SSDs, some SSDs may fail to bind to the SPDK driver. This is mainly because the SSD is already mounted or has a file system on it. Therefore, it is necessary to unmount the SSD and wipe the file system before installing the SPDK driver. 
63 | 64 | To unmount and wipe the file system on the selected SSDs, use the following command: 65 | ~~~bash 66 | sudo umount /dev/nvmeXn1 67 | sudo wipefs -a /dev/nvmeXn1 68 | ~~~ -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/Makefile: -------------------------------------------------------------------------------- 1 | SPDK_DIR = ../../spdk 2 | OUTPUT_DIR = ../../build/lib 3 | 4 | 5 | .PHONY:all 6 | 7 | 8 | 9 | $(shell mkdir -p $(OUTPUT_DIR)) 10 | exe: 11 | 12 | 13 | g++ --std=c++17 -mcmodel=medium CAM_variable_core.h CAM_variable_core.cpp -o $(OUTPUT_DIR)/libCAM_variable_core.so -shared -lcudart \ 14 | -L/usr/local/cuda/lib64 -L../../build/lib -lgpu_memory_management -I../GPU_memory_lib \ 15 | -g -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -fno-strict-aliasing \ 16 | -I$(SPDK_DIR)/include -march=native -Wformat -Wformat-security -D_GNU_SOURCE -fPIC -fstack-protector -fno-common \ 17 | -I$(SPDK_DIR)/isa-l/.. -I$(SPDK_DIR)/isalbuild -I$(SPDK_DIR)/isa-l-crypto/.. \ 18 | -I$(SPDK_DIR)/isalcryptobuild -DNDEBUG -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -DSPDK_GIT_COMMIT=6ee9cd257 -pthread -std=c++17 \ 19 | -Wl,-z,relro,-z,now -Wl,-z,noexecstack -fuse-ld=bfd -Wl,-rpath=$(SPDK_DIR)/isa-l/.libs -Wl,-rpath=$(SPDK_DIR)/isa-l-crypto/.libs \ 20 | -L$(SPDK_DIR)/build/lib -Wl,--whole-archive -Wl,--no-as-needed -lspdk_sock_posix -lspdk_nvme -lspdk_keyring -lspdk_sock -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_dma -lspdk_vmd -lspdk_util -lspdk_log \ 21 | -Wl,--no-whole-archive $(SPDK_DIR)/build/lib/libspdk_env_dpdk.so -Wl,--no-as-needed $(SPDK_DIR)/dpdk/build/lib/librte_bus_pci.so $(SPDK_DIR)/dpdk/build/lib/librte_cryptodev.so \ 22 | $(SPDK_DIR)/dpdk/build/lib/librte_dmadev.so $(SPDK_DIR)/dpdk/build/lib/librte_eal.so $(SPDK_DIR)/dpdk/build/lib/librte_ethdev.so \ 23 | $(SPDK_DIR)/dpdk/build/lib/librte_hash.so $(SPDK_DIR)/dpdk/build/lib/librte_kvargs.so $(SPDK_DIR)/dpdk/build/lib/librte_log.so $(SPDK_DIR)/dpdk/build/lib/librte_mbuf.so \ 24 | $(SPDK_DIR)/dpdk/build/lib/librte_mempool.so $(SPDK_DIR)/dpdk/build/lib/librte_mempool_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_net.so $(SPDK_DIR)/dpdk/build/lib/librte_pci.so \ 25 | $(SPDK_DIR)/dpdk/build/lib/librte_power.so $(SPDK_DIR)/dpdk/build/lib/librte_rcu.so $(SPDK_DIR)/dpdk/build/lib/librte_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_telemetry.so \ 26 | $(SPDK_DIR)/dpdk/build/lib/librte_vhost.so -Wl,-as-needed -Wl,-rpath=$(SPDK_DIR)/dpdk/build/lib -L$(SPDK_DIR)/isa-l/.libs -lisal -L$(SPDK_DIR)/isa-l-crypto/.libs -lisal_crypto -pthread -lrt -luuid -lssl -lcrypto -lm -lfuse3 -lkeyutils -laio 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/CAM_variable_core.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef GPUSSD_BASELINE_H 4 | #define GPUSSD_BASELINE_H 5 | 6 | 7 | 8 | #include "GPU_memory_management.hpp" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "threadPool.h" 16 | 17 | #include "spdk/stdinc.h" 18 | 19 | #include "spdk/env.h" 20 | #include "spdk/log.h" 21 | #include "spdk/nvme.h" 22 | #include "spdk/nvme_zns.h" 23 | #include "spdk/string.h" 24 | #include "spdk/vmd.h" 25 | 26 | 27 | #define MAX_EMBED_NUM 1000000 28 | struct ctrlr_entry { 29 | struct spdk_nvme_ctrlr* ctrlr; 30 | char name[1024]; 31 | }; 32 | 33 | struct ns_entry { 34 | int32_t id; 35 | struct spdk_nvme_ctrlr* 
ctrlr; 36 | struct spdk_nvme_ns* ns; 37 | struct spdk_nvme_qpair* qpair; 38 | }; 39 | 40 | 41 | 42 | 43 | 44 | static void read_complete(void* arg, const struct spdk_nvme_cpl* completion) ; 45 | //static int thread_runner(int32_t dev_index); 46 | static int thread_runner2(int32_t dev_index) ; 47 | static bool probe_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr_opts* opts); 48 | static void attach_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr* ctrlr, const struct spdk_nvme_ctrlr_opts* opts); 49 | static void register_ns(struct spdk_nvme_ctrlr* ctrlr, struct spdk_nvme_ns* ns) ; 50 | void task_submit(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 51 | // void task_submit(int64_t embed_num, int32_t *embed_id, void *dev_addr); 52 | int rc4ml_spdk_init(u_int32_t emb_width); 53 | static void alloc_qpair() ; 54 | inline std::pair getEmbedAddr(int32_t embed_id) ; 55 | void spdkmap(void * map_ptr,size_t pool_size,uint64_t phy_addr); 56 | void clear_wait_flag(); 57 | 58 | //* new function 59 | 60 | static void write_complete(void* arg, const struct spdk_nvme_cpl* completion); 61 | static int thread_runner3(int32_t dev_index); 62 | void task_submit_write(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 63 | void clear_wait_flag_write(); 64 | void cam_init(u_int32_t emb_width,uint32_t core_num); 65 | void* alloc_gpu(int64_t size); 66 | void free_gpu(void* p); 67 | void cam_clean_up(void); 68 | static int thread_runner_variablecore(int32_t thread_index); 69 | static int thread_runner_variablecore_write(int32_t thread_index); 70 | void seq_read_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 71 | void seq_write_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 72 | 73 | 74 | void cam_gemm_read(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 75 | void cam_gemm_write(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 76 | 77 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/device.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_DEVICE_CUH 2 | #define __AEOLUS_DEVICE_CUH 3 | 4 | #define _CUDA 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "log.cuh" 15 | #include "util.cuh" 16 | #include "ioctl.h" 17 | #include "queue.cuh" 18 | 19 | struct aeolus_dev_mem_context 20 | { 21 | void *ptr; 22 | void *ioaddrs; 23 | }; 24 | 25 | /** 26 | * @brief Abstraction of an SSD device. 27 | * 28 | */ 29 | class Device 30 | { 31 | private: 32 | void *admin_queue_ptr; 33 | uint64_t *admin_queue_phys_addr; 34 | 35 | public: 36 | int ssd_id; 37 | int device_fd; 38 | void *reg_ptr; 39 | uint32_t max_queue_num; 40 | uint64_t max_lb_num; 41 | uint32_t max_io_size; 42 | 43 | AdminQueuePair *admin_qp; 44 | uint32_t active_ns; 45 | 46 | std::vector free_qps; 47 | 48 | /** 49 | * @brief Construct a new Device object. 50 | * 51 | * @param ssd_id ID of the SSD, typically the number in /dev/libnvm* 52 | */ 53 | Device(int ssd_id); 54 | 55 | ~Device(); 56 | 57 | /** 58 | * @brief Allocate a pinned host memory buffer with physical address provided. 59 | * 60 | * @param ptr Buffer pointer to be allocated. 61 | * @param size Size of the buffer. 62 | * @param phys_addr A physical address list returned. Each entry of the list is a 4KB page. 63 | * @return Allocation result. 
Can be read by strerror. 64 | */ 65 | int alloc_host_memory(void **ptr, uint64_t size, uint64_t** phys_addr); 66 | 67 | /** 68 | * @brief Free a pinned host memory buffer. 69 | * 70 | * @param ptr Buffer pointer. 71 | * @param phys_addr Physical address list pointer. 72 | */ 73 | void free_host_memory(void *ptr, uint64_t* phys_addr); 74 | 75 | /** 76 | * @brief Allocate a pinned device memory buffer with physical address provided. The buffer will be 64KB-aligned. 77 | * 78 | * @param ptr Buffer pointer to be allocated. 79 | * @param context A context pointer which could be used for freeing the buffer. 80 | * @param size Size of the buffer. 81 | * @param phys_addr A physical address list returned. Each entry of the list is a 64KB page. 82 | * @return Allocation result. Can be read by strerror. 83 | */ 84 | int alloc_device_memory(void **ptr, aeolus_dev_mem_context** context, uint64_t size, uint64_t** phys_addr); 85 | 86 | /** 87 | * @brief Free a pinned device memory buffer. 88 | * 89 | * @param context Device buffer context. 90 | */ 91 | void free_device_memory(aeolus_dev_mem_context* context); 92 | }; 93 | 94 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/Makefile: -------------------------------------------------------------------------------- 1 | SPDK_DIR = ../../spdk 2 | 3 | CUDA_DIR = /usr/local/cuda-12.4 4 | OUTPUT_DIR = ../../build/lib 5 | 6 | 7 | $(shell mkdir -p $(OUTPUT_DIR)) 8 | 9 | .PHONY:all 10 | 11 | exe: 12 | g++ --std=c++17 -mcmodel=medium CAM_interface.h CAM_interface.cpp -o $(OUTPUT_DIR)/libCAM_interface.so -shared -lcudart \ 13 | -L/usr/local/cuda/lib64 -L../../build/lib -lgpu_memory_management -lcudart -lcuda -L$(CUDA_DIR)/lib64 -I../../src/GPU_memory_lib \ 14 | -g -Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -fno-strict-aliasing \ 15 | -I$(SPDK_DIR)/include -march=native -Wformat -Wformat-security -D_GNU_SOURCE -fPIC -fstack-protector -fno-common \ 16 | -I$(SPDK_DIR)/isa-l/.. -I$(SPDK_DIR)/isalbuild -I$(SPDK_DIR)/isa-l-crypto/.. 
-I$(CUDA_DIR)/include \ 17 | -I$(SPDK_DIR)/isalcryptobuild -DNDEBUG -O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -DSPDK_GIT_COMMIT=6ee9cd257 -pthread -std=c++17 \ 18 | -Wl,-z,relro,-z,now -Wl,-z,noexecstack -fuse-ld=bfd -Wl,-rpath=$(SPDK_DIR)/isa-l/.libs -Wl,-rpath=$(SPDK_DIR)/isa-l-crypto/.libs \ 19 | -L$(SPDK_DIR)/build/lib -Wl,--whole-archive -Wl,--no-as-needed -lspdk_sock_posix -lspdk_nvme -lspdk_keyring -lspdk_sock -lspdk_trace -lspdk_rpc -lspdk_jsonrpc -lspdk_json -lspdk_dma -lspdk_vmd -lspdk_util -lspdk_log \ 20 | -Wl,--no-whole-archive $(SPDK_DIR)/build/lib/libspdk_env_dpdk.so -Wl,--no-as-needed $(SPDK_DIR)/dpdk/build/lib/librte_bus_pci.so $(SPDK_DIR)/dpdk/build/lib/librte_cryptodev.so \ 21 | $(SPDK_DIR)/dpdk/build/lib/librte_dmadev.so $(SPDK_DIR)/dpdk/build/lib/librte_eal.so $(SPDK_DIR)/dpdk/build/lib/librte_ethdev.so \ 22 | $(SPDK_DIR)/dpdk/build/lib/librte_hash.so $(SPDK_DIR)/dpdk/build/lib/librte_kvargs.so $(SPDK_DIR)/dpdk/build/lib/librte_log.so $(SPDK_DIR)/dpdk/build/lib/librte_mbuf.so \ 23 | $(SPDK_DIR)/dpdk/build/lib/librte_mempool.so $(SPDK_DIR)/dpdk/build/lib/librte_mempool_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_net.so $(SPDK_DIR)/dpdk/build/lib/librte_pci.so \ 24 | $(SPDK_DIR)/dpdk/build/lib/librte_power.so $(SPDK_DIR)/dpdk/build/lib/librte_rcu.so $(SPDK_DIR)/dpdk/build/lib/librte_ring.so $(SPDK_DIR)/dpdk/build/lib/librte_telemetry.so \ 25 | $(SPDK_DIR)/dpdk/build/lib/librte_vhost.so -Wl,-as-needed -Wl,-rpath=$(SPDK_DIR)/dpdk/build/lib -L$(SPDK_DIR)/isa-l/.libs -lisal -L$(SPDK_DIR)/isa-l-crypto/.libs -lisal_crypto -pthread -lrt -luuid -lssl -lcrypto -lm -lfuse3 -lkeyutils -laio 26 | 27 | 28 | nvcc -dc -Xcompiler -fPIC -I../../src/GPU_memory_lib -c gpu_transfer.cu -o $(OUTPUT_DIR)/gpu_transfer.o 29 | nvcc -dlink -I ../../src/CAM_lib -o $(OUTPUT_DIR)/gpu_transfer_link.o $(OUTPUT_DIR)/gpu_transfer.o 30 | nvcc -rdc=true sample_read.cu $(OUTPUT_DIR)/gpu_transfer.o -o $(OUTPUT_DIR)/sample_read -I../../src/GPU_memory_lib -L $(OUTPUT_DIR) -lCAM_interface 31 | nvcc -rdc=true sample_write.cu $(OUTPUT_DIR)/gpu_transfer.o -o $(OUTPUT_DIR)/sample_write -I../../src/GPU_memory_lib -L $(OUTPUT_DIR) -lCAM_interface 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/CAM_lib/sample_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "gpu_transfer.cuh" 12 | #include "CAM_interface.h" 13 | 14 | __inline__ uint64_t get_tscp(void) 15 | { 16 | uint32_t lo, hi; 17 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 18 | __asm__ __volatile__ ( 19 | "rdtscp" : "=a"(lo), "=d"(hi) 20 | ); 21 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 22 | } 23 | 24 | 25 | 26 | __global__ void myKernel( u_int64_t* d_data_dev,u_int64_t* d_data,uint64_t* dev_addr) { 27 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 28 | for(int i=0;(256*i+idx)<1000000UL;i++) 29 | d_data[256*i+idx]=d_data_dev[256*i+idx]; 30 | for(int i=0;i<10;i++){ 31 | prefetch(1000000UL,dev_addr); 32 | prefetch_syncronize(); 33 | } 34 | } 35 | 36 | 37 | int main(int argc, char** argv) { 38 | 39 | cam_init(4096); 40 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 41 | //launch_idle_kernel(); 42 | for (int64_t i = 0; i < 10000000UL; i++) { 43 | embed_id[i] = i; 44 | } 45 | std::random_shuffle(embed_id, embed_id + 10000000UL-1); 46 | 
u_int64_t* embed_id_dev; 47 | cudaMalloc(&embed_id_dev,10000000UL*sizeof(u_int64_t)); 48 | cudaMemcpy(embed_id_dev,embed_id,10000000UL*sizeof(u_int64_t),cudaMemcpyHostToDevice); 49 | cudaStream_t stream1,stream2; 50 | cudaError_t result; 51 | result = cudaStreamCreate(&stream1); 52 | result = cudaStreamCreate(&stream2); 53 | Init(4096,stream1); 54 | void* gem_memory = alloc_gpu(1000000UL*4096); 55 | u_int64_t* p_d = get_d_data(); 56 | std::thread th(polling_thread); 57 | double sum=0; 58 | uint64_t beg_tsc, end_tsc, middle_tsc; 59 | beg_tsc = get_tscp(); 60 | myKernel<<<1, 256,0,stream2>>>(embed_id_dev,p_d,(uint64_t*)gem_memory); 61 | cudaDeviceSynchronize(); 62 | end_tsc = get_tscp(); 63 | sum = 1.0*(end_tsc-beg_tsc)/ 2.2; 64 | printf("time cost : %lf ms\n",1.0*sum/1000000); 65 | std::cout<<"bandwidth: "<< 1000000UL*4096*10 / sum << "GB/s" < 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "gpu_transfer.cuh" 12 | #include "CAM_interface.h" 13 | 14 | __inline__ uint64_t get_tscp(void) 15 | { 16 | uint32_t lo, hi; 17 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 18 | __asm__ __volatile__ ( 19 | "rdtscp" : "=a"(lo), "=d"(hi) 20 | ); 21 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 22 | } 23 | 24 | 25 | 26 | __global__ void myKernel( u_int64_t* d_data_dev,u_int64_t* d_data,uint64_t* dev_addr) { 27 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 28 | for(int i=0;(256*i+idx)<1000000UL;i++) 29 | d_data[256*i+idx]=d_data_dev[256*i+idx]; 30 | for(int i=0;i<10;i++){ 31 | writeback(1000000UL,dev_addr); 32 | writeback_syncronize(); 33 | } 34 | } 35 | 36 | 37 | int main(int argc, char** argv) { 38 | 39 | cam_init(4096); 40 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 41 | //launch_idle_kernel(); 42 | for (int64_t i = 0; i < 10000000UL; i++) { 43 | embed_id[i] = i; 44 | } 45 | std::random_shuffle(embed_id, embed_id + 10000000UL-1); 46 | u_int64_t* embed_id_dev; 47 | cudaMalloc(&embed_id_dev,10000000UL*sizeof(u_int64_t)); 48 | cudaMemcpy(embed_id_dev,embed_id,10000000UL*sizeof(u_int64_t),cudaMemcpyHostToDevice); 49 | cudaStream_t stream1,stream2; 50 | cudaError_t result; 51 | result = cudaStreamCreate(&stream1); 52 | result = cudaStreamCreate(&stream2); 53 | Init(4096,stream1); 54 | void* gem_memory = alloc_gpu(1000000UL*4096); 55 | u_int64_t* p_d = get_d_data_write(); 56 | std::thread th(polling_thread_write); 57 | double sum=0; 58 | uint64_t beg_tsc, end_tsc, middle_tsc; 59 | beg_tsc = get_tscp(); 60 | myKernel<<<1, 256,0,stream2>>>(embed_id_dev,p_d,(uint64_t*)gem_memory); 61 | cudaDeviceSynchronize(); 62 | end_tsc = get_tscp(); 63 | sum = 1.0*(end_tsc-beg_tsc)/ 2.2; 64 | printf("time cost : %lf ms\n",1.0*sum/1000000); 65 | std::cout<<"bandwidth: "<< 1000000UL*4096*10 / sum << "GB/s" < 12 | // #include "cuda_runtime.h" 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "threadPool.h" 22 | 23 | #include "spdk/stdinc.h" 24 | 25 | #include "spdk/env.h" 26 | #include "spdk/log.h" 27 | #include "spdk/nvme.h" 28 | #include "spdk/nvme_zns.h" 29 | #include "spdk/string.h" 30 | #include "spdk/vmd.h" 31 | 32 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 33 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 34 | 35 | #define MAX_EMBED_NUM 1000000 36 | struct ctrlr_entry { 37 | struct spdk_nvme_ctrlr* ctrlr; 38 | char name[1024]; 39 | }; 40 | 
41 | struct ns_entry { 42 | int32_t id; 43 | struct spdk_nvme_ctrlr* ctrlr; 44 | struct spdk_nvme_ns* ns; 45 | struct spdk_nvme_qpair* qpair; 46 | }; 47 | 48 | 49 | 50 | 51 | 52 | static void read_complete(void* arg, const struct spdk_nvme_cpl* completion) ; 53 | //static int thread_runner(int32_t dev_index); 54 | static int thread_runner2(int32_t dev_index) ; 55 | static bool probe_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr_opts* opts); 56 | static void attach_cb(void* cb_ctx, const struct spdk_nvme_transport_id* trid, struct spdk_nvme_ctrlr* ctrlr, const struct spdk_nvme_ctrlr_opts* opts); 57 | static void register_ns(struct spdk_nvme_ctrlr* ctrlr, struct spdk_nvme_ns* ns) ; 58 | void task_submit(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 59 | // void task_submit(int64_t embed_num, int32_t *embed_id, void *dev_addr); 60 | int rc4ml_spdk_init(u_int32_t emb_width); 61 | static void alloc_qpair() ; 62 | inline std::pair getEmbedAddr(int32_t embed_id) ; 63 | void spdkmap(void * map_ptr,size_t pool_size,uint64_t phy_addr); 64 | void clear_wait_flag(); 65 | 66 | //* new function 67 | 68 | static void write_complete(void* arg, const struct spdk_nvme_cpl* completion); 69 | static int thread_runner3(int32_t dev_index); 70 | void task_submit_write(int64_t embed_num, u_int64_t embed_id,uintptr_t *dev_addr); 71 | void clear_wait_flag_write(); 72 | void cam_init(u_int32_t emb_width); 73 | void* alloc_gpu(int64_t size); 74 | void free_gpu(void* p); 75 | void cam_clean_up(void); 76 | 77 | void seq_read_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 78 | void seq_write_submit(u_int64_t start_lba, u_int64_t num_blocks,uintptr_t dev_addr); 79 | 80 | 81 | void cam_gemm_read(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 82 | void cam_gemm_write(u_int64_t * lba_array, u_int64_t req_num,uintptr_t dev_addr); 83 | 84 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/threadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_POOL_H 2 | #define THREAD_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | class ThreadPool { 15 | public: 16 | ThreadPool(size_t); 17 | template 18 | auto enqueue(F&& f, Args&&... args) 19 | -> std::future::type>; 20 | ~ThreadPool(); 21 | private: 22 | // need to keep track of threads so we can join them 23 | std::vector< std::thread > workers; 24 | // the task queue 25 | std::queue< std::function > tasks; 26 | 27 | // synchronization 28 | std::mutex queue_mutex; 29 | std::condition_variable condition; 30 | bool stop; 31 | }; 32 | 33 | // the constructor just launches some amount of workers 34 | inline ThreadPool::ThreadPool(size_t threads) 35 | : stop(false) 36 | { 37 | for(size_t i = 0;i task; 44 | 45 | { 46 | std::unique_lock lock(this->queue_mutex); 47 | this->condition.wait(lock, 48 | [this]{ return this->stop || !this->tasks.empty(); }); 49 | if(this->stop && this->tasks.empty()) 50 | return; 51 | task = std::move(this->tasks.front()); 52 | this->tasks.pop(); 53 | } 54 | 55 | task(); 56 | } 57 | } 58 | ); 59 | } 60 | 61 | // add new work item to the pool 62 | template 63 | auto ThreadPool::enqueue(F&& f, Args&&... 
args) 64 | -> std::future::type> 65 | { 66 | using return_type = typename std::result_of::type; 67 | 68 | auto task = std::make_shared< std::packaged_task >( 69 | std::bind(std::forward(f), std::forward(args)...) 70 | ); 71 | 72 | std::future res = task->get_future(); 73 | { 74 | std::unique_lock lock(queue_mutex); 75 | 76 | // don't allow enqueueing after stopping the pool 77 | if(stop) 78 | throw std::runtime_error("enqueue on stopped ThreadPool"); 79 | 80 | tasks.emplace([task](){ (*task)(); }); 81 | } 82 | condition.notify_one(); 83 | return res; 84 | } 85 | 86 | // the destructor joins all threads 87 | inline ThreadPool::~ThreadPool() 88 | { 89 | { 90 | std::unique_lock lock(queue_mutex); 91 | stop = true; 92 | } 93 | condition.notify_all(); 94 | for(std::thread &worker: workers) 95 | worker.join(); 96 | } 97 | 98 | #endif -------------------------------------------------------------------------------- /src/CAM_variable_core_lib/threadPool.h: -------------------------------------------------------------------------------- 1 | #ifndef THREAD_POOL_H 2 | #define THREAD_POOL_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | class ThreadPool { 15 | public: 16 | ThreadPool(size_t); 17 | template 18 | auto enqueue(F&& f, Args&&... args) 19 | -> std::future::type>; 20 | ~ThreadPool(); 21 | private: 22 | // need to keep track of threads so we can join them 23 | std::vector< std::thread > workers; 24 | // the task queue 25 | std::queue< std::function > tasks; 26 | 27 | // synchronization 28 | std::mutex queue_mutex; 29 | std::condition_variable condition; 30 | bool stop; 31 | }; 32 | 33 | // the constructor just launches some amount of workers 34 | inline ThreadPool::ThreadPool(size_t threads) 35 | : stop(false) 36 | { 37 | for(size_t i = 0;i task; 44 | 45 | { 46 | std::unique_lock lock(this->queue_mutex); 47 | this->condition.wait(lock, 48 | [this]{ return this->stop || !this->tasks.empty(); }); 49 | if(this->stop && this->tasks.empty()) 50 | return; 51 | task = std::move(this->tasks.front()); 52 | this->tasks.pop(); 53 | } 54 | 55 | task(); 56 | } 57 | } 58 | ); 59 | } 60 | 61 | // add new work item to the pool 62 | template 63 | auto ThreadPool::enqueue(F&& f, Args&&... args) 64 | -> std::future::type> 65 | { 66 | using return_type = typename std::result_of::type; 67 | 68 | auto task = std::make_shared< std::packaged_task >( 69 | std::bind(std::forward(f), std::forward(args)...) 70 | ); 71 | 72 | std::future res = task->get_future(); 73 | { 74 | std::unique_lock lock(queue_mutex); 75 | 76 | // don't allow enqueueing after stopping the pool 77 | if(stop) 78 | throw std::runtime_error("enqueue on stopped ThreadPool"); 79 | 80 | tasks.emplace([task](){ (*task)(); }); 81 | } 82 | condition.notify_one(); 83 | return res; 84 | } 85 | 86 | // the destructor joins all threads 87 | inline ThreadPool::~ThreadPool() 88 | { 89 | { 90 | std::unique_lock lock(queue_mutex); 91 | stop = true; 92 | } 93 | condition.notify_all(); 94 | for(std::thread &worker: workers) 95 | worker.join(); 96 | } 97 | 98 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/log.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_LOG_CUH 2 | #define __AEOLUS_LOG_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define AEOLUS_LOG_DEBUG(...) 
AEOLUS_LOG(AEOLUS_LOG_LEVEL_DEBUG, __FILE__, __LINE__, __VA_ARGS__) 10 | #define AEOLUS_LOG_INFO(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_INFO, __FILE__, __LINE__, __VA_ARGS__) 11 | #define AEOLUS_LOG_WARNING(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_WARNING, __FILE__, __LINE__, __VA_ARGS__) 12 | #define AEOLUS_LOG_ERROR(...) AEOLUS_LOG(AEOLUS_LOG_LEVEL_ERROR, __FILE__, __LINE__, __VA_ARGS__) 13 | 14 | #ifndef __CUDA_ARCH__ 15 | #define AEOLUS_LOG(level, filename, lineno, ...) aeolus_log(level, filename, lineno, __VA_ARGS__); 16 | #else 17 | #define AEOLUS_LOG(level, filename, lineno, ...) \ 18 | printf("%s:%d ", filename, lineno); \ 19 | printf(__VA_ARGS__); 20 | #endif 21 | 22 | enum aeolus_log_level 23 | { 24 | AEOLUS_LOG_LEVEL_NULL = 0, 25 | AEOLUS_LOG_LEVEL_DEBUG = 1, 26 | AEOLUS_LOG_LEVEL_INFO = 2, 27 | AEOLUS_LOG_LEVEL_WARNING = 3, 28 | AEOLUS_LOG_LEVEL_ERROR = 4, 29 | }; 30 | 31 | static aeolus_log_level log_level = AEOLUS_LOG_LEVEL_NULL; 32 | 33 | inline aeolus_log_level get_log_level() 34 | { 35 | if (log_level != AEOLUS_LOG_LEVEL_NULL) 36 | { 37 | return log_level; 38 | } 39 | 40 | char *log_level_env = getenv("AEOLUS_LOG_LEVEL"); 41 | if (log_level_env == NULL) 42 | { 43 | #ifdef __DEBUG__ 44 | log_level = AEOLUS_LOG_LEVEL_INFO; 45 | #else 46 | log_level = AEOLUS_LOG_LEVEL_WARNING; 47 | #endif 48 | } else 49 | { 50 | if (strcmp(log_level_env, "DEBUG") == 0) 51 | { 52 | log_level = AEOLUS_LOG_LEVEL_DEBUG; 53 | } else if (strcmp(log_level_env, "INFO") == 0) 54 | { 55 | log_level = AEOLUS_LOG_LEVEL_INFO; 56 | } else if (strcmp(log_level_env, "WARNING") == 0) 57 | { 58 | log_level = AEOLUS_LOG_LEVEL_WARNING; 59 | } else if (strcmp(log_level_env, "ERROR") == 0) 60 | { 61 | log_level = AEOLUS_LOG_LEVEL_ERROR; 62 | } else 63 | { 64 | log_level = AEOLUS_LOG_LEVEL_INFO; 65 | } 66 | } 67 | 68 | return log_level; 69 | } 70 | 71 | __host__ inline void aeolus_log(aeolus_log_level level, const char *filename, int lineno, const char *format, ...) 72 | { 73 | if (level < get_log_level()) 74 | { 75 | return; 76 | } 77 | 78 | char *level_str; 79 | switch (level) 80 | { 81 | case AEOLUS_LOG_LEVEL_DEBUG: 82 | level_str = (char *)"DEBUG"; 83 | break; 84 | case AEOLUS_LOG_LEVEL_INFO: 85 | level_str = (char *)"INFO"; 86 | break; 87 | case AEOLUS_LOG_LEVEL_WARNING: 88 | level_str = (char *)"WARNING"; 89 | break; 90 | case AEOLUS_LOG_LEVEL_ERROR: 91 | level_str = (char *)"ERROR"; 92 | break; 93 | default: 94 | level_str = (char *)"UNKNOWN"; 95 | break; 96 | } 97 | 98 | va_list args; 99 | va_start(args, format); 100 | fprintf(stderr, "[%s] %s:%d ", level_str, filename, lineno); 101 | vfprintf(stderr, format, args); 102 | fprintf(stderr, "\n"); 103 | va_end(args); 104 | } 105 | 106 | #endif -------------------------------------------------------------------------------- /doc/EXP.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | **Important:** Please see [INSTALL.md](./INSTALL.md) for install dependencies and build CAM on a single machine. 4 | 5 | ## 1. CAM Throughput Microbenchmark 6 | 7 | This is the evalution for fig.8 in the paper. 8 | 9 | 10 | 11 | ### 1.1 Run the Random Read Benchmark 12 | 13 | 14 | ~~~bash 15 | cd build/benchmarks/CAM_benchmark 16 | sudo ./test_random_read 17 | ~~~ 18 | 19 | 20 | 21 | 22 | Firstly, the otput will be like this, which means SSDs are initialized. 
23 | ~~~ 24 | Initializing NVMe Controllers 25 | Attaching to 0000:50:00.0 26 | Attaching to 0000:57:00.0 27 | Attaching to 0000:66:00.0 28 | Attaching to 0000:68:00.0 29 | Attaching to 0000:9c:00.0 30 | Attaching to 0000:9d:00.0 31 | Attaching to 0000:e3:00.0 32 | Attaching to 0000:e4:00.0 33 | Attaching to 0000:e5:00.0 34 | attach_cb 35 | Attached to 0000:68:00.0 36 | Namespace ID: 1 size: 3840GB 37 | attach_cb 38 | Attached to 0000:66:00.0 39 | Namespace ID: 1 size: 3840GB 40 | attach_cb 41 | Attached to 0000:50:00.0 42 | Namespace ID: 1 size: 3840GB 43 | attach_cb 44 | Attached to 0000:57:00.0 45 | Namespace ID: 1 size: 3840GB 46 | attach_cb 47 | Attached to 0000:e5:00.0 48 | Namespace ID: 1 size: 3840GB 49 | attach_cb 50 | Attached to 0000:e4:00.0 51 | Namespace ID: 1 size: 3840GB 52 | attach_cb 53 | Attached to 0000:9c:00.0 54 | Namespace ID: 1 size: 3840GB 55 | attach_cb 56 | Attached to 0000:9d:00.0 57 | Namespace ID: 1 size: 3840GB 58 | attach_cb 59 | Attached to 0000:e3:00.0 60 | Namespace ID: 1 size: 3840GB 61 | Initialization complete. 62 | ~~~ 63 | 64 | 65 | 66 | After initialization, the output will be like this: 67 | 68 | ~~~bash 69 | time cost : 617.335267 ms 70 | bandwidth: 19.9049GB/s 71 | ~~~ 72 | 73 | ### 1.2 Run Other Microbenchmarks 74 | 75 | The random write, sequential read, and sequential write microbenchmarks produce output similar to the random read benchmark. 76 | ~~~bash 77 | cd build/benchmarks/CAM_benchmark 78 | sudo ./test_random_write 79 | sudo ./test_seq_read 80 | sudo ./test_seq_write 81 | ~~~ 82 | 83 | 84 | ## 2. CAM Throughput Microbenchmark Using One Thread to Manage Multiple SSDs 85 | 86 | This is the evaluation for Fig. 7 in the paper. 87 | 88 | ### 2.1 Run the Benchmark 89 | 90 | 91 | ~~~bash 92 | cd build/benchmarks/CAM_variable_core_benchmark 93 | sudo ./variable_core_test_read 94 | sudo ./variable_core_test_write 95 | ~~~ 96 | 97 | The output is similar to the `CAM Throughput Microbenchmark` above. 98 | 99 | 100 | 101 | ### 2.2 Run the Benchmark Using One Thread to Manage Different Numbers of SSDs 102 | 103 | 104 | 105 | To run with a different number of SSDs controlled by one thread, please edit `/src/benchmarks/CAM_variable_core_benchmark/variable_core_test_read.cu` and `/src/benchmarks/CAM_variable_core_benchmark/variable_core_test_write.cu`, change `thread_num`, and recompile the program. 106 | 107 | ## 3. Run CAM Sample Code 108 | 109 | ~~~bash 110 | cd build/lib 111 | sudo ./sample_read 112 | sudo ./sample_write 113 | ~~~ 114 | 115 | ## 4. Run GEMM with CAM 116 | 117 | ~~~bash 118 | sudo bash run_GEMM.sh 119 | ~~~ 120 | 121 | This script runs a series of tests using the gemm-test application located in the `build` directory. It iterates over a set of matrix sizes 'N', ranging from 32k to 256k in roughly even steps. For each matrix size 'N', the gemm-test command is executed three times. 122 | 123 | Each execution outputs the execution time and TFLOPS, as shown in the figure below. 124 | ![alt text](pic/GEMM_pic.png "GEMM output") 125 | 126 | ## 5. Extra Attention 127 | - The initialization operations are required after each reboot. 128 | 129 | If you have any questions, please contact us. 
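## 6. Quick Reference: End-to-End Evaluation Flow

As a quick reference, the steps above can be chained into one helper script. This is only a minimal sketch and is not a script shipped with the repository: it assumes it is run from the repository root, that SPDK and CAM have already been built as described in [INSTALL.md](./INSTALL.md), and that the hugepage count and SSD setup match your machine.

~~~bash
#!/bin/bash
# Hypothetical helper script chaining the documented steps; adjust the
# hugepage count and paths to your machine before using it.
set -e

# Allocate hugepages (see INSTALL.md).
sudo sh -c "echo 32768 > /proc/sys/vm/nr_hugepages"

# Bind the NVMe SSDs to the SPDK driver (see INSTALL.md).
(cd spdk/scripts && sudo ./setup.sh)

# Microbenchmarks (sections 1 and 2 above).
(cd build/benchmarks/CAM_benchmark && sudo ./test_random_read && sudo ./test_random_write && sudo ./test_seq_read && sudo ./test_seq_write)
(cd build/benchmarks/CAM_variable_core_benchmark && sudo ./variable_core_test_read && sudo ./variable_core_test_write)

# Sample code and the GEMM application (sections 3 and 4 above).
(cd build/lib && sudo ./sample_read && sudo ./sample_write)
sudo bash run_GEMM.sh

# Return the SSDs to the kernel NVMe driver after the experiments (see INSTALL.md).
(cd spdk/scripts && sudo ./setup.sh reset)
~~~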
-------------------------------------------------------------------------------- /src/CAM_lib/gpu_transfer.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __GPU_TRANSFER_CUH__ 2 | #define __GPU_TRANSFER_CUH__ 3 | #include 4 | #include 5 | #include "CAM_interface.h" 6 | 7 | #define MSCCLPP_DEVICE_INLINE __forceinline__ __device__ 8 | #define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__ 9 | 10 | #define MAX_EMBED_NUM 1000000 11 | 12 | 13 | 14 | //* A semaphore for sending signals from the host to the device. 15 | class Host2DeviceSemaphore { 16 | private: 17 | void* InboundSemaphore; 18 | u_int64_t expectedInboundSemaphore; 19 | void* outboundSemaphore; 20 | u_int64_t* p_outboundSemaphoreValue; 21 | cudaStream_t stream; 22 | u_int64_t* total_num; 23 | public: 24 | Host2DeviceSemaphore(void){ 25 | cudaHostAlloc( (void**)&p_outboundSemaphoreValue,sizeof(u_int64_t),cudaHostAllocDefault ) ; 26 | expectedInboundSemaphore =1; 27 | cudaMalloc(&outboundSemaphore,sizeof(u_int64_t)); 28 | *p_outboundSemaphoreValue =0; 29 | 30 | } 31 | void ConnectToDeviceSemaphore(void* InboundSemaphore_,u_int64_t* total_num_ ) {InboundSemaphore=InboundSemaphore_;total_num=total_num_;} 32 | void* GetoutboundSemaphore(void) { return outboundSemaphore;} 33 | u_int64_t GetTotalNumber(void) { return *total_num;} 34 | void ConnectToStream(cudaStream_t stream1){ stream= stream1;} 35 | void signal() { 36 | //printf("signal\n"); 37 | (*p_outboundSemaphoreValue)+=1; 38 | cudaError_t return_value=cudaMemcpyAsync(outboundSemaphore, p_outboundSemaphoreValue, sizeof(u_int64_t), cudaMemcpyHostToDevice,stream); 39 | if (return_value != cudaSuccess) { 40 | std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorName(return_value) << " - " << cudaGetErrorString(return_value) << std::endl; 41 | // 处理错误 42 | } 43 | 44 | //printf("p_outboundSemaphoreValue: %ld\n",*(u_int64_t*)p_outboundSemaphoreValue); 45 | } 46 | 47 | void wait() { 48 | // printf("wait\n"); 49 | // printf("InboundSemaphore: %ld\n",*(u_int64_t*)InboundSemaphore); 50 | // printf("expectedInboundSemaphore: %ld\n",expectedInboundSemaphore); 51 | uint64_t start = 0; 52 | while((*(u_int64_t*)InboundSemaphore) < expectedInboundSemaphore){ 53 | start = 0; 54 | while (start++ < 100000); 55 | 56 | } 57 | // printf("wait end\n"); 58 | // printf("InboundSemaphore: %ld\n",*(u_int64_t*)InboundSemaphore); 59 | // printf("expectedInboundSemaphore: %ld\n",expectedInboundSemaphore); 60 | expectedInboundSemaphore ++; 61 | } 62 | }; 63 | 64 | // struct SmDevice2DeviceSemaphoreDeviceHandle { 65 | 66 | 67 | 68 | 69 | 70 | // MSCCLPP_DEVICE_INLINE void signal(uint64_t num) { 71 | // *total_num = num; 72 | // semaphoreIncrement(); 73 | // *outboundSemaphoreId = *outboundSemaphoreValue; 74 | // } 75 | 76 | // /// Increase the counter of the local semaphore. 77 | // MSCCLPP_DEVICE_INLINE void semaphoreIncrement() { *outboundSemaphoreValue += 1; } 78 | 79 | // /// Get the value of the local semaphore. 
80 | // MSCCLPP_DEVICE_INLINE uint64_t semaphoreGetLocal() const { return *outboundSemaphoreValue; } 81 | 82 | 83 | // }; 84 | 85 | 86 | 87 | 88 | void SemaphoreInit(cudaStream_t stream1); 89 | void Init(u_int32_t access_size,cudaStream_t stream1); 90 | extern "C" __global__ void init_myKernel(void); 91 | 92 | //* read functions 93 | void polling_thread(void); 94 | __device__ void prefetch(int64_t embed_num,uintptr_t *dev_addr); 95 | __device__ void prefetch_syncronize(void); 96 | uint64_t* get_d_data(void); 97 | //*wrtie functions 98 | void polling_thread_write(void); 99 | __device__ void writeback(int64_t embed_num,uintptr_t *dev_addr); 100 | __device__ void writeback_syncronize(void); 101 | uint64_t* get_d_data_write(void); 102 | 103 | void polling_thread_seq(void); 104 | void polling_thread_seq_write(void); 105 | __device__ void prefetch_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr); 106 | __device__ void writeback_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr); 107 | 108 | 109 | 110 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/gemm.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "lightbam.cuh" 6 | uint64_t parse_offset(char *str) 7 | { 8 | int len = strlen(str); 9 | char unit = ' '; 10 | if (!isdigit(str[len - 1])) 11 | { 12 | unit = str[len - 1]; 13 | str[len - 1] = '\0'; 14 | } 15 | uint64_t offset = atoll(str); 16 | if (unit == 'K' || unit == 'k') 17 | { 18 | offset = offset * 1024; 19 | } 20 | if (unit == 'M' || unit == 'm') 21 | { 22 | offset = offset * 1024 * 1024; 23 | } 24 | if (unit == 'G' || unit == 'g') 25 | { 26 | offset = offset * 1024 * 1024 * 1024; 27 | } 28 | if (unit == 'T' || unit == 't') 29 | { 30 | offset = offset * 1024 * 1024 * 1024 * 1024; 31 | } 32 | return offset; 33 | } 34 | 35 | class PinnedBuffer 36 | { 37 | private: 38 | void *iobuf; 39 | aeolus_dev_mem_context *iobuf_ctx; 40 | uint64_t *h_iobuf_phys; 41 | uint64_t *d_iobuf_phys; 42 | uint64_t *prp_list; 43 | uint64_t *h_prp_phys; 44 | uint64_t *d_prp_phys; 45 | uint64_t max_io_size; 46 | Device *dev; 47 | 48 | public: 49 | PinnedBuffer(Device* dev, uint64_t size, uint64_t max_io_size = 0) 50 | { 51 | int ret = dev->alloc_device_memory( 52 | &iobuf, &iobuf_ctx, size, &h_iobuf_phys 53 | ); 54 | if (ret != 0) 55 | { 56 | AEOLUS_LOG_ERROR("Failed to allocate device memory for IO buffer: %s", strerror(ret)); 57 | exit(-1); 58 | } 59 | size_t iobuf_phys_size = size / AEOLUS_DEVICE_PGSIZE * sizeof(uint64_t); 60 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_iobuf_phys, iobuf_phys_size)); 61 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_iobuf_phys, h_iobuf_phys, iobuf_phys_size, cudaMemcpyHostToDevice)); 62 | 63 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) 64 | { 65 | uint64_t prp_list_size = size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); 66 | AEOLUS_LOG_INFO("Allocating PRP buffer."); 67 | dev->alloc_host_memory((void **)&prp_list, prp_list_size, &h_prp_phys); 68 | 69 | // Fill in PRP table. 70 | for (int i = 0; i < size / AEOLUS_DEVICE_PGSIZE; i++) 71 | { 72 | for (int j = 0; j < AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE; j++) 73 | { 74 | if (i == 0 && j == 0) 75 | { 76 | continue; 77 | } 78 | prp_list[i * AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE + j - 1] = 79 | h_iobuf_phys[i] + j * AEOLUS_HOST_PGSIZE; 80 | } 81 | } 82 | 83 | // Move PRP physical address to GPU. 
84 | size_t prp_phys_size = CEIL(prp_list_size, AEOLUS_HOST_PGSIZE) * sizeof(uint64_t); 85 | AEOLUS_CUDA_CHECK(cudaMalloc((void **)&d_prp_phys, prp_phys_size)); 86 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_prp_phys, h_prp_phys, prp_phys_size, cudaMemcpyHostToDevice)); 87 | } 88 | this->max_io_size = max_io_size; 89 | this->dev = dev; 90 | } 91 | 92 | Request create_request(uint64_t offset, uint64_t start_lb = 0, int num_items = 0) 93 | { 94 | if (num_items == 0) 95 | num_items = max_io_size / AEOLUS_LB_SIZE; 96 | Request req(start_lb, num_items); 97 | req.dest_addr = h_iobuf_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 98 | req.next_addr = offset / max_io_size; 99 | if (max_io_size <= AEOLUS_HOST_PGSIZE * 2) 100 | { 101 | offset += AEOLUS_HOST_PGSIZE; 102 | req.next_addr = h_iobuf_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 103 | } 104 | return req; 105 | } 106 | 107 | uint64_t *get_iobuf_phys() 108 | { 109 | return h_iobuf_phys; 110 | } 111 | 112 | uint64_t *get_d_iobuf_phys() 113 | { 114 | return d_iobuf_phys; 115 | } 116 | 117 | uint64_t *get_d_prp_phys() 118 | { 119 | return d_prp_phys; 120 | } 121 | 122 | operator void *() 123 | { 124 | return iobuf; 125 | } 126 | 127 | ~PinnedBuffer() 128 | { 129 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) 130 | { 131 | dev->free_host_memory(prp_list, h_prp_phys); 132 | AEOLUS_CUDA_CHECK(cudaFree(d_prp_phys)); 133 | } 134 | dev->free_device_memory(iobuf_ctx); 135 | } 136 | }; -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_seq_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 300000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 
78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_seq_write -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_seq_write.cu 137 | */ -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_random_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 100000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_random_read -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_random_read.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_random_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 
#include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 100000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | 128 | cam_init(4096); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o test_random_write -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_random_write.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_benchmark/test_seq_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_interface.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 300000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t 
get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 123 | // printf("Time: %f\n", time_span.count()); 124 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 125 | } 126 | 127 | int main(int argc, char** argv) { 128 | 129 | cam_init(4096); 130 | run_task_function_test(); 131 | cam_clean_up(); 132 | 133 | return 0; 134 | } 135 | 136 | /* 137 | nvcc -o test_seq_read -I /home/szy/yzh_hyprion/spdk_interface -L /home/szy/yzh_hyprion/spdk_interface -lgpussd_baseline test_seq_read.cu 138 | */ -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/variable_core_test_read.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_variable_core.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 1000000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(10000000UL*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(1000000UL*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = 
std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | int thread_num =2; 128 | cam_init(4096,thread_num); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o variable_core_test -I /home/szy/application/spdk_variable_core -L /home/szy/application/spdk_variable_core -lspdk_variable_core variable_core_test.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/benchmarks/CAM_variable_core_benchmark/variable_core_test_write.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "CAM_variable_core.h" 12 | #include 13 | 14 | 15 | const int64_t embed_num = 1000000; 16 | //uintptr_t dev_addr[embed_num]; 17 | //static GPUMemCtl* gpuMemCtl; 18 | static const int64_t lba_size = 512; 19 | static int64_t embed_entry_width ; 20 | static int64_t embed_entry_lba; 21 | 22 | 23 | 24 | __inline__ uint64_t get_tsc() 25 | { 26 | uint64_t a, d; 27 | __asm__ volatile("rdtsc" : "=a"(a), "=d"(d)); 28 | return (d << 32) | a; 29 | } 30 | 31 | __inline__ uint64_t get_tscp(void) 32 | { 33 | uint32_t lo, hi; 34 | // take time stamp counter, rdtscp does serialize by itself, and is much cheaper than using CPUID 35 | __asm__ __volatile__ ( 36 | "rdtscp" : "=a"(lo), "=d"(hi) 37 | ); 38 | return ((uint64_t)lo) | (((uint64_t)hi) << 32); 39 | } 40 | 41 | __inline__ uint64_t cycles_2_ns(uint64_t cycles, uint64_t hz) 42 | { 43 | return cycles * (1000000000.0 / hz); 44 | } 45 | 46 | uint64_t get_cpu_freq() 47 | { 48 | FILE *fp=popen("lscpu | grep CPU | grep MHz | awk {'print $3'}","r"); 49 | if(fp == nullptr) 50 | return 0; 51 | 52 | char cpu_mhz_str[200] = { 0 }; 53 | fgets(cpu_mhz_str,80,fp); 54 | fclose(fp); 55 | 56 | return atof(cpu_mhz_str) * 1000 * 1000; 57 | 58 | } 59 | 60 | 61 | 62 | 63 | static void run_task_function_test() { 64 | u_int64_t* embed_id = (u_int64_t*)malloc(embed_num*sizeof(u_int64_t)); 65 | //launch_idle_kernel(); 66 | void* gem_memory = alloc_gpu(embed_num*10*4096); 67 | for (int64_t i = 0; i < embed_num; i++) { 68 | embed_id[i] = i; 69 | //dev_addr[i] = (uintptr_t)gem_memory + i * embed_entry_width; 70 | } 71 | // int buffer[1024]; 72 | // int buffer_fake[1024]; 73 | // int buffer2[1024]; 74 | // for(int i=0;i<1024;i++){ 75 | // buffer[i]=i; 76 | // buffer_fake[i] =0; 77 | // } 78 | std::cout<<" begin!"< time_span = std::chrono::duration_cast>(time_end - time_start); 122 | // printf("Time: %f\n", time_span.count()); 123 | // printf("bandwdth : %lf GB/s\n",embed_num*4/time_span.count()/1024/1024); 124 | } 125 | 126 | int main(int argc, char** argv) { 127 | int thread_num =2; 128 | cam_init(4096,thread_num); 129 | run_task_function_test(); 130 | cam_clean_up(); 131 | 132 | return 0; 133 | } 134 | 135 | /* 136 | nvcc -o variable_core_test_write -I /home/szy/application/spdk_variable_core -L /home/szy/application/spdk_variable_core -lspdk_variable_core variable_core_test_write.cu 137 | */ 138 | -------------------------------------------------------------------------------- /src/applications/gemm/include/queue.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_QUEUE_CUH 2 | #define 
__AEOLUS_QUEUE_CUH 3 | 4 | #include 5 | #include 6 | #include "util.cuh" 7 | #include "log.cuh" 8 | 9 | /** 10 | * @brief Abstraction of an SSD SQ-CQ pair. 11 | * 12 | */ 13 | class QueuePair 14 | { 15 | public: 16 | uint32_t *cmd_id_to_req_id; 17 | uint32_t *cmd_id_to_sq_pos; 18 | bool *sq_entry_busy; 19 | uint32_t sq_tail; 20 | uint32_t cq_head; 21 | uint32_t cmd_id; // also number of commands submitted 22 | uint32_t *sqtdbl, *cqhdbl; 23 | uint32_t num_completed; 24 | volatile uint32_t *sq; 25 | volatile uint32_t *cq; 26 | protected: 27 | uint32_t namespace_id; 28 | uint32_t queue_depth; 29 | 30 | public: 31 | inline __host__ __device__ QueuePair() 32 | { 33 | } 34 | 35 | inline __host__ __device__ QueuePair(volatile uint32_t *sq, volatile uint32_t *cq, uint32_t namespace_id, uint32_t *sqtdbl, uint32_t *cqhdbl, uint32_t queue_depth, uint32_t *cmd_id_to_req_id = nullptr, uint32_t *cmd_id_to_sq_pos = nullptr, bool *sq_entry_busy = nullptr) 36 | : sq(sq), cq(cq), sq_tail(0), cq_head(0), cmd_id(0), namespace_id(namespace_id), sqtdbl(sqtdbl), cqhdbl(cqhdbl), cmd_id_to_req_id(cmd_id_to_req_id), cmd_id_to_sq_pos(cmd_id_to_sq_pos), sq_entry_busy(sq_entry_busy), queue_depth(queue_depth), num_completed(0) 37 | { 38 | } 39 | 40 | __host__ __device__ void submit(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 41 | 42 | __device__ void submit_fence(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 43 | 44 | __host__ __device__ void fill_sq(uint32_t cid, uint32_t pos, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0, uint32_t req_id = 0xffffffff); 45 | 46 | __host__ __device__ void poll(uint32_t &code, uint32_t cid); 47 | 48 | __host__ __device__ void poll_with_dw0(uint32_t &code, uint32_t cid, uint32_t &dw0); 49 | }; 50 | 51 | /** 52 | * @brief Abstraction of an SSD IO queue pair. 53 | * 54 | */ 55 | class IoQueuePair : public QueuePair { 56 | public: 57 | inline __host__ IoQueuePair( 58 | volatile uint32_t *sq, 59 | volatile uint32_t *cq, 60 | uint32_t namespace_id, 61 | uint32_t *sqtdbl, 62 | uint32_t *cqhdbl, 63 | uint32_t queue_depth, 64 | uint32_t *cmd_id_to_req_id = nullptr, 65 | uint32_t *cmd_id_to_sq_pos = nullptr, 66 | bool *sq_entry_busy = nullptr 67 | ) : QueuePair( 68 | sq, cq, namespace_id, sqtdbl, cqhdbl, queue_depth, 69 | cmd_id_to_req_id, cmd_id_to_sq_pos, sq_entry_busy 70 | ) 71 | { 72 | // AEOLUS_LOG_INFO("IoQueuePair sqtdbl %p cqhdbl %p", sqtdbl, cqhdbl); 73 | } 74 | 75 | __device__ void poll_range(uint32_t &code, int expected_sq_head, bool should_break); 76 | __device__ void poll_multiple(uint32_t &code, int cnt); 77 | __device__ void poll_until_sq_entry_free(uint32_t &code, int expected_sq_pos); 78 | }; 79 | 80 | /** 81 | * @brief Abstraction of an SSD admin queue pair. 
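 * The admin queue is host-driven: it is used to configure the controller rather than to
 * move data, covering identify (controller and namespace), setting/querying the number of
 * I/O queues, and creating or deleting I/O submission/completion queues via the helpers
 * declared below.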
82 | * 83 | */ 84 | class AdminQueuePair : public QueuePair { 85 | public: 86 | inline __host__ AdminQueuePair( 87 | volatile uint32_t *sq, 88 | volatile uint32_t *cq, 89 | uint32_t namespace_id, 90 | uint32_t *sqtdbl, 91 | uint32_t *cqhdbl, 92 | uint32_t queue_depth, 93 | uint32_t *cmd_id_to_req_id = nullptr, 94 | uint32_t *cmd_id_to_sq_pos = nullptr, 95 | bool *sq_entry_busy = nullptr 96 | ) : QueuePair( 97 | sq, cq, namespace_id, sqtdbl, cqhdbl, queue_depth, 98 | cmd_id_to_req_id, cmd_id_to_sq_pos, sq_entry_busy 99 | ) 100 | { 101 | // AEOLUS_LOG_INFO("AdminQueuePair sqtdbl %p cqhdbl %p", sqtdbl, cqhdbl); 102 | } 103 | __host__ __device__ void submit_with_ns(uint32_t &cid, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0); 104 | __host__ __device__ void fill_sq_with_ns(uint32_t cid, uint32_t pos, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12 = 0, uint32_t req_id = 0xffffffff); 105 | __host__ uint32_t set_num_queues(uint16_t nsqr, uint16_t ncqr); 106 | __host__ uint32_t get_num_queues(uint16_t &nsqa, uint16_t &ncqa); 107 | __host__ uint32_t identify(uint8_t cns, uint16_t cntid, uint32_t nsid, uint64_t prp1); 108 | __host__ uint32_t create_cq_cont(uint16_t cqid, uint64_t cq_phys, uint16_t queue_depth); 109 | __host__ uint32_t create_sq_cont(uint16_t sqid, uint16_t cqid, uint64_t sq_phys, uint16_t queue_depth); 110 | __host__ uint32_t delete_sq(uint16_t sqid); 111 | __host__ uint32_t delete_cq(uint16_t cqid); 112 | }; 113 | 114 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/include/util.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_UTIL_CUH 2 | #define __AEOLUS_UTIL_CUH 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "cufile.h" 9 | 10 | // NVMe BAR0 register sizes and offsets. 11 | 12 | #define NVME_BAR0_SIZE 0x4000 13 | #define NVME_REG_CC 0x14 // addr: controller configuration 14 | #define NVME_REG_CC_EN 0x1 // mask: enable controller 15 | #define NVME_REG_CSTS 0x1c // addr: controller status 16 | #define NVME_REG_CSTS_RDY 0x1 // mask: controller ready 17 | #define NVME_REG_AQA 0x24 // addr: admin queue attributes 18 | #define NVME_REG_ASQ 0x28 // addr: admin submission queue base addr 19 | #define NVME_REG_ACQ 0x30 // addr: admin completion queue base addr 20 | #define NVME_REG_SQTDBL 0x1000 // addr: submission queue 0 tail doorbell 21 | #define NVME_REG_CQHDBL 0x1004 // addr: completion queue 0 sq_tail doorbell 22 | 23 | // NVMe admin opcode 24 | #define NVME_ADMIN_OPCODE_DELETE_SQ 0x00 25 | #define NVME_ADMIN_OPCODE_CREATE_SQ 0x01 26 | #define NVME_ADMIN_OPCODE_DELETE_CQ 0x04 27 | #define NVME_ADMIN_OPCODE_CREATE_CQ 0x05 28 | #define NVME_ADMIN_OPCODE_IDENTIFY 0x06 29 | #define NVME_ADMIN_OPCODE_SET_FEATURES 0x09 30 | #define NVME_ADMIN_OPCODE_GET_FEATURES 0x0a 31 | 32 | // NVMe opcode 33 | #define NVME_OPCODE_READ 0x02 34 | #define NVME_OPCODE_WRITE 0x01 35 | 36 | // NVMe feature ID. 37 | #define NVME_FEATURE_ID_NUM_QUEUES 0x07 38 | 39 | // NVMe field masks. 
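// Completion queue entries are 16 bytes (4 dwords): DW2 carries the SQ head pointer in its
// low 16 bits, and DW3 carries the command ID in bits 15:0, the phase tag in bit 16 (it
// flips on every pass through the queue), and the status field in bits 31:17 (status code
// in its low byte). The masks below extract these fields from the raw dwords read off the CQ.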
40 | #define NVME_ENTRY_PHASE_MASK 0x10000 41 | #define NVME_ENTRY_CID_MASK 0xffff // mask: command id 42 | #define NVME_ENTRY_SC_MASK 0xff // mask: status code 43 | #define NVME_ENTRY_SQ_HEAD_MASK 0xffff 44 | #define NVME_RW_LIMITED_RETRY_MASK 0x80000000 45 | 46 | // NVMe misc 47 | #define NVME_BROADCAST_NSID 0xffffffff 48 | #define NVME_SQ_ENTRY_SIZE 64 49 | #define NVME_CQ_ENTRY_SIZE 16 50 | #define NVME_DBLSTRIDE 8 51 | 52 | // Other constants. 53 | 54 | #define AEOLUS_HOST_PGSIZE 4096 55 | #define AEOLUS_DEVICE_PGSIZE 0x10000 56 | #define AEOLUS_ADMIN_QUEUE_DEPTH 64 57 | #define AEOLUS_WARP_SIZE 32 58 | #define AEOLUS_LB_SIZE 512 59 | #define AEOLUS_NUM_THREADS_PER_BLOCK 512 60 | #define AEOLUS_MAX_NUM_REQUESTS 4000000 61 | 62 | #define AEOLUS_MAX_NUM_QUEUES -1 63 | #define AEOLUS_MAX_DATA_TRANSFER -1 64 | 65 | // Check cuda errors. 66 | 67 | #define AEOLUS_CUDA_CHECK(ans) gpuAssert((ans), __FILE__, __LINE__) 68 | 69 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) 70 | { 71 | if (code != cudaSuccess) 72 | { 73 | fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 74 | if (abort) 75 | exit(1); 76 | } 77 | } 78 | 79 | inline bool isPowerOfTwo(int num) { 80 | return (num > 0) && ((num & (num - 1)) == 0); 81 | } 82 | 83 | inline uint64_t longrand(uint64_t max, uint64_t min = 0) { 84 | return min + (((unsigned long)rand() << 31 | rand()) % (max - min)); 85 | } 86 | 87 | #define MIN(a, b) ((a) < (b) ? (a) : (b)) 88 | #define MAX(a, b) ((b) < (a) ? (a) : (b)) 89 | #define CEIL(a, b) (((a)+(b)-1) / (b)) 90 | 91 | // 92 | // cuda driver error description 93 | // 94 | static inline const char *GetCuErrorString(CUresult curesult) { 95 | const char *descp; 96 | if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) 97 | descp = "unknown cuda error"; 98 | return descp; 99 | } 100 | 101 | // 102 | // cuFile APIs return both cuFile specific error codes as well as POSIX error codes 103 | // for ease, the below template can be used for getting the error description depending 104 | // on its type. 105 | 106 | // POSIX 107 | template::value, std::nullptr_t>::type = nullptr> 109 | std::string cuFileGetErrorString(T status) { 110 | status = std::abs(status); 111 | return IS_CUFILE_ERR(status) ? 
112 | std::string(CUFILE_ERRSTR(status)) : std::string(strerror(status)); 113 | } 114 | 115 | // CUfileError_t 116 | template::value, std::nullptr_t>::type = nullptr> 118 | std::string cuFileGetErrorString(T status) { 119 | std::string errStr = cuFileGetErrorString(static_cast(status.err)); 120 | if (IS_CUDA_ERR(status)) 121 | errStr.append(".").append(GetCuErrorString(status.cu_err)); 122 | return errStr; 123 | } 124 | 125 | #define AEOLUS_CUFILE_CHECK(ans) cufileAssert((ans), __FILE__, __LINE__) 126 | 127 | inline void cufileAssert(CUfileError_t status, const char *file, int line, bool abort = true) 128 | { 129 | if (status.err != CU_FILE_SUCCESS) 130 | { 131 | fprintf(stderr, "CUfileAssert: %s %s %d\n", cuFileGetErrorString(status).c_str(), file, line); 132 | if (abort) 133 | exit(1); 134 | } 135 | } 136 | 137 | #endif -------------------------------------------------------------------------------- /src/applications/gemm/gemm/cam_gemm.cu: -------------------------------------------------------------------------------- 1 | #include "lightbam.cuh" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "gemm.cuh" 9 | #include 10 | #include "CAM_interface.h" 11 | 12 | typedef float fp_t; 13 | int main(int argc, char *argv[]) 14 | { 15 | if (argc != 10) 16 | { 17 | printf("Usage: %s m n k a_offset b_offset c_offset block_size max_io_size num_ssds\n", argv[0]); 18 | return 1; 19 | } 20 | int m = parse_offset(argv[1]); 21 | int n = parse_offset(argv[2]); 22 | int k = parse_offset(argv[3]); 23 | uint64_t a_offset = parse_offset(argv[4]); 24 | uint64_t b_offset = parse_offset(argv[5]); 25 | uint64_t c_offset = parse_offset(argv[6]); 26 | uint64_t block_size = parse_offset(argv[7]); 27 | uint64_t max_io_size = parse_offset(argv[8]); 28 | int num_ssds = atoi(argv[9]); 29 | if (m % block_size != 0 || n % block_size != 0) 30 | { 31 | std::cout<<"m and n must be a multiple of block_size"<= 0 && i < m_blocks) 82 | { 83 | //clear_wait_flag(); 84 | std::swap(a0, a1); 85 | } 86 | if (i + 1 < m_blocks) 87 | { 88 | for (int l = 0; l < num_reqs; l++) 89 | { 90 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 91 | h_reqs[l] = (a_offset + ((i + 1) * block_size * k + offset) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 92 | } 93 | cam_gemm_read(h_reqs,num_reqs,(uintptr_t)a1); 94 | clear_wait_flag(); 95 | } 96 | if (i - 1 >= 0) 97 | { 98 | // if (i - 2 >= 0) 99 | // { 100 | // clear_wait_flag_write(); 101 | // } 102 | std::swap(c0, c1); 103 | int num_reqs = CEIL(block_size * block_size * sizeof(fp_t), max_io_size); 104 | for (int l = 0; l < num_reqs; l++) 105 | { 106 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 107 | int row = (i - 1) * block_size + offset / block_size; 108 | int col = j * block_size + offset % block_size; 109 | h_reqs2[l] = (c_offset + (1ll * row * n + col) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 110 | } 111 | cam_gemm_write(h_reqs2,num_reqs,(uintptr_t)c1); 112 | clear_wait_flag_write(); 113 | } 114 | if (i == 0) 115 | { 116 | std::swap(b0, b1); 117 | } 118 | if (i >= 0 && i < m_blocks) 119 | { 120 | cudaEventRecord(gemm_start, 0); 121 | cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, block_size, block_size, k, &alpha, b0, CUDA_R_32F, block_size, a0, CUDA_R_32F, k, &beta, c0, CUDA_R_32F, block_size, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); 122 | cudaEventRecord(gemm_stop, 0); 123 | cudaEventSynchronize(gemm_stop); 124 | float ms; 125 | cudaEventElapsedTime(&ms, gemm_start, gemm_stop); 126 | gemm_ms += ms; 127 | } 128 | } 129 | // 
clear_wait_flag_write(); 130 | } 131 | cudaEventRecord(stop, 0); 132 | cudaEventSynchronize(stop); 133 | float ms; 134 | cudaEventElapsedTime(&ms, start, stop); 135 | printf("m = %d, n = %d, k = %d, block_size = %ld, time = %f ms, tflops = %f\n", m, n, k, block_size, ms, 2.0 * m * n * k / ms / 1e9); 136 | printf("gemm time = %f ms, num_ssds = %d, max_io_size = %ld, num_queues = %d\n", gemm_ms, num_ssds, max_io_size, num_queues_per_ssd); 137 | printf("%d %ld %d %ld %f %f %d\n", n, block_size, num_ssds, max_io_size, gemm_ms, ms, num_queues_per_ssd); 138 | cublasDestroy(handle); 139 | free_gpu(a0); 140 | free_gpu(a1); 141 | free_gpu(b0); 142 | free_gpu(b1); 143 | free_gpu(c0); 144 | free_gpu(c1); 145 | free(h_reqs); 146 | cam_clean_up(); 147 | return 0; 148 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/device.cu: -------------------------------------------------------------------------------- 1 | #include "device.cuh" 2 | 3 | Device::Device(int ssd_id) 4 | { 5 | 6 | // Open file and map BAR0 of SSD 7 | 8 | this->ssd_id = ssd_id; 9 | AEOLUS_LOG_INFO("Setting up device %d", ssd_id); 10 | char device_path[64]; 11 | sprintf(device_path, "/dev/libnvm%d", ssd_id); 12 | device_fd = open(device_path, O_RDWR); 13 | if (device_fd < 0) 14 | { 15 | AEOLUS_LOG_ERROR("Failed to open: %s", strerror(errno)); 16 | exit(1); 17 | } 18 | reg_ptr = mmap(NULL, NVME_BAR0_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, device_fd, 0); 19 | if (reg_ptr == MAP_FAILED) 20 | { 21 | AEOLUS_LOG_ERROR("Failed to mmap: %s\n", strerror(errno)); 22 | exit(1); 23 | } 24 | AEOLUS_CUDA_CHECK(cudaHostRegister(reg_ptr, NVME_BAR0_SIZE, cudaHostRegisterIoMemory)); 25 | 26 | // Reset controller. 27 | 28 | uint64_t reg_ptr_uint = (uint64_t)reg_ptr; 29 | *(uint32_t *)(reg_ptr_uint + NVME_REG_CC) &= ~NVME_REG_CC_EN; 30 | while (*(uint32_t volatile *)(reg_ptr_uint + NVME_REG_CSTS) & NVME_REG_CSTS_RDY) 31 | ; 32 | AEOLUS_LOG_INFO("Reset done."); 33 | 34 | // Set admin queue attributes. 35 | 36 | int ret = alloc_host_memory(&admin_queue_ptr, 2*AEOLUS_HOST_PGSIZE, &admin_queue_phys_addr); 37 | if (ret != 0) 38 | { 39 | AEOLUS_LOG_ERROR("Allocate admin queue memory failed: %s", strerror(ret)); 40 | exit(1); 41 | } 42 | 43 | uint64_t asq = (uint64_t)admin_queue_ptr; 44 | uint64_t acq = (uint64_t)admin_queue_ptr + AEOLUS_HOST_PGSIZE; 45 | *(uint32_t *)(reg_ptr_uint + NVME_REG_AQA) = ((AEOLUS_ADMIN_QUEUE_DEPTH - 1) << 16) | (AEOLUS_ADMIN_QUEUE_DEPTH - 1); 46 | *(uint64_t *)(reg_ptr_uint + NVME_REG_ASQ) = admin_queue_phys_addr[0]; 47 | *(uint64_t *)(reg_ptr_uint + NVME_REG_ACQ) = admin_queue_phys_addr[1]; 48 | // AEOLUS_LOG_INFO("Admin queue phy addr: 0x%lx, 0x%lx", admin_queue_phys_addr[0], admin_queue_phys_addr[1]); 49 | 50 | admin_qp = new AdminQueuePair( 51 | (volatile uint32_t *)asq, 52 | (volatile uint32_t *)acq, 53 | NVME_BROADCAST_NSID, 54 | (uint32_t *)(reg_ptr_uint + NVME_REG_SQTDBL), 55 | (uint32_t *)(reg_ptr_uint + NVME_REG_CQHDBL), 56 | AEOLUS_ADMIN_QUEUE_DEPTH 57 | ); 58 | AEOLUS_LOG_INFO("Set admin_qp queue attributes done."); 59 | 60 | // Enable controller. 61 | *(uint32_t *)(reg_ptr_uint + NVME_REG_CC) |= NVME_REG_CC_EN; 62 | while (!(*(uint32_t volatile *)(reg_ptr_uint + NVME_REG_CSTS) & NVME_REG_CSTS_RDY)) 63 | ; 64 | AEOLUS_LOG_INFO("Enable controller done."); 65 | 66 | // Set number of I/O queues. We will tentatively set a large number to the queue number 67 | // and then run the get-feature command so as to get the largest queue number supported. 
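    // Set Features with feature ID 0x07 (Number of Queues) takes zero-based requested
    // counts in CDW11 (NCQR in bits 31:16, NSQR in bits 15:0), so 0xfffe asks for the
    // 65535-queue maximum; the controller reports how many it actually allocated in
    // completion dword 0, which the matching Get Features call below reads back.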
68 | 69 | uint32_t status = admin_qp->set_num_queues(0xfffe, 0xfffe); // Maximum queue pairs supported by NVMe. 70 | if (status != 0) 71 | { 72 | AEOLUS_LOG_ERROR("Set number of queues failed with status 0x%x", status); 73 | exit(1); 74 | } 75 | AEOLUS_LOG_INFO("Set number of queues done."); 76 | 77 | uint16_t max_sq_num, max_cq_num; 78 | status = admin_qp->get_num_queues(max_sq_num, max_cq_num); 79 | if (status != 0) 80 | { 81 | AEOLUS_LOG_ERROR("Get number of queues failed with status 0x%x", status); 82 | exit(1); 83 | } 84 | max_queue_num = MIN(max_sq_num, max_cq_num); 85 | AEOLUS_LOG_INFO("Maximum queue number supported: %d.", max_queue_num); 86 | 87 | // Decide the namespace to use. The namespace with the lowest number will be chosen. 88 | 89 | void *temp_buffer; 90 | uint64_t *temp_buffer_phys_addr; 91 | alloc_host_memory(&temp_buffer, AEOLUS_HOST_PGSIZE, &temp_buffer_phys_addr); 92 | status = admin_qp->identify(0x02, 0x0, 0, temp_buffer_phys_addr[0]); 93 | if (status != 0) 94 | { 95 | AEOLUS_LOG_ERROR("Get namespace list failed with status 0x%x", status); 96 | exit(1); 97 | } 98 | active_ns = *((uint32_t *)temp_buffer); 99 | 100 | // Get device capacity. 101 | 102 | status = admin_qp->identify(0x00, 0x0, active_ns, temp_buffer_phys_addr[0]); 103 | if (status != 0) 104 | { 105 | AEOLUS_LOG_ERROR("Get namespace structure 0x%x", status); 106 | exit(1); 107 | } 108 | max_lb_num = *((uint64_t *)temp_buffer); 109 | AEOLUS_LOG_INFO("Active ns: %d, Maximum logical block number supported: %lu.", active_ns, max_lb_num); 110 | 111 | // Get maximum IO size. 112 | 113 | status = admin_qp->identify(0x01, 0x0, 0x0, temp_buffer_phys_addr[0]); 114 | if (status != 0) 115 | { 116 | AEOLUS_LOG_ERROR("Get controller structure failed with status 0x%x", status); 117 | exit(1); 118 | } 119 | max_io_size = *((uint8_t *)((uint64_t)temp_buffer + 77)); 120 | max_io_size = AEOLUS_HOST_PGSIZE * (1 << max_io_size); 121 | AEOLUS_LOG_INFO("Maximum IO size supported: %d B.", max_io_size); 122 | free_host_memory(temp_buffer, temp_buffer_phys_addr); 123 | 124 | // Get free queue pair IDs. 
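    // I/O queue pair IDs from 1 up to the maximum reported above are treated as free and
    // handed out later when I/O queues are created (ID 0 is reserved for the admin queue pair).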
125 | for (int i=1; iptr), size + AEOLUS_DEVICE_PGSIZE)); 166 | *ptr = (void *)((uint64_t)((*context)->ptr) / AEOLUS_DEVICE_PGSIZE * AEOLUS_DEVICE_PGSIZE + AEOLUS_DEVICE_PGSIZE); 167 | int flag = 0; 168 | if ((uint64_t)*ptr != (uint64_t)((*context)->ptr)) 169 | { 170 | flag = 1; 171 | } 172 | (*context)->ioaddrs = malloc(sizeof(uint64_t) * (size / AEOLUS_DEVICE_PGSIZE + flag)); 173 | *phys_addr = (uint64_t*)(*context)->ioaddrs; 174 | nvm_ioctl_map req; 175 | req.vaddr_start = (uint64_t)((*context)->ptr); 176 | req.n_pages = size / AEOLUS_DEVICE_PGSIZE + flag; 177 | req.ioaddrs = *phys_addr; 178 | *phys_addr += flag; 179 | 180 | return ioctl(device_fd, NVM_MAP_DEVICE_MEMORY, &req); 181 | } 182 | 183 | void Device::free_device_memory(aeolus_dev_mem_context* context) 184 | { 185 | ioctl(device_fd, NVM_UNMAP_MEMORY, (uint64_t)(context->ptr)); 186 | free(context->ioaddrs); 187 | AEOLUS_CUDA_CHECK(cudaFree(context->ptr)); 188 | free(context); 189 | } -------------------------------------------------------------------------------- /src/applications/gemm/gemm/spdk_gemm.cu: -------------------------------------------------------------------------------- 1 | #include "lightbam.cuh" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "gemm.cuh" 9 | #include 10 | #include "spdk_read.h" 11 | 12 | typedef float fp_t; 13 | int main(int argc, char *argv[]) 14 | { 15 | if (argc != 10) 16 | { 17 | printf("Usage: %s m n k a_offset b_offset c_offset block_size max_io_size num_ssds\n", argv[0]); 18 | return 1; 19 | } 20 | int m = parse_offset(argv[1]); 21 | int n = parse_offset(argv[2]); 22 | int k = parse_offset(argv[3]); 23 | uint64_t a_offset = parse_offset(argv[4]); 24 | uint64_t b_offset = parse_offset(argv[5]); 25 | uint64_t c_offset = parse_offset(argv[6]); 26 | uint64_t block_size = parse_offset(argv[7]); 27 | uint64_t max_io_size = parse_offset(argv[8]); 28 | int num_ssds = atoi(argv[9]); 29 | if (m % block_size != 0 || n % block_size != 0) 30 | { 31 | std::cout<<"m and n must be a multiple of block_size"<= 0 && i < m_blocks) 98 | { 99 | // clear_wait_flag(); 100 | cudaMemcpyAsync(a0, a1, block_size * k * sizeof(fp_t), cudaMemcpyHostToDevice, streama); 101 | //std::swap(a0, a1); 102 | } 103 | if (i < m_blocks -1) //read phase 104 | { 105 | for (int l = 0; l < num_reqs; l++) 106 | { 107 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 108 | h_reqs[l] = (a_offset + ((i + 1) * block_size * k + offset) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 109 | } 110 | cam_gemm_read(h_reqs,num_reqs,(uintptr_t)a1); 111 | clear_wait_flag(); 112 | } 113 | if(i>=2 && i<= m_blocks+1){ 114 | // std::swap(c0, c1); 115 | cudaMemcpyAsync(c0, c1, block_size * block_size * sizeof(fp_t), cudaMemcpyHostToDevice, streamc); 116 | } 117 | if (i >= 3) //write phase 118 | { 119 | // if (i >= 4) 120 | // { 121 | // clear_wait_flag_write(); 122 | // } 123 | 124 | int num_reqs = CEIL(block_size * block_size * sizeof(fp_t), max_io_size); 125 | for (int l = 0; l < num_reqs; l++) 126 | { 127 | uint64_t offset = 1ll * l * max_io_size / sizeof(fp_t); 128 | int row = (i - 1) * block_size + offset / block_size; 129 | int col = j * block_size + offset % block_size; 130 | h_reqs2[l] = (c_offset + (1ll * row * n + col) * sizeof(fp_t)) / AEOLUS_LB_SIZE; 131 | // h_reqs2[l] = l; 132 | } 133 | cudaStreamSynchronize(streamc); 134 | cam_gemm_write(h_reqs2,num_reqs,(uintptr_t)c1); 135 | clear_wait_flag_write(); 136 | 137 | } 138 | 139 | if (i >= 1 && i <= m_blocks) //gemm compute phase 140 | { 141 | 
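                // Compute phase of the software pipeline: wait for the asynchronous
                // host-to-device copies of the A and B tiles on streama/streamb to finish,
                // then run the tile GEMM with cuBLAS, so SSD reads, H2D copies, write-back
                // and compute from different iterations overlap via the a0/a1, b0/b1, c0/c1
                // double buffers.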
cudaEventRecord(gemm_start, 0); 142 | cudaStreamSynchronize(streama); 143 | cudaStreamSynchronize(streamb); 144 | cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, block_size, block_size, k, &alpha, b0, CUDA_R_32F, block_size, a0, CUDA_R_32F, k, &beta, c0, CUDA_R_32F, block_size, CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); 145 | cudaEventRecord(gemm_stop, 0); 146 | cudaEventSynchronize(gemm_stop); 147 | float ms; 148 | cudaEventElapsedTime(&ms, gemm_start, gemm_stop); 149 | gemm_ms += ms; 150 | } 151 | } 152 | // clear_wait_flag_write(); 153 | } 154 | cudaEventRecord(stop, 0); 155 | cudaEventSynchronize(stop); 156 | float ms; 157 | cudaEventElapsedTime(&ms, start, stop); 158 | printf("m = %d, n = %d, k = %d, block_size = %ld, time = %f ms, tflops = %f\n", m, n, k, block_size, ms, 2.0 * m * n * k / ms / 1e9); 159 | printf("gemm time = %f ms, num_ssds = %d, max_io_size = %ld, num_queues = %d\n", gemm_ms, num_ssds, max_io_size, num_queues_per_ssd); 160 | printf("%d %ld %d %ld %f %f %d\n", n, block_size, num_ssds, max_io_size, gemm_ms, ms, num_queues_per_ssd); 161 | cublasDestroy(handle); 162 | free_pinmemory(a1); 163 | free_pinmemory(b1); 164 | free_pinmemory(c1); 165 | cudaFree(a0); 166 | cudaFree(b0); 167 | cudaFree(c0); 168 | cudaStreamDestroy(streama); 169 | cudaStreamDestroy(streamb); 170 | cudaStreamDestroy(streamc); 171 | free(h_reqs); 172 | cam_clean_up(); 173 | return 0; 174 | } -------------------------------------------------------------------------------- /src/applications/gemm/include/controller.cuh: -------------------------------------------------------------------------------- 1 | #ifndef __AEOLUS_CONTROLLER_CUH 2 | #define __AEOLUS_CONTROLLER_CUH 3 | 4 | #include 5 | #include "device.cuh" 6 | #include "request.cuh" 7 | 8 | enum aeolus_access_type 9 | { 10 | AEOLUS_ACCESS_SEQUENTIAL = 0, 11 | AEOLUS_ACCESS_RANDOM = 1 12 | }; 13 | 14 | enum aeolus_dist_type 15 | { 16 | AEOLUS_DIST_STRIPE = 0, 17 | AEOLUS_DIST_REPLICA = 1 18 | }; 19 | 20 | enum aeolus_buf_type 21 | { 22 | AEOLUS_BUF_USER = 0, 23 | AEOLUS_BUF_PINNED = 1 24 | }; 25 | 26 | /** 27 | * @brief A controller manages multiple IO queues of multiple SSDs and provides a simple interface for user. 28 | * 29 | */ 30 | class Controller 31 | { 32 | protected: 33 | std::vectorssd_list; 34 | int ssd_count; 35 | int gpu_id; 36 | int32_t max_io_size; 37 | int32_t num_queue_per_ssd; 38 | int32_t queue_depth; 39 | aeolus_dist_type dist_type; 40 | aeolus_buf_type buf_type; 41 | 42 | int32_t max_queue_num; 43 | int32_t max_trans_size; 44 | 45 | int32_t **qpid_list; 46 | IoQueuePair *d_ssdqp; 47 | void *d_iobuf_ptr; 48 | uint64_t *d_iobuf_phys; 49 | uint64_t *prp_list; 50 | uint64_t *h_prp_phys; 51 | uint64_t *d_prp_phys; 52 | uint64_t *h_ssd_num_lbs; 53 | uint64_t *d_ssd_num_lbs; 54 | 55 | aeolus_dev_mem_context *qp_ctx; 56 | aeolus_dev_mem_context *iobuf_ctx; 57 | 58 | int* ssd_num_reqs; 59 | Request *distributed_reqs; 60 | int *req_ids; 61 | public: 62 | uint64_t max_lb_number; 63 | uint64_t *qp_phys; 64 | 65 | /** 66 | * @brief Construct a new Controller object. A controller manages multiple IO queues of multiple SSDs and provides a simple interface for user. 67 | * 68 | * @param ssd_list List of SSD devices to be managed by the controller. 69 | * @param num_queue_per_ssd Number of IO queues allocated to each SSD. 70 | * @param max_io_size Maximum IO size in bytes in a single NVMe command. 71 | * @param queue_depth Depth of each IO queue. 72 | * @param dist_type Pattern for data distribution. 
AEOLUS_DIST_STRIPE means data is striped across SSDs, AEOLUS_DIST_REPLICA means data is replicated across SSDs. 73 | * @param buf_type Type of data buffer for IO. AEOLUS_BUF_USER means the buffer can be arbitrary buffer provided by user, AEOLUS_BUF_PINNED means the buffer is pinned by user beforehand. 74 | * @param pinned_buf_phys Physical addresses of the pinned buffer. Only valid when buf_type is AEOLUS_BUF_PINNED. 75 | * @param pinned_buf_size Size of the pinned buffer. Only valid when buf_type is AEOLUS_BUF_PINNED. 76 | */ 77 | Controller( 78 | std::vector ssd_list, 79 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 80 | int32_t max_io_size = 4096, 81 | int32_t queue_depth = 4096, 82 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 83 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 84 | uint64_t *pinned_buf_phys = nullptr, 85 | uint64_t pinned_buf_size = 0 86 | ); 87 | 88 | /** 89 | * @brief Construct a new Controller object. This interface hides the details of queue depth and IO size and provides pre-defined configurations to user. 90 | * 91 | * @param ssd_list List of SSD devices to be managed by the controller. 92 | * @param access_type Preset of IO pattern. AEOLUS_ACCESS_SEQUENTIAL means the user prefers sequential access, AEOLUS_ACCESS_RANDOM means random access. 93 | * @param dist_type Pattern for data distribution. AEOLUS_DIST_STRIPE means data is striped across SSDs, AEOLUS_DIST_REPLICA means data is replicated across SSDs. 94 | * @param buf_type Type of data buffer for IO. AEOLUS_BUF_USER means the buffer can be arbitrary buffer provided by user, AEOLUS_BUF_PINNED means the buffer is pinned by user beforehand. 95 | */ 96 | inline Controller( 97 | std::vector ssd_list, 98 | aeolus_access_type access_type = AEOLUS_ACCESS_SEQUENTIAL, 99 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 100 | aeolus_buf_type buf_type = AEOLUS_BUF_USER 101 | ) : Controller( 102 | ssd_list, 103 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? 8 : AEOLUS_MAX_NUM_QUEUES, 104 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? AEOLUS_MAX_DATA_TRANSFER : 4096, 105 | access_type == AEOLUS_ACCESS_SEQUENTIAL ? 256 : 4096, 106 | dist_type, 107 | buf_type 108 | ) 109 | {}; 110 | 111 | ~Controller(); 112 | void read_data(uint64_t start_lb, uint64_t num_lb, void *buf); 113 | void write_data(uint64_t start_lb, uint64_t num_lb, void *buf); 114 | IoQueuePair *get_io_queue_pair() { return d_ssdqp; } 115 | 116 | /** 117 | * @brief Submit a batch of IO requests to the controller and process them in a helper thread 118 | * until completion. 119 | * 120 | * @param req List of IO requests. 121 | * @param num_req Number of IO requests. 122 | * @param dir Direction of requests. All requests must have the same direction. 123 | * @param stream cuda stream used. 124 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 125 | */ 126 | virtual void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t *d_prp_phys = nullptr) = 0; 127 | 128 | private: 129 | void lb_to_ssd_id(uint64_t lb, int &ssd_id, uint64_t &local_lb); 130 | }; 131 | 132 | /** 133 | * @brief A controller instance with helper thread-based IO processing functions. 134 | * requests are processed in multiple batches. 
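 * Unlike ControllerDecoupled below, submit_io_req() here drives each batch to completion
 * before returning, so no separate poll() call is needed.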
135 | * 136 | */ 137 | class ControllerLegacy : public Controller 138 | { 139 | public: 140 | inline ControllerLegacy( 141 | std::vector ssd_list, 142 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 143 | int32_t max_io_size = 4096, 144 | int32_t queue_depth = 4096, 145 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 146 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 147 | uint64_t *pinned_buf_phys = nullptr, 148 | uint64_t pinned_buf_size = 0 149 | ) : Controller( 150 | ssd_list, 151 | num_queue_per_ssd, 152 | max_io_size, 153 | queue_depth, 154 | dist_type, 155 | buf_type, 156 | pinned_buf_phys, 157 | pinned_buf_size 158 | ) 159 | {}; 160 | 161 | /** 162 | * @brief Submit a batch of IO requests to the controller and process them in a helper thread 163 | * until completion. 164 | * 165 | * @param req List of IO requests. 166 | * @param num_req Number of IO requests. 167 | * @param dir Direction of requests. All requests must have the same direction. 168 | * @param stream cuda stream used. 169 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 170 | */ 171 | void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t *d_prp_phys = nullptr) override; 172 | }; 173 | 174 | /** 175 | * @brief A controller instance with submit-poll processing interface. 176 | * @warning Each instance can only process one batch of request at one time, 177 | * and that the number of requests in a batch should not exceed the queue number times queue depth. 178 | * 179 | */ 180 | class ControllerDecoupled : public Controller 181 | { 182 | private: 183 | int* ssd_num_reqs_prefix_sum; 184 | int num_reqs; 185 | cudaStream_t stream; 186 | aeolus_access_dir dir; 187 | public: 188 | inline ControllerDecoupled( 189 | std::vector ssd_list, 190 | int32_t num_queue_per_ssd = AEOLUS_MAX_NUM_QUEUES, 191 | int32_t max_io_size = 4096, 192 | int32_t queue_depth = 4096, 193 | aeolus_dist_type dist_type = AEOLUS_DIST_STRIPE, 194 | aeolus_buf_type buf_type = AEOLUS_BUF_USER, 195 | uint64_t *pinned_buf_phys = nullptr, 196 | uint64_t pinned_buf_size = 0 197 | ) : Controller( 198 | ssd_list, 199 | num_queue_per_ssd, 200 | max_io_size, 201 | queue_depth, 202 | dist_type, 203 | buf_type, 204 | pinned_buf_phys, 205 | pinned_buf_size 206 | ) 207 | { 208 | AEOLUS_CUDA_CHECK(cudaMalloc(&ssd_num_reqs_prefix_sum, ssd_count * sizeof(int))); 209 | } 210 | 211 | /** 212 | * @brief Submit a batch of IO requests to the NVMe SSDs. User needs to ensure the completion 213 | * of the requests by `poll()` function. 214 | * 215 | * @param req List of IO requests. 216 | * @param num_req Number of IO requests. 217 | * @param dir Direction of requests. All requests must have the same direction. 218 | * @param stream cuda stream used. 219 | * @param d_prp_phys Physical addresses of the PRP list. Only valid when buf_type is AEOLUS_BUF_PINNED. (optional) 220 | * 221 | * @warning User must make sure that number of requests is no greater than the queue number times queue depth. 222 | */ 223 | void submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys = nullptr) override; 224 | 225 | /** 226 | * @brief Poll the in-flight requests until completion. 227 | * 228 | * @warning User must not submit new requests before the completion of the previous batch. 
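     *
     * Minimal usage sketch (illustrative only; `ssds`, `reqs`, `n`, `dir` and `stream` are
     * caller-provided placeholders, not part of this header):
     * @code
     *   ControllerDecoupled ctrl(ssds);              // ssds: std::vector<Device*>
     *   ctrl.submit_io_req(reqs, n, dir, stream);    // dir: an aeolus_access_dir value
     *   ctrl.poll();                                 // block until the whole batch completes
     * @endcode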
229 | */ 230 | void poll(); 231 | }; 232 | 233 | #endif -------------------------------------------------------------------------------- /src/CAM_lib/gpu_transfer.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_transfer.cuh" 2 | 3 | //static struct SmDevice2DeviceSemaphoreDeviceHandle d_sm; 4 | u_int64_t read_block_num = 1000000UL; 5 | //* read arguments 6 | static Host2DeviceSemaphore h_sm; 7 | static u_int64_t *h_data; 8 | __device__ static u_int64_t *d_data; 9 | std::vector a; 10 | 11 | static u_int64_t *h_submit_info; 12 | __device__ static u_int64_t *d_submit_info; 13 | 14 | 15 | //* D2H Semaphore arguments 16 | __device__ uint64_t* D2H_inboundSemaphoreId; 17 | __device__ uint64_t* D2H_expectedInboundSemaphore; 18 | __device__ uint64_t* D2H_outboundSemaphoreId; 19 | __device__ uint64_t* D2H_outboundSemaphoreValue; 20 | __device__ uint64_t* D2H_total_num; 21 | 22 | //* write arguments 23 | static Host2DeviceSemaphore h_sm_2; 24 | static u_int64_t *h_data_2; 25 | __device__ static u_int64_t *d_data_2; 26 | std::vector a_2; 27 | 28 | static u_int64_t *h_submit_info_2; 29 | __device__ static u_int64_t *d_submit_info_2; 30 | 31 | 32 | //* D2H Semaphore arguments 33 | __device__ uint64_t* D2H_inboundSemaphoreId_2; 34 | __device__ uint64_t* D2H_expectedInboundSemaphore_2; 35 | __device__ uint64_t* D2H_outboundSemaphoreId_2; 36 | __device__ uint64_t* D2H_outboundSemaphoreValue_2; 37 | __device__ uint64_t* D2H_total_num_2; 38 | 39 | void SemaphoreInit(cudaStream_t stream1) 40 | { 41 | void* tmp; 42 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 43 | cudaMemcpyToSymbol(D2H_outboundSemaphoreValue, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 44 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 45 | cudaMemcpyToSymbol(D2H_expectedInboundSemaphore, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 46 | void *tmp_h_data; 47 | cudaHostAlloc((void**)&tmp_h_data, sizeof(u_int64_t), cudaHostAllocMapped); 48 | cudaHostGetDevicePointer(&(tmp), (u_int64_t *)tmp_h_data, 0); 49 | cudaMemcpyToSymbol(D2H_outboundSemaphoreId, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 50 | 51 | void *tmp_num; 52 | cudaHostAlloc((void**)&tmp_num, sizeof(u_int64_t), cudaHostAllocMapped); 53 | cudaHostGetDevicePointer((void**)&(D2H_total_num), tmp_num, 0); 54 | 55 | 56 | h_sm.ConnectToStream(stream1); 57 | h_sm.ConnectToDeviceSemaphore(tmp_h_data,(u_int64_t *)tmp_num); 58 | tmp = (uint64_t*)h_sm.GetoutboundSemaphore(); 59 | cudaMemcpyToSymbol(D2H_inboundSemaphoreId, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 60 | 61 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 62 | cudaMemcpyToSymbol(D2H_outboundSemaphoreValue_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 63 | cudaMalloc(&(tmp),sizeof(u_int64_t)); 64 | cudaMemcpyToSymbol(D2H_expectedInboundSemaphore_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 65 | void *tmp_h_data_2; 66 | cudaHostAlloc((void**)&tmp_h_data_2, sizeof(u_int64_t), cudaHostAllocMapped); 67 | cudaHostGetDevicePointer(&(tmp), (u_int64_t *)tmp_h_data_2, 0); 68 | cudaMemcpyToSymbol(D2H_outboundSemaphoreId_2, &tmp, sizeof(uint64_t),0, cudaMemcpyHostToDevice); 69 | 70 | 71 | cudaHostAlloc((void**)&tmp_num, sizeof(u_int64_t), cudaHostAllocMapped); 72 | cudaHostGetDevicePointer((void**)&(D2H_total_num_2), tmp_num, 0); 73 | 74 | 75 | h_sm_2.ConnectToStream(stream1); 76 | h_sm_2.ConnectToDeviceSemaphore(tmp_h_data_2,(u_int64_t *)tmp_num); 77 | tmp = (uint64_t*)h_sm_2.GetoutboundSemaphore(); 78 | cudaMemcpyToSymbol(D2H_inboundSemaphoreId_2, &(tmp), 
sizeof(uint64_t),0, cudaMemcpyHostToDevice); 79 | } 80 | 81 | __device__ void prefetch(int64_t embed_num,uintptr_t *dev_addr) 82 | { 83 | __syncthreads(); 84 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 85 | { 86 | 87 | d_submit_info[0] = 1; 88 | d_submit_info[1] = embed_num; 89 | d_submit_info[2] = (uint64_t)dev_addr; 90 | 91 | *D2H_outboundSemaphoreValue += 1; 92 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 93 | *D2H_outboundSemaphoreId = *D2H_outboundSemaphoreValue; 94 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 95 | } 96 | } 97 | 98 | __device__ void prefetch_syncronize(void) 99 | { 100 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 101 | { 102 | //printf("leadind thread wait\n"); 103 | (*D2H_expectedInboundSemaphore) += 1; 104 | uint64_t value; 105 | uint64_t value2= (*D2H_expectedInboundSemaphore); 106 | //printf("value2: %ld\n",value2); 107 | while(true){ 108 | value= atomicMin((unsigned long long int*)D2H_inboundSemaphoreId,(unsigned long long int)value2); 109 | if(value >= value2) 110 | break; 111 | uint64_t start = 0; 112 | while (start++ < 100000); 113 | } 114 | 115 | //printf("leadind thread wait done\n"); 116 | } 117 | __syncthreads(); 118 | } 119 | 120 | 121 | void polling_thread(void) 122 | { 123 | while(1){ 124 | h_sm.wait(); 125 | uint64_t embed_num = h_submit_info[1]; 126 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 127 | cam_gemm_read(h_data, embed_num,(uintptr_t)gem_memory); 128 | clear_wait_flag(); 129 | h_sm.signal(); 130 | } 131 | } 132 | 133 | 134 | void polling_thread_seq(void) 135 | { 136 | while(1){ 137 | h_sm.wait(); 138 | u_int64_t start_lba = h_submit_info[0]; 139 | uint64_t embed_num = h_submit_info[1]; 140 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 141 | seq_read_submit(start_lba,embed_num,(uintptr_t)gem_memory); 142 | clear_wait_flag(); 143 | h_sm.signal(); 144 | } 145 | } 146 | 147 | void polling_thread_seq_write(void) 148 | { 149 | while(1){ 150 | h_sm.wait(); 151 | u_int64_t start_lba = h_submit_info[0]; 152 | uint64_t embed_num = h_submit_info[1]; 153 | uintptr_t *gem_memory = (uintptr_t *)(h_submit_info[2]); 154 | seq_write_submit(start_lba,embed_num,(uintptr_t)gem_memory); 155 | clear_wait_flag_write(); 156 | h_sm.signal(); 157 | } 158 | } 159 | 160 | __device__ void prefetch_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr) 161 | { 162 | __syncthreads(); 163 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 164 | { 165 | 166 | d_submit_info[0] = start_lba; 167 | d_submit_info[1] = embed_num; 168 | d_submit_info[2] = (uint64_t)dev_addr; 169 | 170 | *D2H_outboundSemaphoreValue += 1; 171 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 172 | *D2H_outboundSemaphoreId = *D2H_outboundSemaphoreValue; 173 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 174 | } 175 | } 176 | 177 | __device__ void writeback_seq(int64_t start_lba,int64_t embed_num,uintptr_t *dev_addr) 178 | { 179 | __syncthreads(); 180 | if((threadIdx.x + blockIdx.x * blockDim.x) == 0) 181 | { 182 | 183 | d_submit_info_2[0] = start_lba; 184 | d_submit_info_2[1] = embed_num; 185 | d_submit_info_2[2] = (uint64_t)dev_addr; 186 | 187 | *D2H_outboundSemaphoreValue_2 += 1; 188 | //printf("D2H_outboundSemaphoreValue: %ld\n",*D2H_outboundSemaphoreValue); 189 | *D2H_outboundSemaphoreId_2 = *D2H_outboundSemaphoreValue_2; 190 | //printf("D2H_outboundSemaphoreId: %ld\n",*D2H_outboundSemaphoreId); 191 | } 192 | } 193 | 194 | 195 | void 
Init(u_int32_t access_size,cudaStream_t stream1) 196 | { 197 | SemaphoreInit(stream1); 198 | // 在主机端分配页锁内存(零拷贝内存) 199 | cudaHostAlloc((void**)&h_data, read_block_num * sizeof(u_int64_t), cudaHostAllocMapped); 200 | // 获取对应的设备指针 201 | cudaHostGetDevicePointer((void**)&d_data, h_data, 0); 202 | 203 | cudaHostAlloc((void**)&h_submit_info, 3 * sizeof(u_int64_t), cudaHostAllocMapped); 204 | 205 | void* tmp; 206 | cudaHostGetDevicePointer((void**)&tmp, h_submit_info, 0); 207 | cudaMemcpyToSymbol(d_submit_info, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 208 | 209 | cudaHostAlloc((void**)&h_data_2, read_block_num * sizeof(u_int64_t), cudaHostAllocMapped); 210 | // 获取对应的设备指针 211 | cudaHostGetDevicePointer((void**)&d_data_2, h_data_2, 0); 212 | 213 | cudaHostAlloc((void**)&h_submit_info_2, 3 * sizeof(u_int64_t), cudaHostAllocMapped); 214 | 215 | 216 | cudaHostGetDevicePointer((void**)&tmp, h_submit_info_2, 0); 217 | cudaMemcpyToSymbol(d_submit_info_2, &(tmp), sizeof(uint64_t),0, cudaMemcpyHostToDevice); 218 | //cam_init(access_size); 219 | init_myKernel<<<1, 1,0,stream1>>>(); 220 | 221 | std::cout<<"init done"<= value2) 299 | break; 300 | // uint64_t start = 0; 301 | // while (start++ < 10000000000); 302 | } 303 | 304 | //printf("leadind thread wait done\n"); 305 | } 306 | __syncthreads(); 307 | } -------------------------------------------------------------------------------- /src/GPU_memory_lib/GPU_memory_management.cpp: -------------------------------------------------------------------------------- 1 | #include "GPU_memory_management.hpp" 2 | 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | // #include 10 | // #include 11 | // #include 12 | // #include 13 | // #include 14 | // #include 15 | // #include 16 | // #include 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | 27 | 28 | #include 29 | #include 30 | 31 | #define ASSERT(x) \ 32 | do \ 33 | { \ 34 | if (!(x)) \ 35 | { \ 36 | fprintf(stderr, "Assertion \"%s\" failed at %s:%d\n", #x, __FILE__, __LINE__); \ 37 | exit(EXIT_FAILURE); \ 38 | } \ 39 | } while (0) 40 | 41 | #define ASSERTDRV(stmt) \ 42 | do \ 43 | { \ 44 | CUresult result = (stmt); \ 45 | if (result != CUDA_SUCCESS) { \ 46 | const char *_err_name; \ 47 | cuGetErrorName(result, &_err_name); \ 48 | fprintf(stderr, "CUDA error: %s\n", _err_name); \ 49 | } \ 50 | ASSERT(CUDA_SUCCESS == result); \ 51 | } while (0) 52 | 53 | #define ASSERT_EQ(P, V) ASSERT((P) == (V)) 54 | #define ASSERT_NEQ(P, V) ASSERT(!((P) == (V))) 55 | 56 | 57 | 58 | [[maybe_unused]] static bool debug_flag = false; 59 | 60 | 61 | 62 | // [[maybe_unused]] static void errorPrint(std::string_view str) { 63 | // fmt::print(fg(fmt::color::red), "{}\n", str); 64 | // } 65 | 66 | // [[maybe_unused]] static void passPrint(std::string_view str) { 67 | // fmt::print(fg(fmt::color::green), "{}\n", str); 68 | // } 69 | 70 | // [[maybe_unused]] static void warnPrint(std::string_view str) { 71 | // fmt::print(fg(fmt::color::yellow), "{}\n", str); 72 | // } 73 | 74 | // [[maybe_unused]] static void infoPrint(std::string_view str) { 75 | // fmt::print(fg(fmt::color::cyan), "{}\n", str); 76 | // } 77 | 78 | static const size_t config_region_size = 256*1024; 79 | static const size_t lite_region_size = 4*1024; 80 | static const size_t bridge_region_size = 1024*1024*1024; 81 | 82 | std::pair findFreeChunk(const std::map &freeChunk, uint64_t mSize) { 83 | for (auto const &it: freeChunk) { 84 | if (it.second >= mSize) { 85 | return {it.first, it.second}; 86 | } 
87 | } 88 | return {0, 0}; 89 | } 90 | 91 | static bool contains(const std::map &mp, uint64_t addr) { 92 | auto it = mp.find(addr); 93 | if (it == mp.end()) { 94 | return false; 95 | } else { 96 | return true; 97 | } 98 | } 99 | 100 | void *MemCtl::alloc(size_t size) { 101 | size = (size + 64UL - 1) & ~(64UL - 1); 102 | std::lock_guard lock(allocMutex); 103 | /*查找大小大于申请空间大小的空闲内存块*/ 104 | auto ck = findFreeChunk(free_chunk, size); 105 | auto &free_addr = ck.first; 106 | auto &free_size = ck.second; 107 | /*如果找到的块为空则报告申请失败*/ 108 | if (free_addr == 0) { 109 | // warnPrint(fmt::format("No Free CPU Chunk. Alloc failed!")); 110 | return nullptr; 111 | } 112 | /*如果内存块分配后仍存在剩余空间, 从内存块高地址部分分配*/ 113 | if (free_size > size) { 114 | free_chunk[free_addr] = free_size - size; 115 | used_chunk[free_addr + free_size - size] = size; 116 | return (void *) (free_addr + free_size - size); 117 | } else { 118 | free_chunk.erase(free_addr); 119 | used_chunk[free_addr] = size; 120 | return (void *) (free_addr); 121 | } 122 | } 123 | 124 | void MemCtl::free(void *ptr) { 125 | std::lock_guard lock(allocMutex); 126 | /*检查释放的内存块的合法性*/ 127 | if (!contains(used_chunk, (uint64_t) ptr)) { 128 | // errorPrint(fmt::format("Pointer to free is not in Alloc Log")); 129 | exit(1); 130 | } 131 | auto it = used_chunk.find((uint64_t) ptr); 132 | uint64_t free_size = it->second; 133 | used_chunk.erase(it); 134 | /*寻找第一个首地址大于ptr的空闲块, 返回map结构的迭代器*/ 135 | auto nextIt = free_chunk.upper_bound((uint64_t) ptr); 136 | if (!free_chunk.empty()) { 137 | auto prevIt = std::prev(nextIt); 138 | /*检查前置空闲块 首地址+块大小 与 释放块首地址 是否连续, 连续则将释放块合并到前置空闲块中*/ 139 | if (prevIt->first + prevIt->second == (uint64_t) ptr) { 140 | free_size += prevIt->second; 141 | ptr = (void *) prevIt->first; 142 | } 143 | } 144 | /*合并后置块*/ 145 | if (nextIt != free_chunk.end() && (uint64_t) ptr + free_size == nextIt->first) { 146 | free_size += nextIt->second; 147 | free_chunk.erase(nextIt); 148 | } 149 | free_chunk[(int64_t) ptr] = free_size; 150 | } 151 | 152 | 153 | 154 | class gdrMemAllocator { 155 | public: 156 | ~gdrMemAllocator(); 157 | 158 | CUresult gpuMemAlloc(CUdeviceptr *pptr, size_t psize, bool align_to_gpu_page = true, bool set_sync_memops = true); 159 | 160 | CUresult gpuMemFree(CUdeviceptr pptr); 161 | 162 | private: 163 | std::map _allocations; 164 | }; 165 | 166 | gdrMemAllocator::~gdrMemAllocator() { 167 | for (auto &it: _allocations) { 168 | CUresult ret; 169 | ret = cuMemFree(it.second); 170 | if (ret != CUDA_SUCCESS) { 171 | // warnPrint(fmt::format("Fail to free cuMemAlloc GPU Memory")); 172 | } 173 | } 174 | } 175 | 176 | CUresult gdrMemAllocator::gpuMemAlloc(CUdeviceptr *pptr, size_t psize, bool align_to_gpu_page, bool set_sync_memops) { 177 | CUresult ret = CUDA_SUCCESS; 178 | CUdeviceptr ptr; 179 | size_t size; 180 | 181 | if (align_to_gpu_page) { 182 | size = psize + GPU_PAGE_SIZE - 1; 183 | } else { 184 | size = psize; 185 | } 186 | 187 | ret = cuMemAlloc(&ptr, size); 188 | if (ret != CUDA_SUCCESS) 189 | return ret; 190 | 191 | if (set_sync_memops) { 192 | unsigned int flag = 1; 193 | ret = cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, ptr); 194 | if (ret != CUDA_SUCCESS) { 195 | cuMemFree(ptr); 196 | return ret; 197 | } 198 | } 199 | 200 | if (align_to_gpu_page) { 201 | *pptr = (ptr + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; 202 | } else { 203 | *pptr = ptr; 204 | } 205 | // Record the actual pointer for doing gpuMemFree later. 
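    // *pptr may have been rounded up to the next GPU page boundary above, so the map keeps
    // the original cuMemAlloc() pointer; gpuMemFree() must release that one, not the aligned
    // alias handed back to the caller.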
206 | _allocations[*pptr] = ptr; 207 | 208 | return CUDA_SUCCESS; 209 | } 210 | 211 | CUresult gdrMemAllocator::gpuMemFree(CUdeviceptr pptr) { 212 | CUresult ret = CUDA_SUCCESS; 213 | CUdeviceptr ptr; 214 | 215 | if (_allocations.count(pptr) > 0) { 216 | ptr = _allocations[pptr]; 217 | ret = cuMemFree(ptr); 218 | if (ret == CUDA_SUCCESS) 219 | _allocations.erase(pptr); 220 | return ret; 221 | } else { 222 | return CUDA_ERROR_INVALID_VALUE; 223 | } 224 | } 225 | 226 | static gdrMemAllocator allocator; 227 | 228 | static int32_t devID{-1}; 229 | 230 | static const gdr_mh_t null_mh = {0}; 231 | 232 | static gdr_t gdrDev{}; 233 | static gdr_mh_t gdrUserMapHandler{null_mh}; 234 | static gpu_tlb_t gdrPageTable{}; 235 | static gdr_info_t info{}; 236 | 237 | static CUdeviceptr devAddr{}; 238 | static void *mapDevPtr{}; 239 | 240 | static inline bool operator==(const gdr_mh_t &a, const gdr_mh_t &b) { 241 | return a.h == b.h; 242 | } 243 | 244 | 245 | static std::vector> gpu_mem_ctl_list; 246 | 247 | GPUMemCtl::GPUMemCtl([[maybe_unused]]uint64_t size) { 248 | 249 | pool_size = size; 250 | auto page_size = 64UL * 1024; 251 | 252 | CUdevice dev; 253 | CUcontext devCtx; 254 | ASSERTDRV(cuInit(0)); 255 | ASSERTDRV(cuDeviceGet(&dev, devID)); 256 | ASSERTDRV(cuDevicePrimaryCtxRetain(&devCtx, dev)); 257 | ASSERTDRV(cuCtxSetCurrent(devCtx)); 258 | 259 | ASSERTDRV(allocator.gpuMemAlloc(&devAddr, size)); 260 | 261 | gdrDev = gdr_open(); 262 | ASSERT_NEQ(gdrDev, nullptr); 263 | 264 | // 64KB * 64K = 4GB 265 | // 4GB * 20 = 80GB 266 | gdrPageTable.pages = new uint64_t[65536 * 20]; 267 | 268 | ASSERT_EQ(rc4ml_pin_buffer(gdrDev, devAddr, size, 0, 0, &gdrUserMapHandler, &gdrPageTable), 0); 269 | ASSERT_NEQ(gdrUserMapHandler, null_mh); 270 | 271 | ASSERT_EQ(gdr_map(gdrDev, gdrUserMapHandler, &mapDevPtr, size), 0); 272 | 273 | ASSERT_EQ(gdr_get_info(gdrDev, gdrUserMapHandler, &info), 0); 274 | 275 | ASSERT_EQ((info.va - devAddr), 0); 276 | ASSERT_EQ((devAddr & (page_size - 1)), 0); 277 | 278 | page_table = {gdrPageTable.page_entries, (uint64_t) (devAddr), gdrPageTable.pages}; 279 | free_chunk.emplace((uint64_t) devAddr, size); 280 | 281 | } 282 | 283 | GPUMemCtl::~GPUMemCtl() { 284 | 285 | const auto size = std::get<0>(page_table) * 64UL * 1024; 286 | delete[] std::get<2>(page_table); 287 | ASSERT_EQ(gdr_unmap(gdrDev, gdrUserMapHandler, mapDevPtr, size), 0); 288 | ASSERT_EQ(gdr_unpin_buffer(gdrDev, gdrUserMapHandler), 0); 289 | ASSERT_EQ(gdr_close(gdrDev), 0); 290 | ASSERTDRV(allocator.gpuMemFree(devAddr)); 291 | 292 | } 293 | 294 | GPUMemCtl *GPUMemCtl::getInstance([[maybe_unused]]int32_t dev_id, [[maybe_unused]]size_t pool_size) { 295 | 296 | if (devID >= 0 && devID != dev_id) { 297 | // errorPrint(fmt::format("This QDMA library now only support one GPU Memory Pool")); 298 | // errorPrint(fmt::format("New device id {} is not equal to previous device id {}", dev_id, devID)); 299 | exit(1); 300 | } 301 | // up round to 64KB 302 | pool_size = (pool_size + 64UL * 1024 - 1) & ~(64UL * 1024 - 1); 303 | 304 | if (pool_size % (2UL * 1024 * 1024) != 0) { 305 | // warnPrint(fmt::format("Suggest GPU Memory Pool Size to be multiple of 2MB for Page Aggregation")); 306 | // errorPrint(fmt::format("For correctness safety, the program will exit. 
Please change the pool size")); 307 | exit(1); 308 | } 309 | 310 | if (gpu_mem_ctl_list.empty()) { 311 | devID = dev_id; 312 | auto tmp = new GPUMemCtl(pool_size); 313 | gpu_mem_ctl_list.push_back(std::shared_ptr(tmp)); 314 | return tmp; 315 | } else { 316 | static bool warn_flag = false; 317 | if (!warn_flag) { 318 | warn_flag = true; 319 | // warnPrint(fmt::format("This QDMA library now only support one GPU Memory Pool")); 320 | // warnPrint(fmt::format("Request pool size will be ignored")); 321 | // warnPrint(fmt::format("The previous GPU Memory Pool with size {} will be returned", 322 | // gpu_mem_ctl_list[0]->getPoolSize())); 323 | } 324 | return gpu_mem_ctl_list[0].get(); 325 | } 326 | 327 | } 328 | 329 | void GPUMemCtl::cleanCtx() { 330 | 331 | gpu_mem_ctl_list.clear(); 332 | 333 | 334 | } 335 | 336 | void GPUMemCtl::writeTLB([[maybe_unused]]const std::function &func, [[maybe_unused]]bool aggr_flag) { 337 | 338 | const auto &[n_pages, vaddr, parray] = page_table; 339 | 340 | if (aggr_flag) { 341 | const auto page_size = 2UL * 1024 * 1024; 342 | auto aggr_n_pages = n_pages / 32; 343 | for (uint32_t i = 0; i < aggr_n_pages; ++i) { 344 | for (uint32_t j = 1; j < 32; ++j) { 345 | ASSERT_EQ((parray[i * 32 + j] - parray[i * 32 + j - 1]), 65536); 346 | } 347 | func(i, page_size, vaddr + i * page_size, parray[i * 32]); 348 | } 349 | } else { 350 | const auto page_size = 64UL * 1024; 351 | for (int i = 0; i < n_pages; i++) { 352 | func(i, page_size, vaddr + i * page_size, parray[i]); 353 | } 354 | } 355 | 356 | } 357 | 358 | uint64_t GPUMemCtl::mapV2P(void *ptr) { 359 | const auto &[n_pages, vaddr, parray] = page_table; 360 | const auto page_size = 64UL * 1024; 361 | uint64_t offset = (uint64_t) ptr - vaddr; 362 | return parray[offset / page_size] + (offset & (page_size - 1)); 363 | } 364 | 365 | void *GPUMemCtl::getDevPtr() const { 366 | 367 | return (void *)devAddr; 368 | 369 | } 370 | 371 | void *GPUMemCtl::getMapDevPtr() const { 372 | 373 | return mapDevPtr; 374 | 375 | } 376 | 377 | bool GPUMemCtl::chechPhyContiguous() const { 378 | 379 | const auto &[n_pages, vaddr, parray] = page_table; 380 | const auto page_size = 64UL * 1024; 381 | for (int i = 1; i < n_pages; i++) { 382 | if (parray[i] - parray[i - 1] != page_size) { 383 | return false; 384 | } 385 | } 386 | return true; 387 | 388 | } 389 | -------------------------------------------------------------------------------- /src/applications/gemm/src/queue.cu: -------------------------------------------------------------------------------- 1 | #include "queue.cuh" 2 | 3 | __host__ __device__ void QueuePair::submit(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 4 | { 5 | fill_sq(cmd_id, sq_tail, opcode, prp1, prp2, dw10, dw11, dw12); 6 | sq_tail = (sq_tail + 1) % queue_depth; 7 | *sqtdbl = sq_tail; 8 | cid = cmd_id; 9 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 10 | } 11 | 12 | __device__ void QueuePair::submit_fence(uint32_t &cid, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 13 | { 14 | fill_sq(cmd_id, sq_tail, opcode, prp1, prp2, dw10, dw11, dw12); 15 | __threadfence_system(); 16 | sq_tail = (sq_tail + 1) % queue_depth; 17 | *sqtdbl = sq_tail; 18 | cid = cmd_id; 19 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 20 | } 21 | 22 | __host__ __device__ void QueuePair::fill_sq(uint32_t cid, uint32_t pos, uint32_t opcode, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12, uint32_t req_id) 23 | { 24 | // if 
(req_id == 1152) 25 | // printf("%lx %lx %x %x %x %x %x %x\n", prp1, prp2, dw10, dw11, dw12, opcode, cid, namespace_id); 26 | sq[pos * 16 + 0] = opcode | (cid << 16); 27 | sq[pos * 16 + 1] = namespace_id; 28 | sq[pos * 16 + 6] = prp1 & 0xffffffff; 29 | sq[pos * 16 + 7] = prp1 >> 32; 30 | sq[pos * 16 + 8] = prp2 & 0xffffffff; 31 | sq[pos * 16 + 9] = prp2 >> 32; 32 | sq[pos * 16 + 10] = dw10; 33 | sq[pos * 16 + 11] = dw11; 34 | sq[pos * 16 + 12] = dw12; 35 | if (cmd_id_to_req_id) 36 | cmd_id_to_req_id[cid % queue_depth] = req_id; 37 | if (cmd_id_to_sq_pos) 38 | cmd_id_to_sq_pos[cid % queue_depth] = pos; 39 | if (sq_entry_busy) 40 | sq_entry_busy[pos] = true; 41 | } 42 | 43 | __host__ __device__ void QueuePair::poll(uint32_t &code, uint32_t cid) 44 | { 45 | uint32_t current_phase = ((cmd_id - 1) / queue_depth) & 1; 46 | uint32_t status = cq[cq_head * 4 + 3]; 47 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 48 | status = cq[cq_head * 4 + 3]; 49 | if ((status & NVME_ENTRY_CID_MASK) != cid) 50 | { 51 | AEOLUS_LOG_ERROR("expected cid: %d, actual cid: %d", cid, status & NVME_ENTRY_CID_MASK); 52 | assert(0); 53 | } 54 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 55 | num_completed++; 56 | cq_head = (cq_head + 1) % queue_depth; 57 | *cqhdbl = cq_head; 58 | } 59 | 60 | __host__ __device__ void QueuePair::poll_with_dw0(uint32_t &code, uint32_t cid, uint32_t &dw0) 61 | { 62 | uint32_t current_phase = ((cmd_id - 1) / queue_depth) & 1; 63 | uint32_t status = cq[cq_head * 4 + 3]; 64 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 65 | status = cq[cq_head * 4 + 3]; 66 | if ((status & NVME_ENTRY_CID_MASK) != cid) 67 | { 68 | AEOLUS_LOG_ERROR("expected cid: %d, actual cid: %d", cid, status & NVME_ENTRY_CID_MASK); 69 | assert(0); 70 | } 71 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 72 | dw0 = cq[cq_head * 4]; 73 | num_completed++; 74 | cq_head = (cq_head + 1) % queue_depth; 75 | *cqhdbl = cq_head; 76 | } 77 | 78 | __device__ void IoQueuePair::poll_range(uint32_t &code, int expected_sq_head, bool should_break) 79 | { 80 | // printf("cmd_id: %d, size: %d, current_phase: %d\n", cmd_id, size, current_phase); 81 | int i; 82 | uint32_t last_sq_head = ~0U; 83 | // int last_num_completed = num_completed; 84 | // int thread_id = threadIdx.x + blockIdx.x * blockDim.x; 85 | for (i = cq_head; (num_completed & NVME_ENTRY_CID_MASK) != (cmd_id & NVME_ENTRY_CID_MASK); i = (i + 1) % queue_depth) 86 | { 87 | uint32_t current_phase = (num_completed / queue_depth) & 1; 88 | uint32_t status = cq[i * 4 + 3]; 89 | uint64_t start = clock64(); 90 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 91 | { 92 | status = cq[i * 4 + 3]; 93 | if (clock64() - start > 1000000000) 94 | { 95 | AEOLUS_LOG_ERROR("timeout sq_tail=%d, cq_head=%d, i=%d, num_completed=%d, cmd_id=%d\n", sq_tail, cq_head, i, num_completed, cmd_id); 96 | AEOLUS_LOG_ERROR("last_sq_head: %d, expected_sq_head: %d\n", last_sq_head, expected_sq_head); 97 | // int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 98 | // if (thread_id) 99 | // return 0; 100 | // for (int m = 0; m < queue_depth; m++) 101 | // { 102 | // printf("SQE %d\n", m); 103 | // for (int n = 0; n < 16; n++) 104 | // printf("DW%2d, %08x\n", n, sq[m * 16 + n]); 105 | // } 106 | // for (int m = 0; m < queue_depth; m++) 107 | // { 108 | // printf("CQE %d\n", m); 109 | // for (int n = 0; n < 4; n++) 110 | // printf("DW%2d, %08x\n", n, cq[m * 4 + n]); 111 | // } 112 | code = 1; 113 | } 114 | } 115 | int cmd_id = status & NVME_ENTRY_CID_MASK; 116 
| int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 117 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 118 | { 119 | printf("cq[%d] status: 0x%x, cid: %d\n", i, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 120 | int req_id = cmd_id_to_req_id[cmd_id % queue_depth]; 121 | printf("req_id: %d, sq_pos: %d\n", req_id, sq_pos); 122 | // for (int i = 0; i < 16; i++) 123 | // printf("%08x ", sq[sq_pos * 16 + i]); 124 | // printf("\n"); 125 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 126 | } 127 | last_sq_head = cq[i * 4 + 2] & NVME_ENTRY_SQ_HEAD_MASK; 128 | sq_entry_busy[sq_pos] = false; 129 | // printf("thread %d freed sq_pos %d\n", thread_id, sq_pos); 130 | num_completed++; 131 | if (should_break && ((cq[i * 4 + 2] & NVME_ENTRY_SQ_HEAD_MASK) - expected_sq_head + queue_depth) % queue_depth <= AEOLUS_WARP_SIZE) 132 | { 133 | // printf("cq[%d] sq_head: %d, expected_sq_head: %d\n", i, cq[i * 4 + 2] & SQ_HEAD_MASK, expected_sq_head); 134 | i = (i + 1) % queue_depth; 135 | // if (num_completed - last_num_completed > 64) 136 | // printf("%d: %d completed\n", thread_id, num_completed - last_num_completed); 137 | break; 138 | } 139 | } 140 | if (i != cq_head) 141 | { 142 | cq_head = i; 143 | // printf("cq_head is %p, set cqhdbl to %d\n", cqhdbl, cq_head); 144 | *cqhdbl = cq_head; 145 | } 146 | code = 0; 147 | } 148 | 149 | __device__ void IoQueuePair::poll_multiple(uint32_t &code, int cnt) 150 | { 151 | for (int i = 0; i < cnt; i++) 152 | { 153 | uint32_t current_phase = (num_completed / queue_depth) & 1; 154 | int pos = (cq_head + i) % queue_depth; 155 | uint32_t status = cq[pos * 4 + 3]; 156 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 157 | status = cq[pos * 4 + 3]; 158 | int cmd_id = status & NVME_ENTRY_CID_MASK; 159 | int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 160 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 161 | { 162 | printf("cq[%d] status: 0x%x, cid: %d\n", pos, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 163 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 164 | } 165 | sq_entry_busy[sq_pos] = false; 166 | num_completed++; 167 | } 168 | cq_head = (cq_head + cnt) % queue_depth; 169 | *cqhdbl = cq_head; 170 | code = 0; 171 | } 172 | 173 | __device__ void IoQueuePair::poll_until_sq_entry_free(uint32_t &code, int expected_sq_pos) { 174 | // int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 175 | // int last_num_completed = num_completed; 176 | // printf("thread %d want to free sq_pos: %d num_completed %d cmd_id %d\n", thread_id, expected_sq_pos, num_completed, cmd_id); 177 | int i; 178 | for (i = cq_head; (num_completed & NVME_ENTRY_CID_MASK) != (cmd_id & NVME_ENTRY_CID_MASK); i = (i + 1) % queue_depth) 179 | { 180 | uint32_t current_phase = (num_completed / queue_depth) & 1; 181 | uint32_t status = cq[i * 4 + 3]; 182 | while (((status & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 183 | status = cq[i * 4 + 3]; 184 | int cmd_id = status & NVME_ENTRY_CID_MASK; 185 | int sq_pos = cmd_id_to_sq_pos[cmd_id % queue_depth]; 186 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 187 | { 188 | printf("cq[%d] status: 0x%x, cid: %d\n", i, (status >> 17) & NVME_ENTRY_SC_MASK, status & NVME_ENTRY_CID_MASK); 189 | int req_id = cmd_id_to_req_id[cmd_id % queue_depth]; 190 | printf("req_id: %d, sq_pos: %d\n", req_id, sq_pos); 191 | // for (int i = 0; i < 16; i++) 192 | // printf("%08x ", sq[sq_pos * 16 + i]); 193 | // printf("\n"); 194 | code = (status >> 17) & NVME_ENTRY_SC_MASK; 195 | } 196 | sq_entry_busy[sq_pos] = false; 197 | // 
printf("thread %d manually freed sq_pos %d\n", thread_id, sq_pos); 198 | num_completed++; 199 | if (sq_pos == expected_sq_pos) 200 | { 201 | cq_head = (i + 1) % queue_depth; 202 | // printf("cq_head is %p, set cqhdbl to %d\n", cqhdbl, cq_head); 203 | *cqhdbl = cq_head; 204 | // if (num_completed - last_num_completed > 64) 205 | // printf("%d: %d completed\n", thread_id, num_completed - last_num_completed); 206 | code = 0; 207 | } 208 | } 209 | // printf("thread %d failed to free sq_pos %d\n", thread_id, expected_sq_pos); 210 | code = 1; 211 | } 212 | 213 | __host__ __device__ void AdminQueuePair::submit_with_ns(uint32_t &cid, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12) 214 | { 215 | fill_sq_with_ns(cmd_id, sq_tail, opcode, nsid, prp1, prp2, dw10, dw11, dw12); 216 | sq_tail = (sq_tail + 1) % queue_depth; 217 | *sqtdbl = sq_tail; 218 | cid = cmd_id; 219 | cmd_id = (cmd_id + 1) & NVME_ENTRY_CID_MASK; 220 | } 221 | __host__ __device__ void AdminQueuePair::fill_sq_with_ns(uint32_t cid, uint32_t pos, uint32_t opcode, uint32_t nsid, uint64_t prp1, uint64_t prp2, uint32_t dw10, uint32_t dw11, uint32_t dw12, uint32_t req_id) 222 | { 223 | sq[pos * 16 + 0] = opcode | (cid << 16); 224 | sq[pos * 16 + 1] = nsid; 225 | sq[pos * 16 + 6] = prp1 & 0xffffffff; 226 | sq[pos * 16 + 7] = prp1 >> 32; 227 | sq[pos * 16 + 8] = prp2 & 0xffffffff; 228 | sq[pos * 16 + 9] = prp2 >> 32; 229 | sq[pos * 16 + 10] = dw10; 230 | sq[pos * 16 + 11] = dw11; 231 | sq[pos * 16 + 12] = dw12; 232 | if (cmd_id_to_req_id) 233 | cmd_id_to_req_id[cid % queue_depth] = req_id; 234 | if (cmd_id_to_sq_pos) 235 | cmd_id_to_sq_pos[cid % queue_depth] = pos; 236 | if (sq_entry_busy) 237 | sq_entry_busy[pos] = true; 238 | } 239 | 240 | __host__ uint32_t AdminQueuePair::set_num_queues(uint16_t nsqr, uint16_t ncqr) 241 | { 242 | uint32_t cid; 243 | submit( 244 | cid, NVME_ADMIN_OPCODE_SET_FEATURES, 0x0, 0x0, NVME_FEATURE_ID_NUM_QUEUES, 245 | ((ncqr-1) << 16) | (nsqr-1) 246 | ); 247 | uint32_t status; 248 | poll(status, cid); 249 | return status; 250 | } 251 | 252 | __host__ uint32_t AdminQueuePair::get_num_queues(uint16_t &nsqa, uint16_t &ncqa) 253 | { 254 | uint32_t cid; 255 | submit( 256 | cid, NVME_ADMIN_OPCODE_GET_FEATURES, 0x0, 0x0, NVME_FEATURE_ID_NUM_QUEUES, 0x0 257 | ); 258 | uint32_t dw0; 259 | uint32_t status; 260 | poll_with_dw0(status, cid, dw0); 261 | nsqa = (dw0 & 0xffff) + 1; 262 | ncqa = ((dw0 >> 16) & 0xffff) + 1; 263 | return status; 264 | } 265 | 266 | __host__ uint32_t AdminQueuePair::identify(uint8_t cns, uint16_t cntid, uint32_t nsid, uint64_t prp1) 267 | { 268 | uint32_t cid; 269 | submit_with_ns( 270 | cid, NVME_ADMIN_OPCODE_IDENTIFY, nsid, prp1, 0x0, cns | (cntid << 16), 0x0 271 | ); 272 | uint32_t status; 273 | poll(status, cid); 274 | return status; 275 | } 276 | 277 | __host__ uint32_t AdminQueuePair::create_cq_cont(uint16_t cqid, uint64_t cq_phys, uint16_t queue_depth) 278 | { 279 | uint32_t cid; 280 | submit( 281 | cid, NVME_ADMIN_OPCODE_CREATE_CQ, cq_phys, 0x0, cqid | ((queue_depth-1) << 16), 0x1 282 | ); 283 | uint32_t status; 284 | poll(status, cid); 285 | return status; 286 | } 287 | 288 | __host__ uint32_t AdminQueuePair::create_sq_cont(uint16_t sqid, uint16_t cqid, uint64_t sq_phys, uint16_t queue_depth) 289 | { 290 | uint32_t cid; 291 | submit( 292 | cid, NVME_ADMIN_OPCODE_CREATE_SQ, sq_phys, 0x0, sqid | ((queue_depth-1) << 16), (cqid << 16) | 0x1 293 | ); 294 | uint32_t status; 295 | poll(status, cid); 296 | return status; 297 | } 298 | 299 | 
__host__ uint32_t AdminQueuePair::delete_sq(uint16_t sqid) 300 | { 301 | uint32_t cid; 302 | submit( 303 | cid, NVME_ADMIN_OPCODE_DELETE_SQ, 0x0, 0x0, sqid, 0x0 304 | ); 305 | uint32_t status; 306 | poll(status, cid); 307 | return status; 308 | } 309 | 310 | __host__ uint32_t AdminQueuePair::delete_cq(uint16_t cqid) 311 | { 312 | uint32_t cid; 313 | submit( 314 | cid, NVME_ADMIN_OPCODE_DELETE_CQ, 0x0, 0x0, cqid, 0x0 315 | ); 316 | uint32_t status; 317 | poll(status, cid); 318 | return status; 319 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | #include 3 | 4 | Controller::Controller( 5 | std::vector ssd_list, int32_t num_queue_per_ssd, int32_t max_io_size, 6 | int32_t queue_depth, aeolus_dist_type dist_type, aeolus_buf_type buf_type, 7 | uint64_t *pinned_buf_phys, uint64_t pinned_buf_size 8 | ) 9 | { 10 | // Check if the input parameters are valid. 11 | 12 | AEOLUS_CUDA_CHECK(cudaGetDevice(&gpu_id)); 13 | ssd_count = ssd_list.size(); 14 | if (ssd_count <= 0) 15 | { 16 | AEOLUS_LOG_ERROR("Empty SSD list delivered to Controller."); 17 | exit(-1); 18 | } 19 | // Get maximum queue number and IO size of SSDs. 20 | max_queue_num = INT_MAX; 21 | max_trans_size = INT_MAX; 22 | for (auto ssd : ssd_list) 23 | { 24 | max_queue_num = MIN(max_queue_num, ssd->free_qps.size()); 25 | max_trans_size = MIN(max_trans_size, ssd->max_io_size); 26 | } 27 | if (num_queue_per_ssd < 0) 28 | { 29 | num_queue_per_ssd = max_queue_num + 1 + num_queue_per_ssd; 30 | } 31 | if (num_queue_per_ssd <= 0 || num_queue_per_ssd > max_queue_num) 32 | { 33 | AEOLUS_LOG_ERROR( 34 | "Invalid queue number per SSD delivered to Controller. " 35 | "The range should be between 1 and %d.", max_queue_num 36 | ); 37 | exit(-1); 38 | } 39 | if (max_io_size == AEOLUS_MAX_DATA_TRANSFER) 40 | { 41 | // Less than 2 MiB IO size is to ensure the PRP list of a request won't exceed a page. 42 | max_io_size = MIN(max_trans_size, 2*1024*1024); 43 | } 44 | if (max_io_size < 512 || max_io_size > max_trans_size || max_io_size > 2*1024*1024) 45 | { 46 | AEOLUS_LOG_ERROR( 47 | "Invalid max io size delivered to Controller. " 48 | "The range should be between 512 and %d.", MIN(max_trans_size, 2*1024*1024) 49 | ); 50 | exit(-1); 51 | } 52 | if (!isPowerOfTwo(max_io_size)) 53 | { 54 | AEOLUS_LOG_ERROR( 55 | "Invalid max io size delivered to Controller. " 56 | "The value should be a power of 2." 57 | ); 58 | exit(-1); 59 | } 60 | 61 | this->ssd_list = ssd_list; 62 | this->num_queue_per_ssd = num_queue_per_ssd; 63 | this->max_io_size = max_io_size; 64 | this->queue_depth = queue_depth; 65 | this->dist_type = dist_type; 66 | this->buf_type = buf_type; 67 | 68 | // Compute SSD LB prefix sum. 69 | 70 | h_ssd_num_lbs = new uint64_t[ssd_count]; 71 | for (int i = 0; i < ssd_count; i++) 72 | h_ssd_num_lbs[i] = ssd_list[i]->max_lb_num; 73 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_ssd_num_lbs, ssd_count * sizeof(uint64_t))); 74 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_ssd_num_lbs, h_ssd_num_lbs, ssd_count * sizeof(uint64_t), cudaMemcpyHostToDevice)); 75 | 76 | // Alloc shared buffers.
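// Shared bookkeeping used by the request-distribution kernels in controller_decouple.cu and controller_legacy.cu: ssd_num_reqs counts how many requests target each SSD, distributed_reqs holds the requests regrouped by target SSD, and req_ids provides per-SSD cursors while scattering requests into that array.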
77 | 78 | AEOLUS_CUDA_CHECK(cudaMalloc(&ssd_num_reqs, ssd_count * sizeof(int))); 79 | if (dist_type != AEOLUS_DIST_STRIPE) 80 | { 81 | AEOLUS_LOG_ERROR("Controller only supports AEOLUS_DIST_STRIPE distribution type for now\n"); 82 | } 83 | AEOLUS_CUDA_CHECK(cudaMalloc(&distributed_reqs, AEOLUS_MAX_NUM_REQUESTS * sizeof(Request))); 84 | AEOLUS_CUDA_CHECK(cudaMalloc(&req_ids, ssd_count * sizeof(int))); 85 | 86 | // Create SSD IO queue pairs. 87 | 88 | qpid_list = new int32_t *[ssd_count]; 89 | for (int i=0; ifree_qps[0]; 95 | ssd_list[i]->free_qps.erase(ssd_list[i]->free_qps.begin()); 96 | } 97 | } 98 | 99 | int sq_size = MAX(AEOLUS_HOST_PGSIZE, queue_depth*NVME_SQ_ENTRY_SIZE); 100 | assert((sq_size % AEOLUS_HOST_PGSIZE) == 0); 101 | 102 | void *d_qp_ptr; 103 | int ret = ssd_list[0]->alloc_device_memory(&d_qp_ptr, &qp_ctx, 2*sq_size*ssd_count*num_queue_per_ssd, &qp_phys); 104 | if (ret != 0) 105 | { 106 | AEOLUS_LOG_ERROR("Failed to allocate device memory for SSD IO queues: %s", strerror(ret)); 107 | exit(-1); 108 | } 109 | AEOLUS_CUDA_CHECK(cudaMemset(d_qp_ptr, 0, 2*sq_size*ssd_count*num_queue_per_ssd)); 110 | 111 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_ssdqp, ssd_count*num_queue_per_ssd*sizeof(IoQueuePair))); 112 | 113 | for (int i=0; iadmin_qp->create_cq_cont(qid, cq_phys, queue_depth); 126 | if (ret != 0) 127 | { 128 | AEOLUS_LOG_ERROR( 129 | "Failed to create CQ %d for SSD %d with status 0x%x", 130 | qid, i, ret 131 | ); 132 | exit(-1); 133 | } 134 | 135 | // Create SQ. 136 | offset = sq_size * (2*i*num_queue_per_ssd+2*j); 137 | uint64_t sq_phys = qp_phys[offset / AEOLUS_DEVICE_PGSIZE] + offset % AEOLUS_DEVICE_PGSIZE; 138 | ret = ssd_list[i]->admin_qp->create_sq_cont(qid, qid, sq_phys, queue_depth); 139 | if (ret != 0) 140 | { 141 | AEOLUS_LOG_ERROR( 142 | "Failed to create SQ %d for SSD %d with status 0x%x", 143 | qid, i, ret 144 | ); 145 | exit(-1); 146 | } 147 | // AEOLUS_LOG_INFO("CQ phy addr: 0x%lx, SQ phy addr: 0x%lx", cq_phys, sq_phys); 148 | 149 | // Create auxiliary data structures. 150 | uint32_t *d_cmd_id_to_req_id; 151 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_cmd_id_to_req_id, sizeof(uint32_t)*queue_depth)); 152 | uint32_t *d_cmd_id_to_sq_pos; 153 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_cmd_id_to_sq_pos, sizeof(uint32_t)*queue_depth)); 154 | bool *d_sq_entry_busy; 155 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_sq_entry_busy, 1*queue_depth)); 156 | AEOLUS_CUDA_CHECK(cudaMemset(d_sq_entry_busy, 0, 1*queue_depth)); 157 | IoQueuePair h_ssdqp( 158 | (volatile uint32_t *)sq_virt, (volatile uint32_t *)cq_virt, 159 | ssd_list[i]->active_ns, 160 | (uint32_t *)((uint64_t)ssd_list[i]->reg_ptr + NVME_REG_SQTDBL + qid * NVME_DBLSTRIDE), 161 | (uint32_t *)((uint64_t)ssd_list[i]->reg_ptr + NVME_REG_CQHDBL + qid * NVME_DBLSTRIDE), 162 | queue_depth, d_cmd_id_to_req_id, d_cmd_id_to_sq_pos, d_sq_entry_busy 163 | ); 164 | // AEOLUS_LOG_INFO("Created SSD IO queue pair %d for SSD %d.", qid, i); 165 | AEOLUS_CUDA_CHECK(cudaMemcpy( 166 | d_ssdqp + i*num_queue_per_ssd+j, &h_ssdqp, 167 | sizeof(IoQueuePair), cudaMemcpyHostToDevice 168 | )); 169 | } 170 | } 171 | 172 | uint64_t io_buf_size = (uint64_t)max_io_size*ssd_count*num_queue_per_ssd*queue_depth; 173 | uint64_t *h_iobuf_phys; 174 | if (buf_type == AEOLUS_BUF_USER) { 175 | 176 | // Allocate IO buffer. 
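// The bounce buffer reserves one max_io_size slot for every possible outstanding command: queue_depth slots per queue, num_queue_per_ssd queues per SSD, across ssd_count SSDs, matching io_buf_size computed above.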
177 | 178 | AEOLUS_LOG_INFO("Allocating IO buffer."); 179 | int ret = ssd_list[0]->alloc_device_memory( 180 | &d_iobuf_ptr, &iobuf_ctx, io_buf_size, &h_iobuf_phys 181 | ); 182 | if (ret != 0) 183 | { 184 | AEOLUS_LOG_ERROR("Failed to allocate device memory for IO buffer: %s", strerror(ret)); 185 | exit(-1); 186 | } 187 | } else 188 | { 189 | h_iobuf_phys = pinned_buf_phys; 190 | io_buf_size = pinned_buf_size; 191 | } 192 | if (h_iobuf_phys != nullptr) 193 | { 194 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_iobuf_phys, sizeof(uint64_t))); 195 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_iobuf_phys, h_iobuf_phys, sizeof(uint64_t), cudaMemcpyHostToDevice)); 196 | } 197 | 198 | // Allocate PRP list. 199 | 200 | if (max_io_size > AEOLUS_HOST_PGSIZE * 2) { 201 | uint64_t prp_list_size = io_buf_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); 202 | if (io_buf_size > 0) 203 | { 204 | AEOLUS_LOG_INFO("Allocating PRP buffer."); 205 | ssd_list[0]->alloc_host_memory((void **)&prp_list, prp_list_size, &h_prp_phys); 206 | 207 | // Fill in PRP table. 208 | for (int i = 0; i < io_buf_size / AEOLUS_DEVICE_PGSIZE; i++) 209 | { 210 | for (int j = 0; j < AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE; j++) 211 | { 212 | if (i == 0 && j == 0) 213 | { 214 | continue; 215 | } 216 | prp_list[i * AEOLUS_DEVICE_PGSIZE / AEOLUS_HOST_PGSIZE + j - 1] = 217 | h_iobuf_phys[i] + j * AEOLUS_HOST_PGSIZE; 218 | } 219 | } 220 | } 221 | 222 | // Move PRP physical address to GPU. 223 | size_t prp_phys_size = CEIL(prp_list_size, AEOLUS_HOST_PGSIZE) * sizeof(uint64_t); 224 | AEOLUS_CUDA_CHECK(cudaMalloc(&d_prp_phys, prp_phys_size)); 225 | AEOLUS_CUDA_CHECK(cudaMemcpy(d_prp_phys, h_prp_phys, prp_phys_size, cudaMemcpyHostToDevice)); 226 | } 227 | } 228 | 229 | Controller::~Controller() 230 | { 231 | AEOLUS_LOG_INFO("Cleaning up controller."); 232 | if (buf_type == AEOLUS_BUF_USER) { 233 | if (max_io_size > 8192) { 234 | AEOLUS_CUDA_CHECK(cudaFree(d_prp_phys)); 235 | ssd_list[0]->free_host_memory(prp_list, h_prp_phys); 236 | } 237 | ssd_list[0]->free_device_memory(iobuf_ctx); 238 | AEOLUS_CUDA_CHECK(cudaFree(d_iobuf_phys)); 239 | } 240 | 241 | IoQueuePair *h_ssdqp = (IoQueuePair *)malloc(sizeof(IoQueuePair)); 242 | for (int i=0; icmd_id_to_req_id)); 251 | AEOLUS_CUDA_CHECK(cudaFree(h_ssdqp->cmd_id_to_sq_pos)); 252 | AEOLUS_CUDA_CHECK(cudaFree(h_ssdqp->sq_entry_busy)); 253 | ssd_list[i]->admin_qp->delete_sq(qpid_list[i][j]); 254 | ssd_list[i]->admin_qp->delete_cq(qpid_list[i][j]); 255 | ssd_list[i]->free_qps.push_back(qpid_list[i][j]); 256 | } 257 | delete [] qpid_list[i]; 258 | } 259 | delete [] qpid_list; 260 | 261 | AEOLUS_CUDA_CHECK(cudaFree(d_ssdqp)); 262 | ssd_list[0]->free_device_memory(qp_ctx); 263 | free(h_ssdqp); 264 | delete [] h_ssd_num_lbs; 265 | AEOLUS_CUDA_CHECK(cudaFree(d_ssd_num_lbs)); 266 | AEOLUS_LOG_INFO("Cleaning up controller done."); 267 | } 268 | 269 | __global__ static void rw_data_kernel(uint32_t opcode, int ssd_id, uint64_t start_lb, uint64_t num_lb, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 270 | { 271 | uint32_t cid; 272 | int global_queue_id = ssd_id * num_queues_per_ssd; 273 | uint64_t global_pos = (uint64_t)global_queue_id * queue_depth; 274 | uint64_t io_addr; 275 | if (buf_type == AEOLUS_BUF_USER) 276 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
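// With AEOLUS_BUF_USER the IO buffer is physically contiguous, so PRP1 is just a fixed offset into it; PRP2 is set below to either the second host page of the transfer or, when the transfer exceeds two host pages, the physical address of this request's PRP list.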
277 | else 278 | { 279 | io_addr = prp1[0]; 280 | global_pos = 0; 281 | } 282 | uint64_t io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 283 | if (num_lb * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 284 | { 285 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 286 | uint64_t offset = global_pos * prp_size; 287 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 288 | } 289 | ssdqp[global_queue_id].submit(cid, opcode, io_addr, io_addr2, start_lb & 0xffffffff, (start_lb >> 32) & 0xffffffff, NVME_RW_LIMITED_RETRY_MASK | (num_lb - 1)); 290 | uint32_t status; 291 | ssdqp[global_queue_id].poll(status, cid); 292 | // printf("ssd_id: %d, start_lb: %lu, cmd_id: %u\n", ssd_id, start_lb, ssdqp[global_queue_id].cmd_id); 293 | if (status != 0) 294 | { 295 | AEOLUS_LOG_ERROR("read/write failed with status 0x%x\n", status); 296 | assert(0); 297 | } 298 | } 299 | 300 | void Controller::lb_to_ssd_id(uint64_t lb, int &ssd_id, uint64_t &local_lb) 301 | { 302 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 303 | if (lb % lbs_per_max_io_size != 0) 304 | { 305 | AEOLUS_LOG_ERROR("Unaligned start LB %lu is unsupported now", lb); 306 | exit(-1); 307 | } 308 | ssd_id = lb / lbs_per_max_io_size % ssd_count; 309 | local_lb = lb / lbs_per_max_io_size / ssd_count * lbs_per_max_io_size; 310 | if (local_lb >= h_ssd_num_lbs[ssd_id]) 311 | { 312 | AEOLUS_LOG_ERROR("Out of bound start LB %lu", lb); 313 | exit(-1); 314 | } 315 | } 316 | 317 | void Controller::read_data(uint64_t start_lb, uint64_t num_lb, void *buf) 318 | { 319 | int ssd_id; 320 | uint64_t local_lb; 321 | lb_to_ssd_id(start_lb, ssd_id, local_lb); 322 | rw_data_kernel<<<1, 1>>>( 323 | NVME_OPCODE_READ, ssd_id, local_lb, num_lb, num_queue_per_ssd, 324 | d_ssdqp, d_iobuf_phys, d_prp_phys, queue_depth, max_io_size, buf_type 325 | ); 326 | if (buf_type == AEOLUS_BUF_USER) { 327 | AEOLUS_CUDA_CHECK(cudaMemcpy( 328 | buf, (uint8_t *)d_iobuf_ptr + (uint64_t)ssd_id * num_queue_per_ssd * 329 | queue_depth * max_io_size, 330 | num_lb * AEOLUS_LB_SIZE, cudaMemcpyDeviceToHost 331 | )); 332 | } else { 333 | // TODO! 334 | } 335 | } 336 | 337 | void Controller::write_data(uint64_t start_lb, uint64_t num_lb, void *buf) 338 | { 339 | int ssd_id; 340 | uint64_t local_lb; 341 | lb_to_ssd_id(start_lb, ssd_id, local_lb); 342 | if (buf_type == AEOLUS_BUF_USER) { 343 | AEOLUS_CUDA_CHECK(cudaMemcpy( 344 | (uint8_t *)d_iobuf_ptr + (uint64_t)ssd_id * num_queue_per_ssd * 345 | queue_depth * max_io_size, 346 | buf, num_lb * AEOLUS_LB_SIZE, cudaMemcpyHostToDevice 347 | )); 348 | } else { 349 | // TODO! 350 | } 351 | rw_data_kernel<<<1, 1>>>( 352 | NVME_OPCODE_WRITE, ssd_id, local_lb, num_lb, num_queue_per_ssd, 353 | d_ssdqp, d_iobuf_phys, d_prp_phys, queue_depth, max_io_size, buf_type 354 | ); 355 | // AEOLUS_CUDA_CHECK(cudaDeviceSynchronize()); 356 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller_decouple.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | 3 | __device__ static int req_id_to_ssd_id(int req_id, int num_ssds, int *ssd_num_reqs_prefix_sum) 4 | { 5 | int ssd_id = 0; 6 | for (; ssd_id < num_ssds; ssd_id++) 7 | if (ssd_num_reqs_prefix_sum[ssd_id] > req_id) 8 | break; 9 | return ssd_id; 10 | } 11 | 12 | // Do NOT use std::pair in device function! 
Though this can be bypassed by --expt-relaxed-constexpr flag, 13 | // it may contain bugs. 14 | __device__ static void lb_to_ssd_id(uint64_t lb, int num_ssds, uint64_t *ssd_num_lbs, int max_io_size, int &ssd_id, uint64_t &start_lb) 15 | { 16 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 17 | assert(lb % lbs_per_max_io_size == 0); 18 | ssd_id = lb / lbs_per_max_io_size % num_ssds; 19 | start_lb = lb / lbs_per_max_io_size / num_ssds * lbs_per_max_io_size; 20 | assert(start_lb < ssd_num_lbs[ssd_id]); 21 | } 22 | 23 | __global__ static void submit_io_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *prp2, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size, uint32_t opcode, aeolus_buf_type buf_type) 24 | { 25 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 26 | int num_threads = blockDim.x * gridDim.x; 27 | for (int i = thread_id; i < num_reqs; i += num_threads) 28 | { 29 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 30 | if (ssd_id >= num_ssds) 31 | break; 32 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 33 | int queue_id = req_offset / (queue_depth - 1); 34 | if (queue_id >= num_queues_per_ssd) 35 | printf("%d %d\n", queue_id, num_queues_per_ssd); 36 | assert(queue_id < num_queues_per_ssd); 37 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 38 | int id_in_queue = req_offset % (queue_depth - 1); 39 | int queue_pos = (ssdqp[global_queue_id].sq_tail + id_in_queue) % queue_depth; 40 | 41 | uint64_t global_pos = (uint64_t)global_queue_id * queue_depth + queue_pos; 42 | uint64_t io_addr; 43 | uint64_t io_addr2; 44 | if (buf_type == AEOLUS_BUF_USER) 45 | { 46 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 47 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 48 | } 49 | else 50 | { 51 | io_addr = reqs[i].dest_addr; 52 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 53 | global_pos = reqs[i].next_addr; // io_size > 8KB 54 | } 55 | if (reqs[i].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 56 | { 57 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 58 | uint64_t offset = global_pos * prp_size; 59 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 60 | } 61 | ssdqp[global_queue_id].fill_sq( 62 | ssdqp[global_queue_id].cmd_id + id_in_queue, // command id 63 | queue_pos, // position in SQ 64 | opcode, // opcode 65 | io_addr, // prp1 66 | io_addr2, // prp2 67 | reqs[i].start_lb & 0xffffffff, // start lb low 68 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 69 | NVME_RW_LIMITED_RETRY_MASK | (reqs[i].num_items - 1), // number of LBs 70 | i // req id 71 | ); 72 | } 73 | } 74 | 75 | __global__ static void ring_sq_doorbell_kernel(int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int num_reqs, int queue_depth) 76 | { 77 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 78 | int num_threads = blockDim.x * gridDim.x; 79 | for (int i = thread_id; i < num_reqs; i += num_threads) 80 | { 81 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 82 | if (ssd_id >= num_ssds) 83 | break; 84 | int req_offset = i - (ssd_id == 0 ? 
0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 85 | int queue_id = req_offset / (queue_depth - 1); 86 | assert(queue_id < num_queues_per_ssd); 87 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 88 | int id_in_queue = req_offset % (queue_depth - 1); 89 | 90 | if (id_in_queue == 0) 91 | { 92 | int cnt = ssd_num_reqs[ssd_id] - queue_id * (queue_depth - 1); 93 | if (cnt > queue_depth - 1) 94 | cnt = queue_depth - 1; 95 | ssdqp[global_queue_id].cmd_id += cnt; 96 | ssdqp[global_queue_id].sq_tail = (ssdqp[global_queue_id].sq_tail + cnt) % queue_depth; 97 | // printf("thread %d ssd %d queue %d end req %d cnt %d\n", thread_id, ssd_id, queue_id, ssd_num_reqs_prefix_sum[ssd_id], cnt); 98 | *ssdqp[global_queue_id].sqtdbl = ssdqp[global_queue_id].sq_tail; 99 | } 100 | } 101 | } 102 | 103 | __global__ static void copy_io_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 104 | { 105 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 106 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 107 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 108 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 109 | for (int i = warp_id; i < num_reqs; i += num_warps) 110 | { 111 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 112 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 113 | int queue_id = req_offset / (queue_depth - 1); 114 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 115 | int id_in_queue = req_offset % (queue_depth - 1); 116 | int complete_id = ssdqp[global_queue_id].num_completed + id_in_queue; 117 | int queue_pos = complete_id % queue_depth; 118 | 119 | if (lane_id == 0) 120 | { 121 | // printf("polling req %d ssd %d queue %d complete_id %d queue_pos %d num_completed %d\n", i, ssd_id, queue_id, complete_id, queue_pos, ssdqp[global_queue_id].num_completed); 122 | uint32_t current_phase = (complete_id / queue_depth) & 1; 123 | while (((ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 124 | ; 125 | uint32_t status = ssdqp[global_queue_id].cq[queue_pos * 4 + 3]; 126 | uint32_t cmd_id = status & NVME_ENTRY_CID_MASK; 127 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 128 | { 129 | AEOLUS_LOG_ERROR("thread %d cq[%d] status: 0x%x, cid: %d\n", thread_id, queue_pos, (status >> 17) & NVME_ENTRY_SC_MASK, cmd_id); 130 | assert(0); 131 | } 132 | } 133 | 134 | if (buf_type == AEOLUS_BUF_USER) 135 | { 136 | int cmd_id = ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_CID_MASK; 137 | int req_id = ssdqp[global_queue_id].cmd_id_to_req_id[cmd_id % queue_depth]; 138 | int sq_pos = ssdqp[global_queue_id].cmd_id_to_sq_pos[cmd_id % queue_depth]; 139 | for (int j = lane_id; j < reqs[req_id].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 140 | ((uint64_t *)reqs[req_id].dest_addr)[j] = IO_buf_base[(uint64_t)global_queue_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + j]; 141 | } 142 | } 143 | } 144 | 145 | __global__ static void ring_cq_doorbell_kernel(int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int num_reqs, int queue_depth) 146 | { 147 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 148 | int num_threads = blockDim.x * gridDim.x; 149 | for (int i = thread_id; i < num_reqs; i += num_threads) 150 | { 151 | int ssd_id = req_id_to_ssd_id(i, 
num_ssds, ssd_num_reqs_prefix_sum); 152 | if (ssd_id >= num_ssds) 153 | break; 154 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 155 | int queue_id = req_offset / (queue_depth - 1); 156 | assert(queue_id < num_queues_per_ssd); 157 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 158 | int id_in_queue = req_offset % (queue_depth - 1); 159 | 160 | if (id_in_queue == 0) 161 | { 162 | int cnt = ssd_num_reqs[ssd_id] - queue_id * (queue_depth - 1); 163 | if (cnt > queue_depth - 1) 164 | cnt = queue_depth - 1; 165 | ssdqp[global_queue_id].num_completed += cnt; 166 | ssdqp[global_queue_id].cq_head = (ssdqp[global_queue_id].cq_head + cnt) % queue_depth; 167 | *ssdqp[global_queue_id].cqhdbl = ssdqp[global_queue_id].cq_head; 168 | // printf("queue %d num_completed %d cq_head %d\n", global_queue_id, ssdqp[global_queue_id].num_completed, ssdqp[global_queue_id].cq_head); 169 | } 170 | } 171 | } 172 | 173 | __global__ static void copy_write_data_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size) 174 | { 175 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 176 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 177 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 178 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 179 | for (int i = warp_id; i < num_reqs; i += num_warps) 180 | { 181 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 182 | int req_offset = i - (ssd_id == 0 ? 0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 183 | int queue_id = req_offset / (queue_depth - 1); 184 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 185 | int id_in_queue = req_offset % (queue_depth - 1); 186 | int queue_pos = (ssdqp[global_queue_id].sq_tail + id_in_queue) % queue_depth; 187 | 188 | for (int j = lane_id; j < reqs[i].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 189 | IO_buf_base[(uint64_t)global_queue_id * queue_depth * max_io_size / 8 + queue_pos * max_io_size / 8 + j] = ((uint64_t *)reqs[i].dest_addr)[j]; 190 | } 191 | } 192 | 193 | __global__ static void poll_write_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_queues_per_ssd, IoQueuePair *ssdqp, uint64_t *IO_buf_base, int *ssd_num_reqs_prefix_sum, int queue_depth, int max_io_size) 194 | { 195 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 196 | int warp_id = thread_id / AEOLUS_WARP_SIZE; 197 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 198 | int num_warps = blockDim.x * gridDim.x / AEOLUS_WARP_SIZE; 199 | for (int i = warp_id; i < num_reqs; i += num_warps) 200 | { 201 | int ssd_id = req_id_to_ssd_id(i, num_ssds, ssd_num_reqs_prefix_sum); 202 | int req_offset = i - (ssd_id == 0 ? 
0 : ssd_num_reqs_prefix_sum[ssd_id - 1]); 203 | int queue_id = req_offset / (queue_depth - 1); 204 | int global_queue_id = ssd_id * num_queues_per_ssd + queue_id; 205 | int id_in_queue = req_offset % (queue_depth - 1); 206 | int complete_id = ssdqp[global_queue_id].num_completed + id_in_queue; 207 | int queue_pos = complete_id % queue_depth; 208 | 209 | if (lane_id == 0) 210 | { 211 | // printf("polling req %d ssd %d queue %d complete_id %d queue_pos %d num_completed %d\n", i, ssd_id, queue_id, complete_id, queue_pos, ssdqp[global_queue_id].num_completed); 212 | uint32_t current_phase = (complete_id / queue_depth) & 1; 213 | while (((ssdqp[global_queue_id].cq[queue_pos * 4 + 3] & NVME_ENTRY_PHASE_MASK) >> 16) == current_phase) 214 | ; 215 | uint32_t status = ssdqp[global_queue_id].cq[queue_pos * 4 + 3]; 216 | uint32_t cmd_id = status & NVME_ENTRY_CID_MASK; 217 | if ((status >> 17) & NVME_ENTRY_SC_MASK) 218 | { 219 | AEOLUS_LOG_ERROR("thread %d cq[%d] status: 0x%x, cid: %d\n", thread_id, queue_pos, (status >> 17) & NVME_ENTRY_SC_MASK, cmd_id); 220 | assert(0); 221 | } 222 | } 223 | } 224 | } 225 | 226 | __global__ static void preprocess_io_req_1(Request *reqs, int num_reqs, int num_ssds, int *ssd_num_reqs, uint64_t *ssd_num_lbs, int max_io_size) 227 | { 228 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 229 | int num_threads = blockDim.x * gridDim.x; 230 | for (int i = tid; i < num_reqs; i += num_threads) 231 | { 232 | int ssd_id; 233 | uint64_t start_lb; // Not used. 234 | lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb); 235 | // assert(ssd_id < num_ssds); 236 | if (ssd_id < num_ssds && ssd_id >= 0) 237 | { 238 | atomicAdd(&ssd_num_reqs[ssd_id], 1); 239 | } 240 | else 241 | { 242 | AEOLUS_LOG_ERROR("ssd_id out of bound: %d\n", ssd_id); 243 | } 244 | } 245 | } 246 | 247 | __global__ static void preprocess_io_req_2(int num_ssds, int num_queues_per_ssd, int *ssd_num_reqs, int *ssd_num_reqs_prefix_sum, int queue_depth) 248 | { 249 | for (int i = 0; i < num_ssds; i++) 250 | { 251 | // assert(ssd_num_reqs[i] <= num_queues_per_ssd * (queue_depth - 1)); 252 | if (ssd_num_reqs[i] > num_queues_per_ssd * (queue_depth - 1)) 253 | { 254 | AEOLUS_LOG_ERROR("ssd_num_reqs[%d]: %d\n", i, ssd_num_reqs[i]); 255 | } 256 | ssd_num_reqs_prefix_sum[i] = ssd_num_reqs[i]; 257 | if (i > 0) 258 | ssd_num_reqs_prefix_sum[i] += ssd_num_reqs_prefix_sum[i - 1]; 259 | } 260 | } 261 | 262 | __global__ static void distribute_io_req_1(int num_ssds, int *ssd_num_reqs_prefix_sum, int *req_ids) 263 | { 264 | for (int i = 0; i < num_ssds; i++) 265 | req_ids[i] = i ? 
ssd_num_reqs_prefix_sum[i - 1] : 0; 266 | } 267 | 268 | __global__ static void distribute_io_req_2(Request *reqs, int num_reqs, int num_ssds, Request *distributed_reqs, int *req_ids, uint64_t *ssd_num_lbs, int max_io_size) 269 | { 270 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 271 | int num_threads = blockDim.x * gridDim.x; 272 | for (int i = tid; i < num_reqs; i += num_threads) 273 | { 274 | int ssd_id; 275 | uint64_t start_lb; 276 | lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb); 277 | // assert(ssd_id < num_ssds); 278 | if (ssd_id < num_ssds && ssd_id >= 0) 279 | { 280 | int req_id = atomicAdd(&req_ids[ssd_id], 1); 281 | distributed_reqs[req_id] = reqs[i]; 282 | distributed_reqs[req_id].start_lb = start_lb; 283 | } 284 | } 285 | } 286 | 287 | __global__ static void distribute_io_req_3(int num_ssds, int *ssd_num_reqs_prefix_sum, int *req_ids) 288 | { 289 | for (int i = 0; i < num_ssds; i++) 290 | { 291 | if (req_ids[i] != ssd_num_reqs_prefix_sum[i]) 292 | { 293 | AEOLUS_LOG_ERROR("req id %d %d\n", req_ids[i], ssd_num_reqs_prefix_sum[i]); 294 | } 295 | // assert(req_ids[i] == ssd_num_reqs_prefix_sum[i]); 296 | } 297 | } 298 | 299 | void ControllerDecoupled::submit_io_req(Request *reqs, int num_reqs, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys) 300 | { 301 | if (num_reqs > AEOLUS_MAX_NUM_REQUESTS) 302 | { 303 | AEOLUS_LOG_ERROR("num_reqs %d > AEOLUS_MAX_NUM_REQUESTS %d", num_reqs, AEOLUS_MAX_NUM_REQUESTS); 304 | exit(1); 305 | } 306 | if (num_reqs > ssd_count * num_queue_per_ssd * queue_depth) 307 | { 308 | AEOLUS_LOG_ERROR("num_reqs %d > ssd_count %d * num_queue_per_ssd %d * queue_depth %d", num_reqs, ssd_count, num_queue_per_ssd, queue_depth); 309 | exit(1); 310 | } 311 | AEOLUS_CUDA_CHECK(cudaMemsetAsync(ssd_num_reqs, 0, sizeof(int) * ssd_count, stream)); 312 | int num_blocks = 32; 313 | preprocess_io_req_1<<>>(reqs, num_reqs, ssd_count, ssd_num_reqs, d_ssd_num_lbs, max_io_size); 314 | preprocess_io_req_2<<<1, 1, 0, stream>>>(ssd_count, num_queue_per_ssd, ssd_num_reqs, ssd_num_reqs_prefix_sum, queue_depth); 315 | distribute_io_req_1<<<1, 1, 0, stream>>>(ssd_count, ssd_num_reqs_prefix_sum, req_ids); 316 | distribute_io_req_2<<>>(reqs, num_reqs, ssd_count, distributed_reqs, req_ids, d_ssd_num_lbs, max_io_size); 317 | distribute_io_req_3<<<1, 1, 0, stream>>>(ssd_count, ssd_num_reqs_prefix_sum, req_ids); 318 | uint32_t opcode = NVME_OPCODE_READ; 319 | if (dir == AEOLUS_DIR_WRITE) 320 | { 321 | opcode = NVME_OPCODE_WRITE; 322 | if (buf_type == AEOLUS_BUF_USER) 323 | copy_write_data_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size); 324 | } 325 | if (d_prp_phys == nullptr) 326 | d_prp_phys = this->d_prp_phys; 327 | submit_io_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, d_prp_phys, ssd_num_reqs_prefix_sum, queue_depth, max_io_size, opcode, buf_type); 328 | ring_sq_doorbell_kernel<<>>(ssd_count, num_queue_per_ssd, d_ssdqp, ssd_num_reqs, ssd_num_reqs_prefix_sum, num_reqs, queue_depth); 329 | this->num_reqs = num_reqs; 330 | this->stream = stream; 331 | this->dir = dir; 332 | } 333 | 334 | void ControllerDecoupled::poll() 335 | { 336 | int num_blocks = 32; 337 | if (dir == AEOLUS_DIR_READ) 338 | copy_io_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size, buf_type); 339 | 
else 340 | poll_write_req_kernel<<>>(distributed_reqs, num_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, (uint64_t *)d_iobuf_ptr, ssd_num_reqs_prefix_sum, queue_depth, max_io_size); 341 | ring_cq_doorbell_kernel<<>>(ssd_count, num_queue_per_ssd, d_ssdqp, ssd_num_reqs, ssd_num_reqs_prefix_sum, num_reqs, queue_depth); 342 | } -------------------------------------------------------------------------------- /src/applications/gemm/src/controller_legacy.cu: -------------------------------------------------------------------------------- 1 | #include "controller.cuh" 2 | 3 | __device__ static void lb_to_ssd_id(uint64_t lb, int num_ssds, uint64_t *ssd_num_lbs, int max_io_size, int &ssd_id, uint64_t &start_lb) 4 | { 5 | int lbs_per_max_io_size = max_io_size / AEOLUS_LB_SIZE; 6 | assert(lb % lbs_per_max_io_size == 0); 7 | ssd_id = lb / lbs_per_max_io_size % num_ssds; 8 | start_lb = lb / lbs_per_max_io_size / num_ssds * lbs_per_max_io_size; 9 | assert(start_lb < ssd_num_lbs[ssd_id]); 10 | } 11 | 12 | __global__ static void do_read_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *IO_buf_base, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 13 | { 14 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 15 | int warp_id = thread_id / AEOLUS_WARP_SIZE; // global queue id 16 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 17 | int ssd_id = warp_id / num_warps_per_ssd; 18 | if (ssd_id >= num_ssds) 19 | return; 20 | 21 | // submit first page of double buffer 22 | assert(thread_id < num_reqs); 23 | int base_req_id = thread_id - lane_id; 24 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 25 | 26 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 27 | uint64_t io_addr; 28 | uint64_t io_addr2; 29 | if (buf_type == AEOLUS_BUF_USER) 30 | { 31 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 32 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 33 | } 34 | else 35 | { 36 | io_addr = reqs[thread_id].dest_addr; 37 | io_addr2 = reqs[thread_id].next_addr; // io_size <= 8KB 38 | global_pos = reqs[thread_id].next_addr; // io_size > 8KB 39 | } 40 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 41 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 42 | { 43 | uint64_t offset = global_pos * prp_size; 44 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 45 | } 46 | if (lane_id == 0) 47 | { 48 | // ssdqp[warp_id].cmd_id = 0; 49 | // printf("queue %d cmd_id %d\n", warp_id, ssdqp[warp_id].cmd_id); 50 | for (int i = 0; i < queue_depth; i++) 51 | ssdqp[warp_id].sq_entry_busy[i] = false; 52 | } 53 | int num_lbs = reqs[thread_id].num_items ? 
reqs[thread_id].num_items - 1 : 0; 54 | ssdqp[warp_id].fill_sq( 55 | ssdqp[warp_id].cmd_id + lane_id, // command id 56 | sq_pos, // position in SQ 57 | NVME_OPCODE_READ, // opcode 58 | io_addr, // prp1 59 | io_addr2, // prp2 60 | reqs[thread_id].start_lb & 0xffffffff, // start lb low 61 | (reqs[thread_id].start_lb >> 32) & 0xffffffff, // start lb high 62 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 63 | thread_id // req id 64 | ); 65 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, thread_id, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 66 | 67 | __threadfence_system(); 68 | // __syncwarp(); 69 | if (lane_id == 0) 70 | { 71 | ssdqp[warp_id].cmd_id += AEOLUS_WARP_SIZE; 72 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + AEOLUS_WARP_SIZE) % queue_depth; 73 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 74 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 75 | } 76 | 77 | int stride = num_ssds * num_warps_per_ssd * AEOLUS_WARP_SIZE; 78 | for (int i = thread_id + stride; i < num_reqs + stride; i += stride) 79 | { 80 | int prev_sq_tail = ssdqp[warp_id].sq_tail; 81 | base_req_id = i - lane_id; // first req_id in warp 82 | if (i < num_reqs) 83 | { 84 | // submit second page of double buffer 85 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 86 | 87 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 88 | uint64_t io_addr; 89 | uint64_t io_addr2; 90 | if (buf_type == AEOLUS_BUF_USER) 91 | { 92 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 93 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 94 | } 95 | else 96 | { 97 | io_addr = reqs[i].dest_addr; 98 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 99 | global_pos = reqs[i].next_addr; // io_size > 8KB 100 | } 101 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 102 | { 103 | uint64_t offset = global_pos * prp_size; 104 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 105 | } 106 | assert(ssdqp[warp_id].sq_entry_busy[sq_pos] == false); 107 | // if (i >= stride * 4 && !req_processed[i - stride * 4]) 108 | // { 109 | // printf("thread %d req_id %d not processed\n", thread_id, i - stride * 4); 110 | // for (int i = 0; i < ssdqp[warp_id].cmd_id; i++) 111 | // { 112 | // int req_id = ssdqp[warp_id].cmd_id_to_req_id[i]; 113 | // int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[i]; 114 | // if (req_id != 0xffffffff) 115 | // printf("thread %d cmd_id %d req_id %d processed %d sq_pos %d busy %d\n", thread_id, i, req_id, req_processed[req_id], sq_pos, ssdqp[warp_id].sq_entry_busy[sq_pos]); 116 | // } 117 | // assert(0); 118 | // } 119 | int num_lbs = reqs[i].num_items ? 
reqs[i].num_items - 1 : 0; 120 | ssdqp[warp_id].fill_sq( 121 | ssdqp[warp_id].cmd_id + lane_id, // command id 122 | sq_pos, // position in SQ 123 | NVME_OPCODE_READ, // opcode 124 | io_addr, // prp1 125 | io_addr2, // prp2 126 | reqs[i].start_lb & 0xffffffff, // start lb low 127 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 128 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 129 | i // req id 130 | ); 131 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, i, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 132 | 133 | __threadfence_system(); 134 | // __syncwarp(); 135 | if (lane_id == 0) 136 | { 137 | int cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 138 | assert(cnt == AEOLUS_WARP_SIZE); 139 | ssdqp[warp_id].cmd_id += cnt; 140 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + cnt) % queue_depth; 141 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 142 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 143 | } 144 | } 145 | 146 | // poll and copy the *previous* page of double buffer 147 | int prev_cq_head = ssdqp[warp_id].cq_head; 148 | if (lane_id == 0) 149 | { 150 | uint32_t code; 151 | ssdqp[warp_id].poll_range(code, prev_sq_tail, i < num_reqs); 152 | assert(code == 0); 153 | if (i + stride < num_reqs) 154 | { 155 | base_req_id += stride; 156 | int next_cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 157 | for (int j = 0; j < next_cnt; j++) 158 | { 159 | int sq_pos = (ssdqp[warp_id].sq_tail + j) % queue_depth; 160 | if (ssdqp[warp_id].sq_entry_busy[sq_pos]) 161 | { 162 | ssdqp[warp_id].poll_until_sq_entry_free(code, sq_pos); 163 | assert(code == 0); 164 | } 165 | } 166 | } 167 | } 168 | 169 | if (buf_type == AEOLUS_BUF_USER) 170 | { 171 | // copy data from IO buffer to app buffer 172 | for (int j = prev_cq_head; j != ssdqp[warp_id].cq_head; j = (j + 1) % queue_depth) 173 | { 174 | int cmd_id = (ssdqp[warp_id].cq[j * 4 + 3] & NVME_ENTRY_CID_MASK) % queue_depth; 175 | int req_id = ssdqp[warp_id].cmd_id_to_req_id[cmd_id]; 176 | int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[cmd_id]; 177 | for (int k = lane_id; k < reqs[req_id].num_items * AEOLUS_LB_SIZE / 8; k += AEOLUS_WARP_SIZE) 178 | ((uint64_t *)reqs[req_id].dest_addr)[k] = IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + k]; 179 | } 180 | } 181 | } 182 | } 183 | 184 | __global__ static void do_write_req_kernel(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, IoQueuePair *ssdqp, uint64_t *prp1, uint64_t *IO_buf_base, uint64_t *prp2, int queue_depth, int max_io_size, aeolus_buf_type buf_type) 185 | { 186 | int thread_id = blockIdx.x * blockDim.x + threadIdx.x; 187 | int warp_id = thread_id / AEOLUS_WARP_SIZE; // global queue id 188 | int lane_id = thread_id % AEOLUS_WARP_SIZE; 189 | int ssd_id = warp_id / num_warps_per_ssd; 190 | if (ssd_id >= num_ssds) 191 | return; 192 | 193 | // submit first page of double buffer 194 | assert(thread_id < num_reqs); 195 | int base_req_id = thread_id - lane_id; 196 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 197 | 198 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 199 | uint64_t io_addr; 200 | uint64_t io_addr2; 201 | if (buf_type == AEOLUS_BUF_USER) 202 | { 203 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
204 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 205 | } 206 | else 207 | { 208 | io_addr = reqs[thread_id].dest_addr; 209 | io_addr2 = reqs[thread_id].next_addr; // io_size <= 8KB 210 | global_pos = reqs[thread_id].next_addr; // io_size > 8KB 211 | } 212 | int prp_size = max_io_size / AEOLUS_HOST_PGSIZE * sizeof(uint64_t); // PRP list size of a request 213 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 214 | { 215 | uint64_t offset = global_pos * prp_size; 216 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 217 | } 218 | if (lane_id == 0) 219 | { 220 | // ssdqp[warp_id].cmd_id = 0; 221 | // printf("queue %d cmd_id %d\n", warp_id, ssdqp[warp_id].cmd_id); 222 | for (int i = 0; i < queue_depth; i++) 223 | ssdqp[warp_id].sq_entry_busy[i] = false; 224 | } 225 | int num_lbs = reqs[thread_id].num_items ? reqs[thread_id].num_items - 1 : 0; 226 | ssdqp[warp_id].fill_sq( 227 | ssdqp[warp_id].cmd_id + lane_id, // command id 228 | sq_pos, // position in SQ 229 | NVME_OPCODE_WRITE, // opcode 230 | io_addr, // prp1 231 | io_addr2, // prp2 232 | reqs[thread_id].start_lb & 0xffffffff, // start lb low 233 | (reqs[thread_id].start_lb >> 32) & 0xffffffff, // start lb high 234 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 235 | thread_id // req id 236 | ); 237 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, thread_id, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 238 | 239 | if (buf_type == AEOLUS_BUF_USER) 240 | { 241 | for (int i = base_req_id; i < base_req_id + AEOLUS_WARP_SIZE; i++) 242 | for (int j = lane_id; j < reqs[i].num_items * AEOLUS_LB_SIZE / 8; j += AEOLUS_WARP_SIZE) 243 | { 244 | int sq_pos = (ssdqp[warp_id].sq_tail + i - base_req_id) % queue_depth; 245 | IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + j] = ((uint64_t *)reqs[i].dest_addr)[j]; 246 | } 247 | } 248 | 249 | __threadfence_system(); 250 | // __syncwarp(); 251 | if (lane_id == 0) 252 | { 253 | ssdqp[warp_id].cmd_id += AEOLUS_WARP_SIZE; 254 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + AEOLUS_WARP_SIZE) % queue_depth; 255 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 256 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 257 | } 258 | 259 | int stride = num_ssds * num_warps_per_ssd * AEOLUS_WARP_SIZE; 260 | for (int i = thread_id + stride; i < num_reqs + stride; i += stride) 261 | { 262 | int prev_sq_tail = ssdqp[warp_id].sq_tail; 263 | base_req_id = i - lane_id; // first req_id in warp 264 | if (i < num_reqs) 265 | { 266 | // submit second page of double buffer 267 | int sq_pos = (ssdqp[warp_id].sq_tail + lane_id) % queue_depth; 268 | 269 | uint64_t global_pos = (uint64_t)warp_id * queue_depth + sq_pos; 270 | uint64_t io_addr; 271 | uint64_t io_addr2; 272 | if (buf_type == AEOLUS_BUF_USER) 273 | { 274 | io_addr = prp1[0] + global_pos * max_io_size; // assume contiguous! 
275 | io_addr2 = io_addr / AEOLUS_HOST_PGSIZE * AEOLUS_HOST_PGSIZE + AEOLUS_HOST_PGSIZE; 276 | } 277 | else 278 | { 279 | io_addr = reqs[i].dest_addr; 280 | io_addr2 = reqs[i].next_addr; // io_size <= 8KB 281 | global_pos = reqs[i].next_addr; // io_size > 8KB 282 | } 283 | if (reqs[thread_id].num_items * AEOLUS_LB_SIZE > AEOLUS_HOST_PGSIZE * 2) 284 | { 285 | uint64_t offset = global_pos * prp_size; 286 | io_addr2 = prp2[offset / AEOLUS_HOST_PGSIZE] + offset % AEOLUS_HOST_PGSIZE; 287 | } 288 | assert(ssdqp[warp_id].sq_entry_busy[sq_pos] == false); 289 | // if (i >= stride * 4 && !req_processed[i - stride * 4]) 290 | // { 291 | // printf("thread %d req_id %d not processed\n", thread_id, i - stride * 4); 292 | // for (int i = 0; i < ssdqp[warp_id].cmd_id; i++) 293 | // { 294 | // int req_id = ssdqp[warp_id].cmd_id_to_req_id[i]; 295 | // int sq_pos = ssdqp[warp_id].cmd_id_to_sq_pos[i]; 296 | // if (req_id != 0xffffffff) 297 | // printf("thread %d cmd_id %d req_id %d processed %d sq_pos %d busy %d\n", thread_id, i, req_id, req_processed[req_id], sq_pos, ssdqp[warp_id].sq_entry_busy[sq_pos]); 298 | // } 299 | // assert(0); 300 | // } 301 | int num_lbs = reqs[i].num_items ? reqs[i].num_items - 1 : 0; 302 | ssdqp[warp_id].fill_sq( 303 | ssdqp[warp_id].cmd_id + lane_id, // command id 304 | sq_pos, // position in SQ 305 | NVME_OPCODE_WRITE, // opcode 306 | io_addr, // prp1 307 | io_addr2, // prp2 308 | reqs[i].start_lb & 0xffffffff, // start lb low 309 | (reqs[i].start_lb >> 32) & 0xffffffff, // start lb high 310 | NVME_RW_LIMITED_RETRY_MASK | num_lbs, // number of LBs 311 | i // req id 312 | ); 313 | // printf("thread %d req_id %d cmd_id %d num_completed %d sq_pos %d\n", thread_id, i, ssdqp[warp_id].cmd_id + lane_id, ssdqp[warp_id].num_completed, sq_pos); 314 | 315 | if (buf_type == AEOLUS_BUF_USER) 316 | { 317 | for (int j = base_req_id; j < base_req_id + AEOLUS_WARP_SIZE; j++) 318 | for (int k = lane_id; k < reqs[j].num_items * AEOLUS_LB_SIZE / 8; k += AEOLUS_WARP_SIZE) 319 | { 320 | int sq_pos = (ssdqp[warp_id].sq_tail + j - base_req_id) % queue_depth; 321 | IO_buf_base[(uint64_t)warp_id * queue_depth * max_io_size / 8 + sq_pos * max_io_size / 8 + k] = ((uint64_t *)reqs[j].dest_addr)[k]; 322 | } 323 | } 324 | 325 | __threadfence_system(); 326 | // __syncwarp(); 327 | if (lane_id == 0) 328 | { 329 | int cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? num_reqs - base_req_id : AEOLUS_WARP_SIZE; 330 | assert(cnt == AEOLUS_WARP_SIZE); 331 | ssdqp[warp_id].cmd_id += cnt; 332 | ssdqp[warp_id].sq_tail = (ssdqp[warp_id].sq_tail + cnt) % queue_depth; 333 | // printf("Warp %d, sq_tail is %p, set sqtdbl to %d\n", warp_id, ssdqp[warp_id].sqtdbl, ssdqp[warp_id].sq_tail); 334 | *ssdqp[warp_id].sqtdbl = ssdqp[warp_id].sq_tail; 335 | } 336 | } 337 | 338 | // poll and copy the *previous* page of double buffer 339 | if (lane_id == 0) 340 | { 341 | uint32_t code; 342 | ssdqp[warp_id].poll_range(code, prev_sq_tail, i < num_reqs); 343 | assert(code == 0); 344 | if (i + stride < num_reqs) 345 | { 346 | base_req_id += stride; 347 | int next_cnt = num_reqs - base_req_id < AEOLUS_WARP_SIZE ? 
362 | __global__ static void preprocess_io_req_1(Request *reqs, int num_reqs, int num_ssds, int *ssd_num_reqs, uint64_t *ssd_num_lbs, int max_io_size)
363 | {
364 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
365 |     int num_threads = blockDim.x * gridDim.x;
366 |     for (int i = tid; i < num_reqs; i += num_threads)
367 |     {
368 |         int ssd_id;
369 |         uint64_t start_lb; // Not used.
370 |         lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb);
371 |         assert(ssd_id < num_ssds);
372 |         atomicAdd(&ssd_num_reqs[ssd_id], 1);
373 |     }
374 | }
375 |
376 | __global__ static void preprocess_io_req_2(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, int *ssd_num_reqs, int *num_distributed_reqs)
377 | {
378 |     int max_bucket = 0;
379 |     for (int i = 0; i < num_ssds; i++)
380 |         if (ssd_num_reqs[i] > max_bucket)
381 |             max_bucket = ssd_num_reqs[i];
382 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
383 |     max_bucket = (max_bucket + num_reqs_per_chunk - 1) / num_reqs_per_chunk * num_reqs_per_chunk;
384 |     *num_distributed_reqs = max_bucket * num_ssds;
385 | }
386 |
387 | __global__ static void distribute_io_req_1(int num_ssds, int num_warps_per_ssd, int *req_ids)
388 | {
389 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
390 |     for (int i = 0; i < num_ssds; i++)
391 |         req_ids[i] = i * num_reqs_per_chunk;
392 | }
393 |
394 | __global__ static void distribute_io_req_2(Request *reqs, int num_reqs, int num_ssds, int num_warps_per_ssd, Request *distributed_reqs, int *req_ids, uint64_t *ssd_num_lbs, int max_io_size)
395 | {
396 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
397 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
398 |     int num_threads = blockDim.x * gridDim.x;
399 |     for (int i = tid; i < num_reqs; i += num_threads)
400 |     {
401 |         int ssd_id;
402 |         uint64_t start_lb;
403 |         lb_to_ssd_id(reqs[i].start_lb, num_ssds, ssd_num_lbs, max_io_size, ssd_id, start_lb);
404 |         assert(ssd_id < num_ssds);
405 |         int req_id;
406 |         for (;;)
407 |         {
408 |             req_id = req_ids[ssd_id];
409 |             int next_req_id = req_id + 1;
410 |             if (next_req_id % num_reqs_per_chunk == 0)
411 |                 next_req_id += num_reqs_per_chunk * (num_ssds - 1);
412 |             if (atomicCAS(&req_ids[ssd_id], req_id, next_req_id) == req_id)
413 |                 break;
414 |         }
415 |         distributed_reqs[req_id] = reqs[i];
416 |         distributed_reqs[req_id].start_lb = start_lb;
417 |     }
418 | }
419 |
420 | __global__ static void distribute_io_req_3(int num_ssds, int num_warps_per_ssd, Request *distributed_reqs, int *req_ids, int *num_distributed_reqs)
421 | {
422 |     int num_reqs_per_chunk = num_warps_per_ssd * AEOLUS_WARP_SIZE;
423 |     int tid = threadIdx.x + blockIdx.x * blockDim.x;
424 |     int num_threads = blockDim.x * gridDim.x;
425 |     for (int i = tid; i < num_ssds; i += num_threads)
426 |         for (int j = req_ids[i]; j < *num_distributed_reqs;)
427 |         {
428 |             distributed_reqs[j].num_items = 0;
429 |             distributed_reqs[j++].start_lb = 0;
430 |             if (j % num_reqs_per_chunk == 0)
431 |                 j += num_reqs_per_chunk * (num_ssds - 1);
432 |         }
433 | }
434 |
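// Host-side entry point for the legacy controller: resets the per-SSD request
// counters, runs the distribution kernels above to build the padded request
// array, then launches do_read_req_kernel or do_write_req_kernel with one warp
// per SSD queue. If the caller passes d_prp_phys == nullptr, the controller's
// own PRP table is used.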
435 | void ControllerLegacy::submit_io_req(Request *req, int num_req, aeolus_access_dir dir, cudaStream_t stream, uint64_t* d_prp_phys) {
436 |     if (num_req > AEOLUS_MAX_NUM_REQUESTS)
437 |     {
438 |         AEOLUS_LOG_ERROR("num_reqs %d > AEOLUS_MAX_NUM_REQUESTS %d", num_req, AEOLUS_MAX_NUM_REQUESTS);
439 |         return;
440 |     }
441 |     AEOLUS_CUDA_CHECK(cudaMemsetAsync(ssd_num_reqs, 0, sizeof(int) * ssd_count, stream));
442 |     int num_blocks = 8;
443 |     preprocess_io_req_1<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(req, num_req, ssd_count, ssd_num_reqs, d_ssd_num_lbs, max_io_size);
444 |     int *num_distributed_reqs;
445 |     AEOLUS_CUDA_CHECK(cudaMalloc(&num_distributed_reqs, sizeof(int)));
446 |     preprocess_io_req_2<<<1, 1, 0, stream>>>(req, num_req, ssd_count, num_queue_per_ssd, ssd_num_reqs, num_distributed_reqs);
447 |     distribute_io_req_1<<<1, 1, 0, stream>>>(ssd_count, num_queue_per_ssd, req_ids);
448 |     distribute_io_req_2<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(req, num_req, ssd_count, num_queue_per_ssd, distributed_reqs, req_ids, d_ssd_num_lbs, max_io_size);
449 |     distribute_io_req_3<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(ssd_count, num_queue_per_ssd, distributed_reqs, req_ids, num_distributed_reqs);
450 |
451 |     int h_num_distributed_reqs;
452 |     AEOLUS_CUDA_CHECK(cudaMemcpy(&h_num_distributed_reqs, num_distributed_reqs, sizeof(int), cudaMemcpyDeviceToHost));
453 |     int num_threads = ssd_count * num_queue_per_ssd * AEOLUS_WARP_SIZE;
454 |     num_blocks = CEIL(num_threads, AEOLUS_NUM_THREADS_PER_BLOCK);
455 |     if (d_prp_phys == nullptr)
456 |         d_prp_phys = this->d_prp_phys;
457 |     if (dir == AEOLUS_DIR_READ)
458 |         do_read_req_kernel<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(distributed_reqs, h_num_distributed_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, (uint64_t *)d_iobuf_ptr, d_prp_phys, queue_depth, max_io_size, buf_type);
459 |     else
460 |         do_write_req_kernel<<<num_blocks, AEOLUS_NUM_THREADS_PER_BLOCK, 0, stream>>>(distributed_reqs, h_num_distributed_reqs, ssd_count, num_queue_per_ssd, d_ssdqp, d_iobuf_phys, (uint64_t *)d_iobuf_ptr, d_prp_phys, queue_depth, max_io_size, buf_type);
461 |     AEOLUS_CUDA_CHECK(cudaFree(num_distributed_reqs));
462 | }
--------------------------------------------------------------------------------