├── mica_pmem ├── .gitignore ├── Makefile ├── run.sh ├── test.cc └── bench.cc ├── scripts ├── .gitignore ├── clean.sh ├── mlx_env.sh ├── reconfigure_fsdax_to_devdax.sh ├── utils.sh └── ipmctl_watch.sh ├── README.md ├── nvme_perf ├── c │ ├── .gitignore │ ├── Makefile │ └── hello.c └── latency.sh ├── rdma ├── rdma-write-bw │ ├── .gitignore │ ├── kill.sh │ ├── config │ ├── README.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ └── main.cc ├── rw-tput-receiver │ ├── .gitignore │ ├── kill.sh │ ├── config │ ├── notes.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ └── main.cc ├── rdma-write-flush-lat │ ├── .gitignore │ ├── kill.sh │ ├── README.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ ├── latency.h │ └── main.cc └── libhrd_cpp │ └── hrd.h ├── hopscotch_pmem ├── .gitignore ├── Makefile ├── run.sh ├── LICENSE ├── test.cc └── bench.cc ├── hog ├── hog ├── Makefile ├── run.sh └── hog.cc ├── .clang-format ├── log_store ├── Makefile ├── run.sh ├── rotating_counter.h └── bench.cc ├── ioat ├── .clang-format ├── Makefile ├── setup_dpdk.sh ├── run.sh ├── virt2phy.h ├── bench.cc └── huge_alloc.h ├── cacheline_versions ├── Makefile ├── sweep.sh ├── run.sh └── bench.cc ├── microbench ├── read_latency │ ├── Makefile │ ├── run.sh │ └── bench.cc ├── write_latency │ ├── Makefile │ ├── run.sh │ └── bench.cc ├── Makefile ├── run.sh ├── README ├── seq_read_tput.h ├── rand_write_tput.h ├── bench.h ├── rand_read_tput.h ├── seq_write_tput.h ├── rand_write_latency.h ├── rand_read_latency.h ├── seq_write_latency.h └── bench.cc ├── pmemkv_perf ├── run.sh └── bench.cc ├── circular_writes_tput ├── dram_only │ ├── Makefile │ ├── sweep.sh │ ├── run.sh │ └── bench.cc ├── Makefile ├── sweep.sh ├── run.sh └── bench.cc ├── randomizer ├── Makefile └── main.cc ├── .gitignore ├── utils ├── hdr_histogram_wrapper.h └── timer.h ├── .ycm_extra_conf.py └── common.h /mica_pmem/.gitignore: 
-------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | watch_out 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Persistent memory benchmarks 2 | -------------------------------------------------------------------------------- /nvme_perf/c/.gitignore: -------------------------------------------------------------------------------- 1 | perf 2 | hello 3 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/.gitignore: -------------------------------------------------------------------------------- 1 | write-bw 2 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/.gitignore: -------------------------------------------------------------------------------- 1 | main 2 | -------------------------------------------------------------------------------- /hopscotch_pmem/.gitignore: -------------------------------------------------------------------------------- 1 | test 2 | bench 3 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/.gitignore: -------------------------------------------------------------------------------- 1 | write-flush 2 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /hog/hog: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anujkaliaiitd/pmem-bench/HEAD/hog/hog -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | AllowShortCaseLabelsOnASingleLine: true 3 | -------------------------------------------------------------------------------- /hog/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o hog hog.cc -lpmem -march=native 3 | clean: 4 | rm hog 5 | -------------------------------------------------------------------------------- /hog/run.sh: -------------------------------------------------------------------------------- 1 | make 2 | rm -f /tmp/hogout 3 | sudo -E taskset -c 23 ./hog > /tmp/hogout & 4 | -------------------------------------------------------------------------------- /log_store/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/config: -------------------------------------------------------------------------------- 1 | --min_write_size 1024 2 | --max_write_size 131072 3 | --window_size 32 4 | -------------------------------------------------------------------------------- /ioat/.clang-format: 
-------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | AllowShortCaseLabelsOnASingleLine: true 3 | SortIncludes: false 4 | -------------------------------------------------------------------------------- /cacheline_versions/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native -lgflags 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /microbench/read_latency/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -g -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /microbench/write_latency/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -g -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /pmemkv_perf/run.sh: -------------------------------------------------------------------------------- 1 | pmempool rm --verbose /dev/dax0.0 2 | pmempool create --layout pmemkv obj /dev/dax0.0 3 | ./bench 4 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /circular_writes_tput/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native -lgflags -lpthread -lnuma 3 | clean: 4 | rm main 5 | 
-------------------------------------------------------------------------------- /rdma/rw-tput-receiver/config: -------------------------------------------------------------------------------- 1 | --num_client_processes 2 2 | --num_threads_per_client 4 3 | --use_uc 0 4 | --size 64 5 | --postlist 4 6 | --do_read 0 7 | -------------------------------------------------------------------------------- /randomizer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o main main.cc -lpmem -lhdr_histogram_static -lpthread -lgflags -march=native -lnuma -lcityhash 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /scripts/clean.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Clean temporary files generated by CMake for eRPC 3 | rm -rf build CMakeFiles CMakeCache.txt cmake_install.cmake CTestTestfile.cmake Makefile Testing 4 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/notes.md: -------------------------------------------------------------------------------- 1 | # Oct 27, with commit 32b029c: 2 | * One Optane DIMM at server, DDIO disabled 3 | * 64-byte random writes from 8 clients spread over two machines get 9 M/s 4 | total 5 | -------------------------------------------------------------------------------- /hopscotch_pmem/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -g -o test test.cc -lcityhash -lgtest -lpmem 3 | g++ -g -O3 -DNDEBUG bench.cc -o bench -lpmem -lcityhash -lpthread -lgtest -lnuma -lgflags -march=native 4 | clean: 5 | rm test bench 6 | -------------------------------------------------------------------------------- /mica_pmem/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -g test.cc -o test -lpmem 
-lcityhash -lpthread -lgtest -lnuma 3 | g++ -g -O3 -DNDEBUG bench.cc -o bench -lpmem -lcityhash -lpthread -lgtest -lnuma -lgflags -march=native 4 | clean: 5 | rm test 6 | -------------------------------------------------------------------------------- /scripts/mlx_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Config for Mellanox userspace driver 3 | 4 | export MLX4_SINGLE_THREADED=1 5 | export MLX5_SINGLE_THREADED=1 6 | export MLX_QP_ALLOC_TYPE="HUGE" 7 | export MLX_CQ_ALLOC_TYPE="HUGE" 8 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/sweep.sh: -------------------------------------------------------------------------------- 1 | for num_counters in `seq 1 32`; do 2 | make 1>/dev/null 2>/dev/null 3 | t=`/usr/bin/time -f "%e" numactl --physcpubind=3 --membind=0 ./bench $num_counters` 4 | 5 | #echo "$kNumCounters;$t" 6 | done 7 | -------------------------------------------------------------------------------- /nvme_perf/c/Makefile: -------------------------------------------------------------------------------- 1 | # On the Intel AEP servers, SPDK is installed at system-level, but DPDK isn't. 
2 | all: 3 | gcc -o hello hello.c -L /home/akalia/sandbox/spdk/dpdk/build/lib \ 4 | -lspdk_nvme -lspdk_util -lspdk_env_dpdk -lspdk_log \ 5 | -lpthread -ldpdk -lnuma -ldl -luuid 6 | -------------------------------------------------------------------------------- /circular_writes_tput/sweep.sh: -------------------------------------------------------------------------------- 1 | for num_counters in 1 2 3 4 5 8 16; do 2 | for stride_size in 64 256; do 3 | rm config.h 4 | touch config.h 5 | 6 | sudo -E env numactl --physcpubind=3 --membind=0 ./bench \ 7 | --num_counters=$num_counters \ 8 | --stride_size=$stride_size 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /microbench/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=bench.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o bench ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm bench 10 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/README.md: -------------------------------------------------------------------------------- 1 | Latency to flush a write from remote NIC cache 2 | 3 | DRAM writes: 4 | * 16-byte inline WRITE latency = 1.3 us (median and 99th percentile) 5 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.6 us 50%, 2.9 us 99% 6 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.4 us 50%, 2.6 us 99% 7 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/README.md: -------------------------------------------------------------------------------- 1 | Latency to flush a write from remote NIC cache 2 | 3 | DRAM writes: 4 | * 16-byte inline WRITE latency = 1.3 us (median and 
99th percentile) 5 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.6 us 50%, 2.9 us 99% 6 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.4 us 50%, 2.6 us 99% 7 | -------------------------------------------------------------------------------- /cacheline_versions/sweep.sh: -------------------------------------------------------------------------------- 1 | for kNumCounters in 1 2 3 4 5 8 16; do 2 | for kStrideSize in 256 4096; do 3 | rm config.h 4 | touch config.h 5 | 6 | echo "static constexpr size_t kNumCounters = $kNumCounters;" >> config.h 7 | echo "static constexpr size_t kStrideSize = $kStrideSize;" >> config.h 8 | 9 | make 10 | ./run.sh 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o write-bw ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-bw 10 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o main ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-bw 10 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/Makefile: 
-------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o write-flush ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-flush 10 | -------------------------------------------------------------------------------- /circular_writes_tput/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | if [ "$#" -gt 1 ]; then 5 | blue "Illegal number of arguments." 6 | blue "Usage: ./run.sh, or ./run.sh gdb" 7 | exit 8 | fi 9 | 10 | # Check for non-gdb mode 11 | if [ "$#" -eq 0 ]; then 12 | sudo -E env numactl --physcpubind=3 --membind=0 $exe 13 | fi 14 | 15 | # Check for gdb mode 16 | if [ "$#" -eq 1 ]; then 17 | gdb -ex run --args $exe --num_threads=$num_threads 18 | fi 19 | -------------------------------------------------------------------------------- /ioat/Makefile: -------------------------------------------------------------------------------- 1 | DPDK_HOME=/usr 2 | 3 | CFLAGS=-Wall -Wextra -Werror -pedantic -fpermissive -march=native \ 4 | -Wold-style-cast -Wsign-conversion \ 5 | -Wno-unused-function 6 | 7 | all: 8 | g++ -O3 -std=c++11 ${CFLAGS} -o bench bench.cc -isystem ${DPDK_HOME}/include/dpdk/ -march=native -L ${DPDK_HOME}/lib/ \ 9 | -Wl,--whole-archive \ 10 | -ldpdk -lnuma -lpthread -ldl -lm -lgflags -lpmem \ 11 | -Wl,--no-whole-archive \ 12 | 13 | clean: 14 | rm bench 15 | -------------------------------------------------------------------------------- /log_store/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of 
arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/read_latency/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /cacheline_versions/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | sudo -E numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 
8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | sudo -E numactl --cpunodebind=0 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | sudo -E gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/write_latency/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /scripts/reconfigure_fsdax_to_devdax.sh: -------------------------------------------------------------------------------- 1 | # Destroy fsdax namespaces 2 | 3 | # This can fail even when /mnt/pmem0 and /mnt/pmem1 are empty. Rebooting solves it. 4 | echo "Unmounting" 5 | sudo umount /mnt/pmem0 6 | sudo umount /mnt/pmem1 7 | 8 | echo "Destroying namespaces via ndctl. This takes a while." 9 | sudo ndctl destroy-namespace -f all 10 | 11 | echo "Recreating devdax namespaces" 12 | sudo ndctl create-namespace --mode devdax --region 0 13 | sudo ndctl create-namespace --mode devdax --region 1 14 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | if [ "$#" -gt 2 ]; then 5 | blue "Illegal number of arguments." 
6 | blue "Usage: ./run.sh , or ./run.sh " 7 | exit 8 | fi 9 | 10 | num_counters=$1 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 1 ]; then 14 | sudo numactl --physcpubind=3 --membind=0 $exe $num_counters 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 2 ]; then 19 | gdb -ex run --args $exe $num_counters 20 | fi 21 | -------------------------------------------------------------------------------- /ioat/setup_dpdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | dpdk=~/sandbox/dpdk-19.08/ 3 | 4 | sudo modprobe uio 5 | sudo insmod $dpdk/x86_64-native-linux-gcc/kmod/igb_uio.ko 6 | 7 | # Create hugepage mount 8 | sudo mkdir -p /mnt/huge 9 | grep -s /mnt/huge /proc/mounts > /dev/null 10 | 11 | if [ $? -ne 0 ] ; then 12 | sudo mount -t hugetlbfs nodev /mnt/huge 13 | fi 14 | 15 | # Bind IOAT devices on NUMA node 0, choose igb_uio (userspace) or ioatdma (kernel) 16 | for i in `seq 0 7`; do 17 | sudo ${dpdk}/usertools/dpdk-devbind.py -b igb_uio 0000:00:04.$i 18 | done 19 | 20 | # Bind IOAT devices on NUMA node 1, choose igb_uio (userspace) or ioatdma (kernel) 21 | for i in `seq 0 7`; do 22 | sudo ${dpdk}/usertools/dpdk-devbind.py -b ioatdma 0000:80:04.$i 23 | done 24 | -------------------------------------------------------------------------------- /scripts/utils.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Utilities for other scripts 3 | 4 | # Echo in blue color 5 | function blue() { 6 | es=`tput setaf 4` 7 | ee=`tput sgr0` 8 | echo "${es}$1${ee}" 9 | } 10 | 11 | # Drop all SHM 12 | function drop_shm() { 13 | for i in $(ipcs -m | awk '{ print $2; }'); do 14 | sudo ipcrm -m $i 2>/dev/null 15 | done 16 | } 17 | 18 | # Check if an environment variable is set. If it is not, exit. 19 | function check_env() { 20 | if [ -z "$1" ]; then 21 | echo "utils: Environment variable $1 not set. Exiting." 
22 | exit 23 | fi 24 | } 25 | 26 | # Check if a file ($1) exists. If it does not, exit. 27 | function assert_file_exists() { 28 | if [ ! -f $1 ]; then 29 | echo "utils: File $1 not found! Exiting." 30 | exit 0 31 | fi 32 | } 33 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define clwb(addr) \ 7 | asm volatile(".byte 0x66; xsaveopt %0" : "+m"(*(volatile char *)(addr))); 8 | 9 | int main(int argc, char **argv) { 10 | if (argc <= 1) { 11 | printf("Usage ./bench [num_counters]\n"); 12 | exit(0); 13 | } 14 | 15 | size_t num_counters = static_cast(atoi(argv[1])); 16 | uint8_t *buf = reinterpret_cast(memalign(num_counters * 64, 4096)); 17 | 18 | size_t data = 0; 19 | for (size_t i = 0; i < 10000000; i++) { 20 | size_t buf_offset = (i % num_counters) * 64; 21 | buf[buf_offset] = data++; 22 | clwb(&buf[buf_offset]); 23 | asm volatile("sfence" ::: "memory"); 24 | } 25 | 26 | free(buf); 27 | } 28 | -------------------------------------------------------------------------------- /microbench/README: -------------------------------------------------------------------------------- 1 | Persistent Memory Development Kit 2 | 3 | This is examples/libpmem/README. 4 | 5 | This directory contains examples for libpmem, the library containing 6 | low-level persistent memory support. A detailed explanation of these 7 | examples can be found here: http://pmem.io/pmdk/libpmem 8 | 9 | manpage.c is the example used in the libpmem man page. 10 | 11 | simple_copy.c is a simple pmem_memcpy() example. 12 | 13 | full_copy.c shows how to use pmem_memcpy_nodrain(). 
14 | 15 | To build these examples: 16 | make 17 | 18 | These examples can be built against an installed system using: 19 | make LIBDIR=/usr/lib INCDIR=/usr/include 20 | 21 | If you're looking for documentation to get you started using PMDK, 22 | start here: http://pmem.io/pmdk and follow the links to examples and 23 | man pages. Developers new to PMDK are probably looking for libpmemobj. 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bench 2 | # Object files 3 | *.o 4 | *.ko 5 | *.obj 6 | *.elf 7 | main 8 | tags 9 | 1 10 | 11 | # Precompiled Headers 12 | *.gch 13 | *.pch 14 | 15 | # Libraries 16 | *.lib 17 | *.a 18 | *.la 19 | *.lo 20 | 21 | # Shared objects (inc. Windows DLLs) 22 | *.dll 23 | *.so 24 | *.so.* 25 | *.dylib 26 | 27 | # Executables 28 | *.exe 29 | *.out 30 | *.app 31 | *.i*86 32 | *.x86_64 33 | *.hex 34 | 35 | # Debug files 36 | *.dSYM/ 37 | 38 | # Apt NFS files 39 | .nfs000* 40 | 41 | # CMake 42 | build 43 | CMakeCache.txt 44 | CMakeFiles 45 | cmake_install.cmake 46 | CMakeScripts 47 | Testing 48 | CTestTestfile.cmake 49 | 50 | # gdb 51 | .gdb_history 52 | 53 | # Mac 54 | .DS_Store 55 | 56 | # Doxygen 57 | html 58 | latex 59 | 60 | # Common temp files 61 | ibnet_out 62 | sweep_out 63 | scripts/autorun_process_file 64 | scripts/autorun_app_file 65 | gdb_history 66 | *.swp 67 | src/config.h 68 | .ycm_extra_conf.py 69 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | drop_shm 9 | 
exe="./write-flush" 10 | chmod +x $exe 11 | 12 | blue "Reset server QP registry" 13 | sudo pkill memcached 14 | 15 | # Spawn memcached, but wait for it to start 16 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 17 | while ! nc -z localhost 11211; do sleep .1; done 18 | echo "Server: memcached server is open for business on port 11211" 19 | 20 | # Check for non-gdb mode 21 | if [ "$#" -eq 0 ]; then 22 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 0 23 | fi 24 | 25 | # Check for gdb mode 26 | if [ "$#" -eq 1 ]; then 27 | sudo -E gdb -ex run --args $exe --is_client 0 28 | fi 29 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | blue "Dropping SHM regions" 9 | drop_shm 10 | exe="./main" 11 | chmod +x $exe 12 | 13 | blue "Reset server QP registry" 14 | sudo pkill memcached 15 | 16 | # Spawn memcached, but wait for it to start 17 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 18 | while ! 
nc -z localhost 11211; do sleep .1; done 19 | echo "Server: memcached server is open for business on port 11211" 20 | 21 | # Check for non-gdb mode 22 | if [ "$#" -eq 0 ]; then 23 | sudo -E numactl --cpunodebind=0 --membind=0 $exe \ 24 | --is_client 0 $(cat config) 25 | fi 26 | 27 | # Check for gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E gdb -ex run --args $exe --is_client 0 $(cat config) 30 | fi 31 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | blue "Dropping SHM regions" 9 | drop_shm 10 | exe="./write-bw" 11 | chmod +x $exe 12 | 13 | blue "Reset server QP registry" 14 | sudo pkill memcached 15 | 16 | # Spawn memcached, but wait for it to start 17 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 18 | while ! 
nc -z localhost 11211; do sleep .1; done 19 | echo "Server: memcached server is open for business on port 11211" 20 | 21 | # Check for non-gdb mode 22 | if [ "$#" -eq 0 ]; then 23 | sudo -E numactl --cpunodebind=0 --membind=0 $exe \ 24 | --is_client 0 $(cat config) 25 | fi 26 | 27 | # Check for gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E gdb -ex run --args $exe --is_client 0 $(cat config) 30 | fi 31 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | drop_shm 9 | exe="./write-flush" 10 | chmod +x $exe 11 | 12 | # Check number of arguments 13 | if [ "$#" -gt 2 ]; then 14 | blue "Illegal number of arguments." 15 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 16 | exit 17 | fi 18 | 19 | if [ "$#" -eq 0 ]; then 20 | blue "Illegal number of arguments." 
21 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 22 | exit 23 | fi 24 | 25 | # Check for non-gdb mode 26 | if [ "$#" -eq 1 ]; then 27 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 1 28 | fi 29 | 30 | # Check for gdb mode 31 | if [ "$#" -eq 2 ]; then 32 | sudo -E gdb -ex run --args $exe --is_client 1 33 | fi 34 | -------------------------------------------------------------------------------- /microbench/seq_read_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_read_tput(uint8_t *pbuf, size_t thread_id, size_t num_threads) { 4 | static constexpr size_t kReadSize = MB(256); 5 | auto *buf = new uint8_t[kReadSize]; 6 | 7 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 8 | struct timespec start; 9 | size_t sum = 0; 10 | 11 | for (size_t iter = 0; iter < 20; iter++) { 12 | clock_gettime(CLOCK_REALTIME, &start); 13 | 14 | // Generate a 64-byte aligned address to read kReadSize bytes from 15 | size_t start_address = roundup<64>(pcg() % kPmemFileSize); 16 | if (start_address + kReadSize >= kPmemFileSize) { 17 | iter--; 18 | continue; 19 | } 20 | 21 | memcpy(buf, &pbuf[start_address], kReadSize); 22 | sum += buf[pcg() % kReadSize]; 23 | 24 | double tot_sec = sec_since(start); 25 | printf("Thread %zu of %zu, seq read tput = %.2f GB/sec, sum = %zu\n", 26 | thread_id, num_threads, kReadSize * 1.0 / (GB(1) * tot_sec), sum); 27 | } 28 | 29 | delete[] buf; 30 | } 31 | -------------------------------------------------------------------------------- /mica_pmem/run.sh: -------------------------------------------------------------------------------- 1 | batch_size=16 2 | benchmark=5050 3 | sweep_optimizations=1 4 | pmem_file="/mnt/pmem12/raft_log" 5 | 6 | one_million=1048576 # Just a constant to adjust keys_total below 7 | keys_total=`expr 1024 \* $one_million` 8 | 9 | rm -rf /tmp/mica_bench* 10 | 11 | for num_threads in 1 2 4 8 16 24; do 12 | keys_per_thread=`expr 
$keys_total / $num_threads` 13 | 14 | # Non-GDB mode 15 | if [ "$#" -eq 0 ]; then 16 | numactl --cpunodebind=0 --membind=0 ./bench \ 17 | --table_key_capacity $keys_per_thread \ 18 | --batch_size $batch_size \ 19 | --benchmark $benchmark \ 20 | --pmem_file $pmem_file \ 21 | --sweep_optimizations $sweep_optimizations \ 22 | --num_threads $num_threads 23 | fi 24 | printf "\n\n" 25 | done 26 | 27 | num_threads=1 28 | # GDB mode 29 | if [ "$#" -eq 1 ]; then 30 | echo "do.sh: Launching process with GDB" 31 | num_keys=65536 32 | gdb -ex run --args ./bench \ 33 | --table_key_capacity $num_keys \ 34 | --batch_size $batch_size \ 35 | --benchmark $benchmark \ 36 | --pmem_file $pmem_file \ 37 | --num_threads $num_threads 38 | fi 39 | -------------------------------------------------------------------------------- /hopscotch_pmem/run.sh: -------------------------------------------------------------------------------- 1 | batch_size=16 2 | benchmark=get 3 | sweep_optimizations=1 4 | pmem_file="/mnt/pmem12/raft_log" 5 | 6 | one_million=1048576 # Just a constant to adjust keys_total below 7 | keys_total=`expr 64 \* $one_million` 8 | 9 | rm -rf /tmp/mica_bench* 10 | 11 | for num_threads in 1 2 4 8 16 24; do 12 | keys_per_thread=`expr $keys_total / $num_threads` 13 | 14 | # Non-GDB mode 15 | if [ "$#" -eq 0 ]; then 16 | numactl --cpunodebind=0 --membind=0 ./bench \ 17 | --table_key_capacity $keys_per_thread \ 18 | --batch_size $batch_size \ 19 | --benchmark $benchmark \ 20 | --pmem_file $pmem_file \ 21 | --sweep_optimizations $sweep_optimizations \ 22 | --num_threads $num_threads 23 | fi 24 | printf "\n\n" 25 | done 26 | 27 | num_threads=1 28 | # GDB mode 29 | if [ "$#" -eq 1 ]; then 30 | echo "do.sh: Launching process with GDB" 31 | num_keys=65536 32 | gdb -ex run --args ./bench \ 33 | --table_key_capacity $num_keys \ 34 | --batch_size $batch_size \ 35 | --benchmark $benchmark \ 36 | --pmem_file $pmem_file \ 37 | --num_threads $num_threads 38 | fi 39 | 
-------------------------------------------------------------------------------- /rdma/rdma-write-bw/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | # Check number of arguments 9 | if [ "$#" -gt 2 ]; then 10 | blue "Illegal number of arguments." 11 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 12 | exit 13 | fi 14 | 15 | if [ "$#" -eq 0 ]; then 16 | blue "Illegal number of arguments." 17 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 18 | exit 19 | fi 20 | 21 | machine_id=$1 22 | 23 | drop_shm 24 | exe="./write-bw" 25 | chmod +x $exe 26 | 27 | # Check for non-gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 1 \ 30 | --machine_id $machine_id $(cat config) 31 | fi 32 | 33 | # Check for gdb mode 34 | if [ "$#" -eq 2 ]; then 35 | sudo -E gdb -ex run --args $exe --is_client 1 \ 36 | --machine_id $machine_id $(cat config) 37 | fi 38 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | # Check number of arguments 9 | if [ "$#" -gt 2 ]; then 10 | blue "Illegal number of arguments." 11 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 12 | exit 13 | fi 14 | 15 | if [ "$#" -eq 0 ]; then 16 | blue "Illegal number of arguments." 
17 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 18 | exit 19 | fi 20 | 21 | machine_id=$1 22 | 23 | drop_shm 24 | exe="./main" 25 | chmod +x $exe 26 | 27 | # Check for non-gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E numactl --cpunodebind=0 --membind=0 $exe --is_client 1 \ 30 | --machine_id $machine_id $(cat config) 31 | fi 32 | 33 | # Check for gdb mode 34 | if [ "$#" -eq 2 ]; then 35 | sudo -E gdb -ex run --args $exe --is_client 1 \ 36 | --machine_id $machine_id $(cat config) 37 | fi 38 | -------------------------------------------------------------------------------- /hog/hog.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../common.h" 9 | 10 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 11 | static constexpr const char *kFileName = "/dev/dax0.0"; 12 | static constexpr size_t kPmemFileSize = GB(8); 13 | 14 | int main() { 15 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 16 | uint8_t *pbuf; 17 | size_t mapped_len; 18 | 19 | int is_pmem; 20 | pbuf = reinterpret_cast( 21 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 22 | 23 | rt_assert(pbuf != nullptr); 24 | rt_assert(mapped_len >= kPmemFileSize); 25 | 26 | size_t iter = 0; 27 | auto *buf = reinterpret_cast(malloc(kPmemFileSize)); 28 | 29 | while (true) { 30 | struct timespec start; 31 | clock_gettime(CLOCK_REALTIME, &start); 32 | pmem_memcpy_persist(pbuf, buf, kPmemFileSize); 33 | printf("Hog: iter = %zu, bandwidth = %.2f GB/s\n", iter, 34 | (kPmemFileSize * 1.0 / GB(1)) / sec_since(start)); 35 | iter++; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /hopscotch_pmem/LICENSE: -------------------------------------------------------------------------------- 1 | /*_ 2 | * Copyright (c) 2016 Hirochika Asai 3 | * All rights reserved. 
4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | * SOFTWARE. 
22 | */ 23 | 24 | -------------------------------------------------------------------------------- /nvme_perf/latency.sh: -------------------------------------------------------------------------------- 1 | perf_exe="/home/akalia/sandbox/spdk/examples/nvme/perf/perf" 2 | 3 | rm -f tmpout_* 4 | rm -rf final_out 5 | touch final_out 6 | 7 | # Last one wins 8 | bench=read # Sequential reads 9 | bench=randwrite # Random writes 10 | bench=write # Sequential writes 11 | bench=randread # Random reads 12 | 13 | echo "size us_avg us_median us_999 us_99" >> final_out 14 | 15 | for ((size = 512; size <= 65536; size *= 2)); do 16 | tmpfile="tmpout_$size" 17 | 18 | # -q: queue depth 19 | # -o: object size to write 20 | # -t: time in seconds 21 | # -c: core mask (core 24) 22 | # -L: generate histogram 23 | sudo numactl --cpunodebind=1 --membind=1 $perf_exe \ 24 | -q 1 -o $size -w $bench -t 2 -c 0x1000000 -L > $tmpfile 25 | 26 | us_avg=`cat $tmpfile | grep Total | sed -s 's/ */ /g' | cut -d ' ' -f 5` 27 | us_median=`cat $tmpfile | grep "50\.00000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 28 | us_99=`cat $tmpfile | grep "99\.00000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 29 | us_999=`cat $tmpfile | grep "99\.90000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 30 | 31 | echo $size $us_avg $us_median $us_999 $us_99 32 | echo $size $us_avg $us_median $us_999 $us_99 >> final_out 33 | done 34 | 35 | cat final_out 36 | rm -f tmpout_* 37 | rm -rf final_out 38 | -------------------------------------------------------------------------------- /ioat/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_prints=3 4 | use_ioat=1 5 | use_pmem=1 6 | numa_node=0 7 | 8 | stat_file=$(mktemp) 9 | out_file=$(mktemp) 10 | 11 | function sweep_num_ioat_engines() { 12 | window_sizes="1 8" 13 | echo "size $window_sizes" > ${stat_file} # Stats file header 14 | 15 | for size in 1024 2048 4096 8192 16384 32768 65536 131072; do 16 | 
stat_str="$size" # Saved in stat_file at the end of a window 17 | for window_size in $window_sizes; do 18 | sudo -E env numactl --cpunodebind=$numa_node --membind=$numa_node ./bench \ 19 | --num_prints $num_prints \ 20 | --use_ioat $use_ioat \ 21 | --use_pmem $use_pmem \ 22 | --numa_node $numa_node \ 23 | --size $size \ 24 | --window_size $window_size 1>${out_file} 2>${out_file} 25 | 26 | # The last num_prints lines of out_file are formatted like: 27 | # 10.2 GB/s 28 | avg=`cat ${out_file} | tail -$num_prints | cut -d' ' -f 1 | avg.awk` 29 | echo "size $size, window size $window_size, tput $avg GB/s" 30 | 31 | stat_str="$stat_str $avg" 32 | done 33 | 34 | echo "Saving $stat_str to ${stat_file}" 35 | echo $stat_str >> ${stat_file} 36 | done 37 | 38 | echo "Results for: use_ioat $use_ioat, use_pmem $use_pmem" 39 | cat ${stat_file} 40 | } 41 | 42 | sweep_num_ioat_engines 43 | -------------------------------------------------------------------------------- /pmemkv_perf/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../common.h" 3 | #include "/home/akalia/sandbox/pmemkv/src/pmemkv.h" 4 | 5 | #define LOG(msg) std::cout << msg << "\n" 6 | 7 | using namespace pmemkv; 8 | 9 | int main() { 10 | LOG("Opening datastore"); 11 | KVEngine* kv = 12 | KVEngine::Open("kvtree3", "/dev/dax0.0", 1073741824); // 1 GB pool 13 | assert(kv != nullptr); 14 | 15 | struct timespec start; 16 | clock_gettime(CLOCK_REALTIME, &start); 17 | for (size_t i = 0; i < 100000; i++) { 18 | std::string k = std::to_string(i); 19 | std::string v = std::to_string(i); 20 | kv->Put(k, v); 21 | kv->Put(k, v); 22 | kv->Put(k, v); 23 | } 24 | 25 | double seconds = sec_since(start); 26 | printf("seconds = %.2f\n", seconds); 27 | 28 | LOG("Putting new key"); 29 | KVStatus s = kv->Put("key1", "value1"); 30 | assert(s == OK && kv->Count() == 1); 31 | 32 | LOG("Reading key back"); 33 | string value; 34 | s = kv->Get("key1", &value); 35 | assert(s 
== OK && value == "value1"); 36 | 37 | LOG("Iterating existing keys"); 38 | kv->Put("key2", "value2"); 39 | kv->Put("key3", "value3"); 40 | kv->All([](int, const char* k) { LOG(" visited: " << k); }); 41 | 42 | LOG("Removing existing key"); 43 | s = kv->Remove("key1"); 44 | assert(s == OK && !kv->Exists("key1")); 45 | 46 | LOG("Closing datastore"); 47 | delete kv; 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /utils/hdr_histogram_wrapper.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // A wrapper for hdr_histogram that supports floating point values with 4 | // magnified precision. A floating point record x is inserted as x * AMP. 5 | template 6 | class HdrHistogramAmp { 7 | public: 8 | HdrHistogramAmp(int64_t min, int64_t max, uint32_t precision) { 9 | int ret = hdr_init(min * AMP, max * AMP, precision, &hist); 10 | rt_assert(ret == 0); 11 | } 12 | 13 | ~HdrHistogramAmp() { hdr_close(hist); } 14 | 15 | inline void record_value(double v) { hdr_record_value(hist, v * AMP); } 16 | 17 | double percentile(double p) { 18 | return hdr_value_at_percentile(hist, p) / (AMP * 1.0); 19 | } 20 | 21 | void reset() { hdr_reset(hist); } 22 | 23 | hdr_histogram *get_raw_hist() { return hist; } 24 | 25 | private: 26 | hdr_histogram *hist = nullptr; 27 | }; 28 | 29 | // A conveinince wrapper for hdr_histogram 30 | class HdrHistogram { 31 | public: 32 | HdrHistogram(int64_t min, int64_t max, int precision) { 33 | int ret = hdr_init(min, max, precision, &hist); 34 | rt_assert(ret == 0); 35 | } 36 | 37 | ~HdrHistogram() { hdr_close(hist); } 38 | 39 | inline void record_value(size_t v) { 40 | hdr_record_value(hist, static_cast(v)); 41 | } 42 | 43 | size_t percentile(double p) const { 44 | return static_cast(hdr_value_at_percentile(hist, p)); 45 | } 46 | 47 | void reset() { hdr_reset(hist); } 48 | 49 | hdr_histogram *get_raw_hist() { return hist; } 50 | 51 | private: 52 
| hdr_histogram *hist = nullptr; 53 | } 54 | -------------------------------------------------------------------------------- /microbench/rand_write_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_write_tput(uint8_t *pbuf, size_t thread_id, size_t copy_sz, 4 | size_t num_threads) { 5 | static constexpr size_t kBatchSize = 8; 6 | static constexpr size_t kNumIters = GB(64); 7 | 8 | // Write to non-overlapping addresses 9 | const size_t bytes_per_thread = kPmemFileSize / num_threads; 10 | const size_t base_addr = thread_id * bytes_per_thread; 11 | 12 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 13 | struct timespec start; 14 | 15 | auto *copy_arr = new uint8_t[copy_sz]; 16 | for (size_t i = 0; i < copy_sz; i++) copy_arr[i] = pcg(); 17 | 18 | for (size_t iter = 0; iter < 1; iter++) { 19 | clock_gettime(CLOCK_REALTIME, &start); 20 | 21 | for (size_t i = 0; i < kNumIters / kBatchSize; i++) { 22 | size_t offset[kBatchSize]; 23 | for (size_t j = 0; j < kBatchSize; j++) { 24 | offset[j] = base_addr + (pcg() % bytes_per_thread); 25 | offset[j] = roundup<256>(offset[j]); 26 | if (offset[j] + copy_sz >= kPmemFileSize) { 27 | j--; 28 | continue; 29 | } 30 | pmem_memcpy_nodrain(&pbuf[offset[j]], copy_arr, copy_sz); 31 | } 32 | pmem_drain(); 33 | } 34 | 35 | double tot_sec = sec_since(start); 36 | double rate = kNumIters / tot_sec; 37 | double tput_GBps = kNumIters * copy_sz / (1000000000 * tot_sec); 38 | 39 | printf("Thread %zu of %zu, size %zu: rand writes: (%.2f M/s, %.2f GB/s)\n", 40 | thread_id, num_threads, copy_sz, rate / 1000000, tput_GBps); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /microbench/bench.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file bench.h 3 | * @brief Common code shared by benchmark implementations in header files 4 | */ 5 | 6 | #pragma once 7 | 8 | 
#include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "../common.h" 25 | #include "../utils/timer.h" 26 | 27 | DEFINE_uint64(num_threads, 0, "Number of threads"); 28 | 29 | // static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 30 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 31 | 32 | static constexpr size_t kPmemFileSize = GB(32); 33 | 34 | static constexpr bool kMeasureLatency = false; 35 | double freq_ghz = 0.0; 36 | static size_t align64(size_t x) { return x - x % 64; } 37 | 38 | static constexpr int kHdrPrecision = 2; // Precision for hdr histograms 39 | static constexpr int kMinPmemLatCycles = 1; // Min pmem latency in cycles 40 | static constexpr int kMaxPmemLatCycles = MB(1); // Max pmem latency in cycles 41 | 42 | static constexpr size_t kNumaNode = 0; 43 | 44 | /// Get a random offset in the file with at least \p space after it 45 | size_t get_random_offset_with_space(pcg64_fast &pcg, size_t space) { 46 | size_t iters = 0; 47 | while (true) { 48 | size_t rand_offset = pcg() % kPmemFileSize; 49 | if (kPmemFileSize - rand_offset > space) return rand_offset; 50 | iters++; 51 | if (iters > 2) printf("Random offset took over 2 iters\n"); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /microbench/rand_read_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_read_tput(uint8_t *pbuf, size_t thread_id, const size_t copy_sz, 4 | size_t num_threads) { 5 | static constexpr size_t kNumIters = MB(4); 6 | assert(copy_sz == 64 || copy_sz == 256 || copy_sz == 512 || copy_sz == 1024); 7 | 8 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 9 | struct timespec start; 10 | size_t sum = 0; 11 | 12 | for (size_t iter = 0; iter < 5; iter++) { 13 | 
clock_gettime(CLOCK_REALTIME, &start); 14 | 15 | if (copy_sz == 64) { 16 | for (size_t i = 0; i < kNumIters; i++) { 17 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 18 | sum += pbuf[offset]; 19 | } 20 | } else if (copy_sz == 256) { 21 | for (size_t i = 0; i < kNumIters; i++) { 22 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 23 | for (size_t cl = 0; cl < 4; cl++) { 24 | sum += pbuf[offset + cl * 64]; 25 | } 26 | } 27 | } else if (copy_sz == 512) { 28 | for (size_t i = 0; i < kNumIters; i++) { 29 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 30 | for (size_t cl = 0; cl < 8; cl++) { 31 | sum += pbuf[offset + cl * 64]; 32 | } 33 | } 34 | } else if (copy_sz == 1024) { 35 | for (size_t i = 0; i < kNumIters; i++) { 36 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 37 | for (size_t cl = 0; cl < 16; cl++) { 38 | sum += pbuf[offset + cl * 64]; 39 | } 40 | } 41 | } 42 | 43 | double tot_sec = sec_since(start); 44 | double rate = kNumIters / tot_sec; 45 | printf( 46 | "Thread %zu of %zu, copy sz %zu: random read tput = %.2f M/sec, " 47 | "sum = %zu\n", 48 | thread_id, num_threads, copy_sz, rate / 1000000, sum); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /microbench/write_latency/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../../common.h" 12 | 13 | static constexpr size_t kWriteSize = 512; 14 | static constexpr size_t kNumIters = 1000000; 15 | 16 | int main() { 17 | uint8_t *data = reinterpret_cast(memalign(4096, kWriteSize)); 18 | 19 | size_t mapped_len; 20 | int is_pmem; 21 | uint8_t *pbuf = reinterpret_cast( 22 | pmem_map_file("/dev/dax0.0", 0, 0, 0666, &mapped_len, &is_pmem)); 23 | assert(pbuf != nullptr); 24 | assert(mapped_len >= kWriteSize * kNumIters); 25 | 26 | size_t file_offset = 0; 27 | 
std::vector latency_vec; 28 | latency_vec.reserve(kNumIters); 29 | 30 | for (size_t msr = 0; msr < 10; msr++) { 31 | // Initialize measurement 32 | latency_vec.clear(); 33 | struct timespec bench_start; 34 | clock_gettime(CLOCK_REALTIME, &bench_start); 35 | 36 | // Real work 37 | for (size_t i = 0; i < kNumIters; i++) { 38 | size_t start_tsc = rdtsc(); 39 | mfence(); 40 | pmem_memmove_persist(&pbuf[file_offset], data, kWriteSize); 41 | mfence(); 42 | 43 | latency_vec.push_back(rdtsc() - start_tsc); 44 | 45 | file_offset += kWriteSize; 46 | if (file_offset + kWriteSize >= mapped_len) file_offset = 0; 47 | } 48 | 49 | double bench_seconds = sec_since(bench_start); 50 | printf("Throughput of writes = %.2f M ops/s, %.2f GB/s\n", 51 | kNumIters / (bench_seconds * 1000000), 52 | kNumIters * kWriteSize / (bench_seconds * 1000000000)); 53 | 54 | std::sort(latency_vec.begin(), latency_vec.end()); 55 | printf("Latency (cycles): median %zu, 99%% %zu, 99.9%% %zu\n", 56 | latency_vec.at(kNumIters * .5), latency_vec.at(kNumIters * .99), 57 | latency_vec.at(kNumIters * .999)); 58 | } 59 | 60 | pmem_unmap(pbuf, mapped_len); 61 | exit(0); 62 | } 63 | -------------------------------------------------------------------------------- /microbench/seq_write_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_write_tput(uint8_t *pbuf, size_t thread_id, size_t copy_sz, 4 | double *avg_tput_GBps) { 5 | // We perform multiple measurements. In each measurement, a thread writes 6 | // kCopyPerThreadPerMsr bytes in copy_sz chunks. 
7 | static constexpr size_t kNumMsr = 1; 8 | static constexpr size_t kCopyPerThreadPerMsr = GB(1); 9 | rt_assert(kCopyPerThreadPerMsr % copy_sz == 0, "Unaligned copy size"); 10 | 11 | void *dram_src_buf = memalign(4096, copy_sz); 12 | memset(dram_src_buf, 0, copy_sz); 13 | 14 | // Each thread write to non-overlapping addresses 15 | const size_t excl_bytes_per_thread = kPmemFileSize / FLAGS_num_threads; 16 | const size_t base_offset = roundup<256>(thread_id * excl_bytes_per_thread); 17 | 18 | // We begin copies from a random aligned offset in the file. This prevents 19 | // multiple calls from writing to the same file region. std::random_device 20 | // produces a non-deterministic seed. 21 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 22 | size_t offset = base_offset + (pcg() % excl_bytes_per_thread); 23 | offset = roundup<256>(offset); 24 | 25 | double tput_sum_GBps = 0; // Used to compute average througput at the end 26 | 27 | for (size_t msr = 0; msr < kNumMsr; msr++) { 28 | struct timespec start; 29 | clock_gettime(CLOCK_REALTIME, &start); 30 | 31 | for (size_t i = 0; i < kCopyPerThreadPerMsr / copy_sz; i++) { 32 | pmem_memmove_persist(&pbuf[offset], dram_src_buf, copy_sz); 33 | offset += copy_sz; 34 | if (offset + copy_sz >= base_offset + excl_bytes_per_thread) { 35 | offset = base_offset; 36 | } 37 | } 38 | 39 | double tot_sec = sec_since(start); 40 | double tput_GBps = kCopyPerThreadPerMsr / (tot_sec * 1000000000); 41 | printf("Thread %zu: copy_sz %zu, %.2f GB/s. 
Offset = %zu\n", thread_id, 42 | copy_sz, tput_GBps, offset); 43 | tput_sum_GBps += tput_GBps; 44 | } 45 | 46 | *avg_tput_GBps = tput_sum_GBps / kNumMsr; 47 | free(dram_src_buf); 48 | } 49 | -------------------------------------------------------------------------------- /randomizer/main.cc: -------------------------------------------------------------------------------- 1 | // This can be used to write random contents to a pmem file so that later 2 | // experiments don't benefit from any crazy value prediction of a zeroed file. 3 | 4 | #include 5 | #include 6 | #include "../common.h" 7 | 8 | static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 9 | static constexpr size_t kPmemFileSizeGB = 512; // The expected file size 10 | static constexpr size_t kPmemFileSize = kPmemFileSizeGB * GB(1); 11 | static constexpr size_t kRandTemplateSz = GB(32); 12 | 13 | int main(int, char **) { 14 | uint8_t *pbuf; 15 | size_t mapped_len; 16 | int is_pmem; 17 | 18 | pbuf = reinterpret_cast(pmem_map_file( 19 | kPmemFile, 0 /* length */, 0 /* flags */, 0666, &mapped_len, &is_pmem)); 20 | 21 | rt_assert(pbuf != nullptr, 22 | "pmem_map_file() failed. 
" + std::string(strerror(errno))); 23 | rt_assert(mapped_len >= kPmemFileSize, 24 | "pmem file too small " + std::to_string(mapped_len)); 25 | rt_assert(reinterpret_cast(pbuf) % 4096 == 0, 26 | "Mapped buffer isn't page-aligned"); 27 | rt_assert(is_pmem == 1, "File is not pmem"); 28 | 29 | printf("Generating random contents\n"); 30 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 31 | size_t *rand_buf = reinterpret_cast(malloc(kRandTemplateSz)); 32 | for (size_t i = 0; i < kRandTemplateSz / sizeof(size_t); i++) { 33 | rand_buf[i] = pcg(); 34 | } 35 | 36 | printf("Writing random contents to the whole file.\n"); 37 | rt_assert(kPmemFileSize % kRandTemplateSz == 0); 38 | 39 | for (size_t i = 0; i < kPmemFileSize; i += kRandTemplateSz) { 40 | struct timespec start; 41 | clock_gettime(CLOCK_REALTIME, &start); 42 | pmem_memcpy_persist(&pbuf[i], rand_buf, kRandTemplateSz); 43 | printf("Fraction complete = %.2f. Took %.3f sec for %zu GB.\n", 44 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), 45 | kRandTemplateSz / GB(1)); 46 | } 47 | 48 | printf("Done writing.\n"); 49 | 50 | pmem_unmap(pbuf, mapped_len); 51 | exit(0); 52 | } 53 | -------------------------------------------------------------------------------- /log_store/rotating_counter.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | class Counter { 9 | public: 10 | static constexpr size_t kNumBuffers = 16; 11 | static constexpr size_t kBufferSize = 256; 12 | 13 | /** 14 | * @brief Construct a counter 15 | * 16 | * @param pbuf The start address of the counter on persistent memory 17 | * 18 | * @param create_new If true, the counter is reset to zero. If false, the 19 | * counter is initialized using the prior pmem contents. 
20 | */ 21 | Counter(uint8_t *pbuf, bool create_new) : ctr_base_addr(pbuf) { 22 | if (create_new) { 23 | pmem_memset_persist(pbuf, 0, kNumBuffers * kBufferSize); 24 | } else { 25 | size_t cur_max = 0; // Maximum value among the counters 26 | size_t cur_max_i = 0; // Index of the maximum value 27 | for (size_t i = 0; i < kNumBuffers; i++) { 28 | size_t *counter_i = reinterpret_cast(&pbuf[i * kBufferSize]); 29 | if (*counter_i > cur_max) { 30 | cur_max = *counter_i; 31 | cur_max_i = i; 32 | } 33 | } 34 | 35 | v_value = cur_max; 36 | buffer_idx = (cur_max_i + 1) % kNumBuffers; 37 | } 38 | } 39 | 40 | Counter() {} 41 | 42 | /// The amount of contiguous pmem needed for this counter 43 | static size_t get_reqd_space() { return kNumBuffers * kBufferSize; } 44 | 45 | // Increment by always writing to the same location 46 | inline void increment_naive(size_t increment) { 47 | v_value += increment; 48 | pmem_memcpy_persist(&ctr_base_addr[0], &v_value, sizeof(v_value)); 49 | } 50 | 51 | // Increment by writing to rotating locations, but don't do full-cacheline 52 | // writes 53 | inline void increment_rotate(size_t increment) { 54 | v_value += increment; 55 | pmem_memcpy_persist(&ctr_base_addr[buffer_idx * kBufferSize], &v_value, 56 | sizeof(v_value)); 57 | buffer_idx = (buffer_idx + 1) % kNumBuffers; 58 | } 59 | 60 | size_t v_value = 0; // Volatile value of the counter 61 | 62 | size_t buffer_idx = 0; 63 | uint8_t *ctr_base_addr = nullptr; // Starting address of the counter on pmem 64 | }; 65 | -------------------------------------------------------------------------------- /microbench/rand_write_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_write_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr size_t kWriteBytes = MB(64); 7 | static constexpr size_t kMinIters = 50000; 8 | static constexpr size_t kMinWriteSz = 64; 9 | static constexpr 
size_t kMaxWriteSz = KB(64); 10 | 11 | size_t file_offset = 0; 12 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 13 | 14 | static_assert(kWriteBytes / kMinWriteSz >= kMinIters, ""); 15 | std::vector latency_vec; 16 | latency_vec.reserve(kWriteBytes / kMinWriteSz); 17 | 18 | uint8_t *data = reinterpret_cast(memalign(4096, kMaxWriteSz)); 19 | 20 | for (size_t msr = 0; msr < 10; msr++) { 21 | printf("size avg_ns 50_ns 999_ns\n"); 22 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 23 | 24 | for (size_t size = kMinWriteSz; size <= kMaxWriteSz; size *= 2) { 25 | struct timespec start_time; 26 | clock_gettime(CLOCK_REALTIME, &start_time); 27 | 28 | latency_vec.clear(); 29 | const size_t num_iters = 30 | kWriteBytes / size <= kMinIters ? kMinIters : kWriteBytes / size; 31 | 32 | for (size_t i = 0; i < num_iters; i++) { 33 | file_offset = roundup<64>(pcg() % kPmemFileSize); 34 | 35 | size_t start_tsc = timer::Start(); 36 | pmem_memmove_persist(&pbuf[file_offset], data, size); 37 | 38 | latency_vec.push_back(timer::Stop() - start_tsc); 39 | } 40 | 41 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 42 | size_t ns_avg_rdtsc = 43 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 44 | (latency_vec.size() * freq_ghz); 45 | verify_tsc_str << size << ": Average latency (ns) " << ns_avg_realtime 46 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 47 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta) " 48 | << "\n"; 49 | 50 | std::sort(latency_vec.begin(), latency_vec.end()); 51 | printf("%zu %zu %.1f %.1f\n", size, ns_avg_realtime, 52 | latency_vec.at(num_iters * .50) / freq_ghz, 53 | latency_vec.at(num_iters * .999) / freq_ghz); 54 | } 55 | 56 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /microbench/read_latency/bench.cc: -------------------------------------------------------------------------------- 1 
| #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static constexpr size_t kNumIters = 1000000; 13 | static constexpr size_t kFileSizeGB = 512; 14 | static constexpr size_t kFileSizeBytes = (1ull << 30) * kFileSizeGB; 15 | // static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 16 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 17 | 18 | inline uint32_t fastrand(uint64_t &seed) { 19 | seed = seed * 1103515245 + 12345; 20 | return static_cast(seed >> 32); 21 | } 22 | 23 | /// Return nanoseconds elapsed since timestamp \p t0 24 | static double ns_since(const struct timespec &t0) { 25 | struct timespec t1; 26 | clock_gettime(CLOCK_REALTIME, &t1); 27 | return (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec); 28 | } 29 | 30 | // Used for shuffle-based pointer chain measurement 31 | struct cacheline_t { 32 | cacheline_t *ptr; 33 | size_t pad[7]; 34 | }; 35 | static_assert(sizeof(cacheline_t) == 64, ""); 36 | 37 | int main() { 38 | if (getuid() != 0) { 39 | // Mapping devdax files needs root perms for now 40 | printf("You need to be root to run this benchmark\n"); 41 | exit(-1); 42 | } 43 | 44 | printf("Measuring random read latency with buffer size = %zu GB\n", 45 | kFileSizeGB); 46 | 47 | size_t mapped_len; 48 | int is_pmem; 49 | uint8_t *pbuf = reinterpret_cast( 50 | pmem_map_file(kPmemFile, 0, 0, 0666, &mapped_len, &is_pmem)); 51 | assert(pbuf != nullptr); 52 | assert(mapped_len >= kFileSizeBytes); 53 | assert(is_pmem == 1); 54 | 55 | size_t sum = 0; 56 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 57 | 58 | for (size_t msr = 0; msr < 10; msr++) { 59 | // Initialize measurement 60 | struct timespec bench_start; 61 | clock_gettime(CLOCK_REALTIME, &bench_start); 62 | 63 | // Real work 64 | for (size_t i = 0; i < kNumIters; i++) { 65 | size_t file_offset = (sum + pcg()) % kFileSizeBytes; 66 | sum += pbuf[file_offset]; // Make the next read 
dependent 67 | } 68 | 69 | double bench_ns = ns_since(bench_start); 70 | printf("Average read latency = %.1f ns, sum = %zu\n", bench_ns / kNumIters, 71 | sum); 72 | } 73 | 74 | pmem_unmap(pbuf, mapped_len); 75 | exit(0); 76 | } 77 | -------------------------------------------------------------------------------- /cacheline_versions/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../common.h" 12 | #include "../utils/timer.h" 13 | #include "config.h" 14 | 15 | DEFINE_uint64(use_pmem, 1, "Use persistent memory"); 16 | DEFINE_uint64(object_size, KB(4), "Size of objects"); 17 | 18 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 19 | static constexpr const char *kFileName = "/dev/dax0.0"; 20 | static constexpr bool kUsePmem = true; 21 | static constexpr size_t kFileSize = GB(32); 22 | 23 | int main(int argc, char **argv) { 24 | gflags::ParseCommandLineFlags(&argc, &argv, true); 25 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 26 | 27 | uint8_t *pbuf; 28 | size_t mapped_len; 29 | 30 | if (FLAGS_use_pmem == 1) { 31 | printf("Using persistent memory buffer, size %zu\n", FLAGS_object_size); 32 | int is_pmem; 33 | pbuf = reinterpret_cast( 34 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 35 | 36 | rt_assert(pbuf != nullptr); 37 | rt_assert(mapped_len >= kFileSize); 38 | } else { 39 | printf("Using volatile memory buffer\n"); 40 | pbuf = reinterpret_cast(malloc(kFileSize)); 41 | } 42 | 43 | size_t iter = 0; 44 | size_t sum = 0; 45 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 46 | 47 | while (true) { 48 | size_t rand = pcg(); 49 | size_t offset = roundup<64>(rand % kFileSize); 50 | if (offset + FLAGS_object_size >= kFileSize) continue; 51 | 52 | uint8_t *obj = &pbuf[offset]; 53 | for (size_t i = 0; i < FLAGS_object_size / 64; 
i++) sum += obj[i * 64]; 54 | 55 | struct timespec bench_start; 56 | clock_gettime(CLOCK_REALTIME, &bench_start); 57 | 58 | for (size_t i = 0; i < FLAGS_object_size / 64; i++) obj[i * 64] = iter % 2; 59 | memset(obj, iter, FLAGS_object_size); 60 | for (size_t i = 0; i < FLAGS_object_size / 64; i++) obj[i * 64] = iter % 3; 61 | 62 | printf("Object size %zu, time = %.2f us, bw = %.2f GB/s, size %zu\n", 63 | FLAGS_object_size, sec_since(bench_start) * 1000000, 64 | FLAGS_object_size / (1024 * 1024 * 1024.0 * sec_since(bench_start)), 65 | FLAGS_object_size); 66 | 67 | iter++; 68 | } 69 | 70 | if (kUsePmem) pmem_unmap(pbuf, mapped_len); 71 | } 72 | -------------------------------------------------------------------------------- /microbench/rand_read_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_read_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr bool kMeasurePercentiles = false; 7 | static constexpr size_t kReadBytes = MB(128); 8 | static constexpr size_t kMinIters = 50000; 9 | static constexpr size_t kMinReadSz = 64; 10 | static constexpr size_t kMaxReadSz = KB(64); 11 | 12 | size_t file_offset = 0; 13 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 14 | 15 | static_assert(kReadBytes / kMinReadSz >= kMinIters, ""); 16 | std::vector latency_vec; 17 | latency_vec.reserve(kReadBytes / kMinReadSz); 18 | 19 | size_t sum = 0; 20 | 21 | for (size_t msr = 0; msr < 10; msr++) { 22 | printf("size avg_ns 50_ns 999_ns\n"); 23 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 24 | 25 | for (size_t size = kMinReadSz; size <= kMaxReadSz; size *= 2) { 26 | struct timespec start_time; 27 | clock_gettime(CLOCK_REALTIME, &start_time); 28 | 29 | latency_vec.clear(); 30 | const size_t num_iters = 31 | kReadBytes / size <= kMinIters ? 
kMinIters : kReadBytes / size; 32 | 33 | for (size_t i = 0; i < num_iters; i++) { 34 | size_t rand = sum + pcg(); 35 | file_offset = roundup<64>(rand % kPmemFileSize); 36 | 37 | size_t start_tsc; 38 | if (kMeasurePercentiles) start_tsc = timer::Start(); 39 | for (size_t j = 0; j < size; j += 64) { 40 | sum += pbuf[file_offset + j]; 41 | } 42 | 43 | if (kMeasurePercentiles) { 44 | latency_vec.push_back(timer::Stop() - start_tsc); 45 | } 46 | } 47 | 48 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 49 | 50 | if (kMeasurePercentiles) { 51 | std::sort(latency_vec.begin(), latency_vec.end()); 52 | printf("%zu %zu %.1f %.1f\n", size, ns_avg_realtime, 53 | latency_vec.at(num_iters * .50) / freq_ghz, 54 | latency_vec.at(num_iters * .999) / freq_ghz); 55 | 56 | size_t ns_avg_rdtsc = 57 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 58 | (latency_vec.size() * freq_ghz); 59 | verify_tsc_str << size << ": Average latency (ns) " << ns_avg_realtime 60 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 61 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta) " 62 | << "\n"; 63 | } else { 64 | printf("%zu %zu -1.0 -1.0\n", size, ns_avg_realtime); 65 | } 66 | } 67 | 68 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 69 | printf("sum = %zu\n", sum); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /circular_writes_tput/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "../common.h" 13 | #include "../utils/timer.h" 14 | #include "config.h" 15 | 16 | // Config parameters: 17 | // FLAGS_num_counters: Number of counters emulating one counter 18 | // FLAGS_stride_size: Distance between counters 19 | 20 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 21 | static 
constexpr const char *kFileName = "/dev/dax0.0"; 22 | static constexpr size_t kNumIters = 1000000; 23 | static constexpr bool kUsePmem = true; 24 | static constexpr bool kUseNtStore = true; 25 | 26 | DEFINE_uint64(num_counters, 16, "Number of counters to rotate on"); 27 | DEFINE_uint64(stride_size, 256, "Stride size"); 28 | 29 | int main(int argc, char **argv) { 30 | gflags::ParseCommandLineFlags(&argc, &argv, true); 31 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 32 | rt_assert(FLAGS_stride_size >= sizeof(size_t), ""); 33 | rt_assert(FLAGS_stride_size % sizeof(size_t) == 0, ""); 34 | 35 | uint8_t *pbuf; 36 | size_t mapped_len; 37 | 38 | if (kUsePmem) { 39 | printf("Using persistent memory buffer\n"); 40 | int is_pmem; 41 | pbuf = reinterpret_cast( 42 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 43 | 44 | rt_assert(pbuf != nullptr); 45 | rt_assert(mapped_len >= FLAGS_num_counters * FLAGS_stride_size); 46 | } else { 47 | printf("Using DRAM buffer\n"); 48 | pbuf = reinterpret_cast( 49 | malloc(FLAGS_num_counters * FLAGS_stride_size)); 50 | } 51 | 52 | size_t counter_val = 1; 53 | size_t counter_idx = 0; 54 | for (size_t msr = 0; msr < 5; msr++) { 55 | struct timespec bench_start; 56 | clock_gettime(CLOCK_REALTIME, &bench_start); 57 | 58 | for (size_t i = 0; i < kNumIters; i++) { 59 | size_t buffer_offset = counter_idx * FLAGS_stride_size; 60 | 61 | if (kUseNtStore) { 62 | pmem_memcpy_persist(&pbuf[buffer_offset], &counter_val, sizeof(size_t)); 63 | } else { 64 | *reinterpret_cast(&pbuf[buffer_offset]) = counter_val; 65 | pmem_clwb(&pbuf[buffer_offset]); 66 | sfence(); 67 | } 68 | 69 | counter_idx++; 70 | if (counter_idx == FLAGS_num_counters) counter_idx = 0; 71 | counter_val++; 72 | } 73 | 74 | printf("num_counters %zu, stride size %zu: %.2f M/s.\n", FLAGS_num_counters, 75 | FLAGS_stride_size, kNumIters / (sec_since(bench_start) * 1000000)); 76 | } 77 | 78 | if (kUsePmem) pmem_unmap(pbuf, mapped_len); 79 | } 80 | 
-------------------------------------------------------------------------------- /ioat/virt2phy.h: -------------------------------------------------------------------------------- 1 | // Credits: DPDK 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../common.h" 15 | 16 | /** 17 | * @brief A class to translate any mapped virtual address in the current process 18 | * to its physical address. 19 | * 20 | * Requires root access. 21 | */ 22 | class Virt2Phy { 23 | static constexpr size_t kPfnMaskSize = 8; 24 | 25 | public: 26 | Virt2Phy() { 27 | fd = open("/proc/self/pagemap", O_RDONLY); 28 | if (fd < 0) { 29 | printf("%s(): cannot open /proc/self/pagemap\n", strerror(errno)); 30 | exit(-1); 31 | } 32 | 33 | page_size = static_cast(getpagesize()); // Standard page size 34 | } 35 | 36 | ~Virt2Phy() { close(fd); } 37 | 38 | /** 39 | * @brief Return the physical address of this virtual address 40 | * @return The physical address on success, zero on failure 41 | */ 42 | uint64_t translate(const void *virtaddr) { 43 | auto virt_pfn = static_cast( 44 | reinterpret_cast(virtaddr) / page_size); 45 | size_t offset = sizeof(uint64_t) * virt_pfn; 46 | 47 | uint64_t page; 48 | int ret = pread(fd, &page, kPfnMaskSize, static_cast(offset)); 49 | 50 | if (ret < 0) { 51 | fprintf(stderr, "cannot read /proc/self/pagemap: %s\n", strerror(errno)); 52 | return 0; 53 | } else if (ret != static_cast(kPfnMaskSize)) { 54 | fprintf(stderr, 55 | "read %d bytes from /proc/self/pagemap but expected %zu:\n", ret, 56 | kPfnMaskSize); 57 | return 0; 58 | } 59 | 60 | // The pfn (page frame number) are bits 0-54 (see pagemap.txt in linux 61 | // Documentation) 62 | if ((page & 0x7fffffffffffffULL) == 0) return 0; 63 | 64 | uint64_t physaddr = ((page & 0x7fffffffffffffULL) * page_size) + 65 | (reinterpret_cast(virtaddr) % page_size); 66 | 67 | return physaddr; 68 | } 69 | 70 | private: 71 | 
int fd; 72 | size_t page_size; 73 | }; 74 | 75 | class HugepageCachingVirt2Phy { 76 | public: 77 | uint64_t translate(void *_va) { 78 | uint64_t va = reinterpret_cast(_va); 79 | uint64_t va_2MB = (va & ~(MB(2) - 1)); 80 | 81 | auto result = v2p_cache.find(va_2MB); 82 | if (likely(result != v2p_cache.end())) { 83 | return result->second + (va % MB(2)); 84 | } 85 | 86 | // Here, we have a cache miss 87 | uint64_t phy_addr = v2p.translate(reinterpret_cast(va_2MB)); 88 | v2p_cache.emplace(va_2MB, phy_addr); 89 | 90 | return phy_addr + (va % MB(2)); 91 | } 92 | 93 | private: 94 | Virt2Phy v2p; 95 | std::unordered_map v2p_cache; 96 | }; 97 | -------------------------------------------------------------------------------- /mica_pmem/test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "pmica.h" 5 | 6 | static constexpr size_t kDefaultFileOffset = 1024; 7 | static constexpr const char* kPmemFile = "/mnt/pmem12/raft_log"; 8 | 9 | TEST(Basic, Simple) { 10 | size_t num_keys = 32; 11 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 12 | num_keys, 1.0); 13 | 14 | size_t key, value; 15 | 16 | key = 1; 17 | value = 1; 18 | bool success = hashmap.set_nodrain(&key, &value); 19 | assert(success); 20 | 21 | key = 2; 22 | value = 2; 23 | success = hashmap.set_nodrain(&key, &value); 24 | assert(success); 25 | 26 | success = hashmap.set_nodrain(&key, &value); 27 | assert(success); 28 | 29 | key = 3; 30 | value = 3; 31 | success = hashmap.set_nodrain(&key, &value); 32 | assert(success); 33 | 34 | key = 1; 35 | value = 0; 36 | success = hashmap.get(&key, &value); 37 | assert(value == 1); 38 | assert(success); 39 | 40 | key = 2; 41 | value = 0; 42 | success = hashmap.get(&key, &value); 43 | assert(value == 2); 44 | assert(success); 45 | 46 | key = 4; 47 | value = 0; 48 | success = hashmap.get(&key, &value); 49 | assert(value == 0); 50 | assert(!success); 51 | } 52 | 53 | TEST(Basic, Overload) { 
54 | size_t num_keys = 32; 55 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 56 | num_keys, 1.0); 57 | 58 | std::map insert_success_map; 59 | size_t num_success = 0; 60 | 61 | for (size_t i = 1; i <= num_keys; i++) { 62 | bool success = hashmap.set_nodrain(&i, &i); 63 | insert_success_map[i] = success; 64 | 65 | if (success) num_success++; 66 | } 67 | 68 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 69 | 70 | for (size_t i = 1; i <= num_keys; i++) { 71 | size_t v; 72 | bool success = hashmap.get(&i, &v); 73 | assert(success == insert_success_map[i]); 74 | if (success) assert(v == i); 75 | } 76 | } 77 | 78 | TEST(Basic, Large) { 79 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 80 | (1ull << 30), 0.2); 81 | 82 | size_t num_keys = 32; 83 | std::map insert_success_map; 84 | size_t num_success = 0; 85 | 86 | for (size_t i = 1; i <= num_keys; i++) { 87 | bool success = hashmap.set_nodrain(&i, &i); 88 | insert_success_map[i] = success; 89 | 90 | if (success) num_success++; 91 | } 92 | 93 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 94 | 95 | for (size_t i = 1; i <= num_keys; i++) { 96 | size_t v; 97 | bool success = hashmap.get(&i, &v); 98 | assert(success == insert_success_map[i]); 99 | if (success) assert(v == i); 100 | } 101 | } 102 | 103 | int main(int argc, char** argv) { 104 | testing::InitGoogleTest(&argc, argv); 105 | return RUN_ALL_TESTS(); 106 | } 107 | -------------------------------------------------------------------------------- /microbench/seq_write_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_write_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr bool kMeasurePercentiles = true; 7 | 8 | // Update the source data for every write. Not doing so decreases latency. 
9 | static constexpr bool kChangeWriteSource = false; 10 | 11 | static constexpr size_t kWriteBytes = MB(64); 12 | static constexpr size_t kMinIters = 50000; 13 | static constexpr size_t kMinWriteSz = 64; 14 | static constexpr size_t kMaxWriteSz = KB(64); 15 | 16 | size_t file_offset = 0; 17 | 18 | static_assert(kWriteBytes / kMinWriteSz >= kMinIters, ""); 19 | std::vector latency_vec; 20 | latency_vec.reserve(kWriteBytes / kMinWriteSz); 21 | 22 | size_t *data = reinterpret_cast(memalign(4096, kMaxWriteSz)); 23 | memset(data, 31, kMaxWriteSz); 24 | 25 | for (size_t msr = 0; msr < 100; msr++) { 26 | printf("size avg_ns 50_ns 999_ns\n"); 27 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 28 | 29 | for (size_t wr_size = kMinWriteSz; wr_size <= kMaxWriteSz; wr_size *= 2) { 30 | struct timespec start_time; 31 | clock_gettime(CLOCK_REALTIME, &start_time); 32 | 33 | latency_vec.clear(); 34 | file_offset = roundup<256>(file_offset); 35 | const size_t num_iters = kWriteBytes / wr_size <= kMinIters 36 | ? 
kMinIters 37 | : kWriteBytes / wr_size; 38 | 39 | for (size_t i = 0; i < num_iters; i++) { 40 | if (kChangeWriteSource) { 41 | for (size_t cl = 0; cl < wr_size / 64; cl++) data[cl * 8]++; 42 | } 43 | 44 | size_t start_tsc; 45 | if (kMeasurePercentiles) start_tsc = timer::Start(); 46 | pmem_memmove_persist(&pbuf[file_offset], data, wr_size); 47 | 48 | if (kMeasurePercentiles) { 49 | latency_vec.push_back(timer::Stop() - start_tsc); 50 | } 51 | 52 | file_offset += wr_size; 53 | if (file_offset + wr_size >= kPmemFileSize) file_offset = 0; 54 | } 55 | 56 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 57 | 58 | if (kMeasurePercentiles) { 59 | std::sort(latency_vec.begin(), latency_vec.end()); 60 | printf("%zu %zu %.1f %.1f\n", wr_size, ns_avg_realtime, 61 | latency_vec.at(num_iters * .50) / freq_ghz, 62 | latency_vec.at(num_iters * .999) / freq_ghz); 63 | 64 | size_t ns_avg_rdtsc = 65 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 66 | (latency_vec.size() * freq_ghz); 67 | verify_tsc_str << wr_size << ": Avg latency (ns) " << ns_avg_realtime 68 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 69 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta). 
offst " 70 | << file_offset << "\n"; 71 | } else { 72 | printf("%zu %zu -1.0 -1.0\n", wr_size, ns_avg_realtime); 73 | } 74 | } 75 | 76 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /hopscotch_pmem/test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "phopscotch.h" 5 | 6 | static constexpr size_t kDefaultFileOffset = 1024; 7 | static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 8 | 9 | TEST(Basic, Simple) { 10 | size_t num_keys = 32; 11 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 12 | num_keys); 13 | 14 | size_t key, value; 15 | 16 | key = 1; 17 | value = 1; 18 | bool success = hashmap.set_nodrain(&key, &value); 19 | assert(success); 20 | 21 | key = 2; 22 | value = 2; 23 | success = hashmap.set_nodrain(&key, &value); 24 | assert(success); 25 | 26 | success = hashmap.set_nodrain(&key, &value); 27 | assert(success); 28 | 29 | key = 3; 30 | value = 3; 31 | success = hashmap.set_nodrain(&key, &value); 32 | assert(success); 33 | 34 | key = 1; 35 | value = 0; 36 | success = hashmap.get(&key, &value); 37 | assert(value == 1); 38 | assert(success); 39 | 40 | key = 2; 41 | value = 0; 42 | success = hashmap.get(&key, &value); 43 | assert(value == 2); 44 | assert(success); 45 | 46 | key = 4; 47 | value = 0; 48 | success = hashmap.get(&key, &value); 49 | assert(value == 0); 50 | assert(!success); 51 | } 52 | 53 | TEST(Basic, Overload) { 54 | size_t num_keys = 1 * 1024 * 1024; 55 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 56 | num_keys); 57 | 58 | size_t max_key_inserted = 0; 59 | for (size_t i = 1; i <= num_keys; i++) { 60 | bool success = hashmap.set_nodrain(&i, &i); 61 | if (!success) { 62 | size_t hash = hashmap.get_hash(&i); 63 | printf("Failed for key %zu, bucket %zuu\n", i, 64 | hash % hashmap.num_buckets); 65 | 
break; 66 | } 67 | 68 | max_key_inserted = i; 69 | } 70 | 71 | printf("Loaded fraction = %.2f\n", max_key_inserted * 1.0 / num_keys); 72 | hashmap.print_stats(); 73 | 74 | for (size_t i = 1; i <= num_keys; i++) { 75 | size_t v; 76 | bool success = hashmap.get(&i, &v); 77 | assert(success == (i <= max_key_inserted)); 78 | if (success) assert(v == i); 79 | } 80 | } 81 | 82 | TEST(Basic, Large) { 83 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 84 | (1ull << 30)); 85 | 86 | size_t num_keys = 32; 87 | std::map insert_success_map; 88 | size_t num_success = 0; 89 | 90 | for (size_t i = 1; i <= num_keys; i++) { 91 | bool success = hashmap.set_nodrain(&i, &i); 92 | insert_success_map[i] = success; 93 | 94 | if (success) num_success++; 95 | } 96 | 97 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 98 | 99 | for (size_t i = 1; i <= num_keys; i++) { 100 | size_t v; 101 | bool success = hashmap.get(&i, &v); 102 | assert(success == insert_success_map[i]); 103 | if (success) assert(v == i); 104 | } 105 | } 106 | 107 | int main(int argc, char **argv) { 108 | testing::InitGoogleTest(&argc, argv); 109 | return RUN_ALL_TESTS(); 110 | } 111 | -------------------------------------------------------------------------------- /scripts/ipmctl_watch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # README: 4 | # 5 | # This script monitors write amplification for DIMM 0 using ipmctl 6 | # 7 | # MediaWrites = Number of 64-byte writes to NVM. The NVM controller issues 8 | # 256-byte writes internally, but ipmctl multiplies it by four 9 | # 10 | # WriteRequests = Number of 64-byte write requests received on the DDR bus 11 | # 12 | # Notes: 13 | # * This script uses `printf` to convert hex to decimal, and `xargs` to 14 | # trim surrounding whitespaces. 15 | # * During idle period, the NVM controller may write to NVM even when no DDR 16 | # commands are received. 
This causes write amplification to be ~100. 17 | # 18 | 19 | dimms=0x0001 # Single non-interleaved 20 | dimms=0x0001,0x0011,0x0021,0x0101,0x0111,0x0121 # All DIMMs at socket 0 21 | 22 | # Sum metric arg #1 from file watch_out 23 | sum_from_watch_out() { 24 | temp_file=$(mktemp) 25 | cat watch_out | grep $1 | cut -d'=' -f 2 > $temp_file 26 | 27 | sum=0 28 | while read hex; do 29 | dec=`printf "%d\n" $hex` 30 | sum=`expr $sum + $dec` 31 | done < ${temp_file} 32 | 33 | echo $sum 34 | rm ${temp_file} 35 | } 36 | 37 | # Regenerate watch_out 38 | refresh_watch_out() { 39 | rm -f watch_out 40 | touch watch_out 41 | sudo ipmctl show -dimm $dimms -performance MediaWrites,WriteRequests,MediaReads,ReadRequests > watch_out 42 | } 43 | 44 | refresh_watch_out 45 | media_writes_0=`sum_from_watch_out MediaWrites` 46 | ddr_writes_0=`sum_from_watch_out WriteRequests` 47 | media_reads_0=`sum_from_watch_out MediaReads` 48 | ddr_reads_0=`sum_from_watch_out ReadRequests` 49 | 50 | sleep_seconds=1 51 | while true; do 52 | sleep $sleep_seconds 53 | 54 | refresh_watch_out 55 | media_writes_1=`sum_from_watch_out MediaWrites` 56 | ddr_writes_1=`sum_from_watch_out WriteRequests` 57 | media_reads_1=`sum_from_watch_out MediaReads` 58 | ddr_reads_1=`sum_from_watch_out ReadRequests` 59 | 60 | media_writes_delta=`calc $media_writes_1 - $media_writes_0 | xargs` 61 | ddr_writes_delta=`calc $ddr_writes_1 - $ddr_writes_0 | xargs` 62 | media_reads_delta=`calc $media_reads_1 - $media_reads_0 | xargs` 63 | ddr_reads_delta=`calc $ddr_reads_1 - $ddr_reads_0 | xargs` 64 | 65 | media_writes_GBs=`python -c "print $media_writes_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 66 | ddr_writes_GBs=`python -c "print $ddr_writes_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 67 | media_reads_GBs=`python -c "print $media_reads_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 68 | ddr_reads_GBs=`python -c "print $ddr_reads_delta * 64.0 / (1024 * 1024 * 1024 * 
$sleep_seconds)" | xargs` 69 | write_amp=`calc $media_writes_delta / $ddr_writes_delta | xargs` 70 | read_amp=`calc $media_reads_delta / $ddr_reads_delta | xargs` 71 | 72 | echo "Media writes = $media_writes_delta ($media_writes_GBs GB/s), DDR writes = $ddr_writes_delta ($ddr_writes_GBs GB/s), amplification = $write_amp" 73 | echo "Media reads = $media_reads_delta ($media_reads_GBs GB/s), DDR reads = $ddr_reads_delta ($ddr_reads_GBs GB/s), amplification = $read_amp" 74 | echo "" 75 | 76 | media_writes_0=$media_writes_1 77 | ddr_writes_0=$ddr_writes_1 78 | media_reads_0=$media_reads_1 79 | ddr_reads_0=$ddr_reads_1 80 | done 81 | -------------------------------------------------------------------------------- /log_store/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../common.h" 8 | #include "rotating_counter.h" 9 | 10 | static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 11 | static constexpr size_t kNumMeasurements = 2; 12 | static constexpr size_t kNumIters = 1000000; 13 | 14 | // Amount of data appended to the log in on iteration 15 | static constexpr size_t kMaxLogDataSize = 4096; 16 | 17 | void counter_only_bench(uint8_t *pbuf) { 18 | Counter ctr(pbuf, true /* create a new counter */); 19 | 20 | for (size_t msr = 0; msr < kNumMeasurements; msr++) { 21 | struct timespec bench_start; 22 | clock_gettime(CLOCK_REALTIME, &bench_start); 23 | 24 | for (size_t i = 0; i < kNumIters; i++) ctr.increment_naive(1); 25 | 26 | double bench_seconds = sec_since(bench_start); 27 | printf("Naive counter: %.2f M increments/s\n", 28 | kNumIters / (bench_seconds * 1000000)); 29 | } 30 | 31 | for (size_t msr = 0; msr < kNumMeasurements; msr++) { 32 | struct timespec bench_start; 33 | clock_gettime(CLOCK_REALTIME, &bench_start); 34 | 35 | for (size_t i = 0; i < kNumIters; i++) ctr.increment_rotate(1); 36 | 37 | double bench_seconds = 
sec_since(bench_start); 38 | printf("Rotating counter: %.2f M increments/s\n", 39 | kNumIters / (bench_seconds * 1000000)); 40 | } 41 | } 42 | 43 | class Log { 44 | public: 45 | // Assume pbuf is large enough to never overflow 46 | Log(uint8_t *pbuf) { 47 | ctr = Counter(pbuf, true /* create_new */); 48 | log_base_addr = pbuf + Counter::get_reqd_space(); 49 | } 50 | 51 | // Append with naive counter incrementing 52 | void append_naive(uint8_t *data, size_t data_size) { 53 | pmem_memcpy_persist(log_base_addr + ctr.v_value, data, data_size); 54 | ctr.increment_naive(data_size); 55 | } 56 | 57 | // Append with rotating counter incrementing 58 | void append_rotating(uint8_t *data, size_t data_size) { 59 | pmem_memcpy_persist(log_base_addr + ctr.v_value, data, data_size); 60 | ctr.increment_rotate(data_size); 61 | } 62 | 63 | Counter ctr; 64 | uint8_t *log_base_addr = nullptr; // Starting address of log contents on pmem 65 | }; 66 | 67 | void log_bench(uint8_t *pbuf) { 68 | uint8_t source[kMaxLogDataSize] = {0}; 69 | 70 | printf("write_bytes naive_GBps rotating_GBps\n"); 71 | 72 | // Sweep over write sizes 73 | for (size_t write_sz = 64; write_sz <= kMaxLogDataSize; write_sz *= 2) { 74 | double naive_GBps, rotating_GBps; 75 | 76 | { 77 | // Naive log 78 | Log log(pbuf); 79 | struct timespec bench_start; 80 | clock_gettime(CLOCK_REALTIME, &bench_start); 81 | 82 | for (size_t i = 0; i < kNumIters; i++) { 83 | // Modify the source 84 | for (size_t j = 0; j < write_sz / 64; j += 64) source[j]++; 85 | log.append_naive(source, write_sz); 86 | } 87 | 88 | double bench_seconds = sec_since(bench_start); 89 | naive_GBps = kNumIters * write_sz / (bench_seconds * GB(1)); 90 | } 91 | 92 | { 93 | // Rotating log 94 | Log log(pbuf); 95 | struct timespec bench_start; 96 | clock_gettime(CLOCK_REALTIME, &bench_start); 97 | 98 | for (size_t i = 0; i < kNumIters; i++) { 99 | // Modify the source 100 | for (size_t j = 0; j < write_sz / 64; j += 64) source[j]++; 101 | 
log.append_rotating(source, write_sz); 102 | } 103 | 104 | double bench_seconds = sec_since(bench_start); 105 | rotating_GBps = kNumIters * write_sz / (bench_seconds * GB(1)); 106 | } 107 | 108 | printf("%zu %.2f %.2f\n", write_sz, naive_GBps, rotating_GBps); 109 | } 110 | } 111 | 112 | int main() { 113 | size_t mapped_len; 114 | int is_pmem; 115 | uint8_t *pbuf = reinterpret_cast( 116 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 117 | 118 | assert(pbuf != nullptr); 119 | assert(mapped_len >= Counter::get_reqd_space()); 120 | 121 | counter_only_bench(pbuf); 122 | for (size_t msr = 0; msr < kNumMeasurements; msr++) log_bench(pbuf); 123 | 124 | pmem_unmap(pbuf, mapped_len); 125 | exit(0); 126 | } 127 | -------------------------------------------------------------------------------- /utils/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | // High-resolution (~10 ns) timestamps, using fences to prevent reordering and 18 | // ensure exactly the desired regions are measured. 19 | 20 | #include 21 | 22 | namespace timer { 23 | 24 | // Start/Stop return absolute timestamps and must be placed immediately before 25 | // and after the region to measure. We provide separate Start/Stop functions 26 | // because they use different fences. 
27 | // 28 | // Background: RDTSC is not 'serializing'; earlier instructions may complete 29 | // after it, and/or later instructions may complete before it. 'Fences' ensure 30 | // regions' elapsed times are independent of such reordering. The only 31 | // documented unprivileged serializing instruction is CPUID, which acts as a 32 | // full fence (no reordering across it in either direction). Unfortunately 33 | // the latency of CPUID varies wildly (perhaps made worse by not initializing 34 | // its EAX input). Because it cannot reliably be deducted from the region's 35 | // elapsed time, it must not be included in the region to measure (i.e. 36 | // between the two RDTSC). 37 | // 38 | // The newer RDTSCP is sometimes described as serializing, but it actually 39 | // only serves as a half-fence with release semantics. Although all 40 | // instructions in the region will complete before the final timestamp is 41 | // captured, subsequent instructions may leak into the region and increase the 42 | // elapsed time. Inserting another fence after the final RDTSCP would prevent 43 | // such reordering without affecting the measured region. 44 | // 45 | // Fortunately, such a fence exists. The LFENCE instruction is only documented 46 | // to delay later loads until earlier loads are visible. However, Intel's 47 | // reference manual says it acts as a full fence (waiting until all earlier 48 | // instructions have completed, and delaying later instructions until it 49 | // completes). AMD assigns the same behavior to MFENCE. 50 | // 51 | // We need a fence before the initial RDTSC to prevent earlier instructions 52 | // from leaking into the region, and arguably another after RDTSC to avoid 53 | // region instructions from completing before the timestamp is recorded. 
namespace timer {

// Returns a 64-bit timestamp in units of TSC 'ticks'; to convert to seconds,
// divide by the invariant TSC frequency. Place immediately BEFORE the region
// to measure. The LFENCE before RDTSC keeps earlier instructions out of the
// measured region; the LFENCE after keeps region instructions from starting
// before the timestamp is captured.
inline uint64_t Start() {
  uint64_t t;
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
  return t;
}

// Returns a 64-bit timestamp; place immediately AFTER the region to measure.
// RDTSCP waits until all earlier instructions have completed; the trailing
// LFENCE stops later instructions from leaking back into the region.
//
// Fix: marked inline. This function is defined in a header; without inline
// it has one definition per including translation unit, causing
// multiple-definition errors at link time (Start() was already inline).
inline uint64_t Stop() {
  uint64_t t;
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
  return t;
}

}  // namespace timer
26 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 27 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 28 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 29 | # OTHER DEALINGS IN THE SOFTWARE. 30 | # 31 | # For more information, please refer to 32 | 33 | import os 34 | import ycm_core 35 | 36 | from os.path import expanduser 37 | home_dir = expanduser("~") 38 | 39 | flags = [ 40 | '-x', 41 | 'c++', 42 | '-I' + str(home_dir) + '/rdma_bench', 43 | '-Wall', 44 | '-Werror', 45 | '-Wextra', 46 | '-Wno-inline', 47 | '-Wno-unused-function', 48 | '-Wold-style-cast', 49 | '-Wsign-conversion', 50 | '-std=c++11', 51 | ] 52 | 53 | 54 | # Set this to the absolute path to the folder (NOT the file!) containing the 55 | # compile_commands.json file to use that instead of 'flags'. See here for 56 | # more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html 57 | # 58 | # You can get CMake to generate this file for you by adding: 59 | # set( CMAKE_EXPORT_COMPILE_COMMANDS 1 ) 60 | # to your CMakeLists.txt file. 61 | # 62 | # Most projects will NOT need to set this to anything; you can just change the 63 | # 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
64 | compilation_database_folder = '' 65 | 66 | if os.path.exists( compilation_database_folder ): 67 | database = ycm_core.CompilationDatabase( compilation_database_folder ) 68 | else: 69 | database = None 70 | 71 | SOURCE_EXTENSIONS = [ '.C', '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ] 72 | 73 | def DirectoryOfThisScript(): 74 | return os.path.dirname( os.path.abspath( __file__ ) ) 75 | 76 | 77 | def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): 78 | if not working_directory: 79 | return list( flags ) 80 | new_flags = [] 81 | make_next_absolute = False 82 | path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] 83 | for flag in flags: 84 | new_flag = flag 85 | 86 | if make_next_absolute: 87 | make_next_absolute = False 88 | if not flag.startswith( '/' ): 89 | new_flag = os.path.join( working_directory, flag ) 90 | 91 | for path_flag in path_flags: 92 | if flag == path_flag: 93 | make_next_absolute = True 94 | break 95 | 96 | if flag.startswith( path_flag ): 97 | path = flag[ len( path_flag ): ] 98 | new_flag = path_flag + os.path.join( working_directory, path ) 99 | break 100 | 101 | if new_flag: 102 | new_flags.append( new_flag ) 103 | return new_flags 104 | 105 | 106 | def IsHeaderFile( filename ): 107 | extension = os.path.splitext( filename )[ 1 ] 108 | return extension in [ '.H', '.h', '.hxx', '.hpp', '.hh' ] 109 | 110 | 111 | def GetCompilationInfoForFile( filename ): 112 | # The compilation_commands.json file generated by CMake does not have entries 113 | # for header files. So we do our best by asking the db for flags for a 114 | # corresponding source file, if any. If one exists, the flags for that file 115 | # should be good enough. 
116 | if IsHeaderFile( filename ): 117 | basename = os.path.splitext( filename )[ 0 ] 118 | for extension in SOURCE_EXTENSIONS: 119 | replacement_file = basename + extension 120 | if os.path.exists( replacement_file ): 121 | compilation_info = database.GetCompilationInfoForFile( 122 | replacement_file ) 123 | if compilation_info.compiler_flags_: 124 | return compilation_info 125 | return None 126 | return database.GetCompilationInfoForFile( filename ) 127 | 128 | 129 | def FlagsForFile( filename, **kwargs ): 130 | if database: 131 | # Bear in mind that compilation_info.compiler_flags_ does NOT return a 132 | # python list, but a "list-like" StringVec object 133 | compilation_info = GetCompilationInfoForFile( filename ) 134 | if not compilation_info: 135 | return None 136 | 137 | final_flags = MakeRelativePathsInFlagsAbsolute( 138 | compilation_info.compiler_flags_, 139 | compilation_info.compiler_working_dir_ ) 140 | 141 | else: 142 | relative_to = DirectoryOfThisScript() 143 | final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) 144 | 145 | return { 146 | 'flags': final_flags, 147 | 'do_cache': True 148 | } 149 | 150 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/latency.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file latency.h 3 | * @author MICA authors, akalia 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | /* 15 | * @brief Fast but approximate latency distribution measurement for latency 16 | * values up to 4000 microseconds (i.e., 4 ms). Adding a latency sample is 17 | * fast, but computing a statistic is slow. 
18 | */ 19 | class Latency { 20 | public: 21 | Latency() { reset(); } 22 | 23 | void reset() { memset(this, 0, sizeof(Latency)); } 24 | 25 | /// Add a latency sample 26 | void update(size_t us) { 27 | if (us < 128) 28 | bin0_[us]++; 29 | else if (us < 384) 30 | bin1_[(us - 128) / 2]++; 31 | else if (us < 896) 32 | bin2_[(us - 384) / 4]++; 33 | else if (us < 1920) 34 | bin3_[(us - 896) / 8]++; 35 | else if (us < 3968) 36 | bin4_[(us - 1920) / 16]++; 37 | else 38 | bin5_++; 39 | } 40 | 41 | /// Combine two distributions 42 | Latency& operator+=(const Latency& o) { 43 | size_t i; 44 | for (i = 0; i < 128; i++) bin0_[i] += o.bin0_[i]; 45 | for (i = 0; i < 128; i++) bin1_[i] += o.bin1_[i]; 46 | for (i = 0; i < 128; i++) bin2_[i] += o.bin2_[i]; 47 | for (i = 0; i < 128; i++) bin3_[i] += o.bin3_[i]; 48 | for (i = 0; i < 128; i++) bin4_[i] += o.bin4_[i]; 49 | bin5_ += o.bin5_; 50 | return *this; 51 | } 52 | 53 | /// Return the total number of samples 54 | size_t count() const { 55 | size_t count = 0; 56 | size_t i; 57 | for (i = 0; i < 128; i++) count += bin0_[i]; 58 | for (i = 0; i < 128; i++) count += bin1_[i]; 59 | for (i = 0; i < 128; i++) count += bin2_[i]; 60 | for (i = 0; i < 128; i++) count += bin3_[i]; 61 | for (i = 0; i < 128; i++) count += bin4_[i]; 62 | count += bin5_; 63 | return count; 64 | } 65 | 66 | /// Return the (approximate) sum of all samples 67 | size_t sum() const { 68 | size_t sum = 0; 69 | size_t i; 70 | for (i = 0; i < 128; i++) sum += bin0_[i] * (0 + i * 1); 71 | for (i = 0; i < 128; i++) sum += bin1_[i] * (128 + i * 2); 72 | for (i = 0; i < 128; i++) sum += bin2_[i] * (384 + i * 4); 73 | for (i = 0; i < 128; i++) sum += bin3_[i] * (896 + i * 8); 74 | for (i = 0; i < 128; i++) sum += bin4_[i] * (1920 + i * 16); 75 | sum += bin5_ * 3968; 76 | return sum; 77 | } 78 | 79 | /// Return the (approximate) average sample 80 | double avg() const { 81 | return static_cast(sum()) / 82 | static_cast(std::max(size_t(1), count())); 83 | } 84 | 85 | /// Return 
the (approximate) minimum sample 86 | size_t min() const { 87 | size_t i; 88 | for (i = 0; i < 128; i++) 89 | if (bin0_[i] != 0) return 0 + i * 1; 90 | for (i = 0; i < 128; i++) 91 | if (bin1_[i] != 0) return 128 + i * 2; 92 | for (i = 0; i < 128; i++) 93 | if (bin2_[i] != 0) return 384 + i * 4; 94 | for (i = 0; i < 128; i++) 95 | if (bin3_[i] != 0) return 896 + i * 8; 96 | for (i = 0; i < 128; i++) 97 | if (bin4_[i] != 0) return 1920 + i * 16; 98 | // if (bin5_ != 0) return 3968; 99 | return 3968; 100 | } 101 | 102 | /// Return the (approximate) max sample 103 | size_t max() const { 104 | int64_t i; 105 | if (bin5_ != 0) return 3968; 106 | for (i = 127; i >= 0; i--) 107 | if (bin4_[i] != 0) return 1920 + static_cast(i) * 16; 108 | for (i = 127; i >= 0; i--) 109 | if (bin3_[i] != 0) return 896 + static_cast(i) * 8; 110 | for (i = 127; i >= 0; i--) 111 | if (bin2_[i] != 0) return 384 + static_cast(i) * 4; 112 | for (i = 127; i >= 0; i--) 113 | if (bin1_[i] != 0) return 128 + static_cast(i) * 2; 114 | for (i = 127; i >= 0; i--) 115 | if (bin0_[i] != 0) return 0 + static_cast(i) * 1; 116 | return 0; 117 | } 118 | 119 | /// Return the (approximate) p-th percentile sample 120 | size_t perc(double p) const { 121 | size_t i; 122 | int64_t thres = static_cast(p * static_cast(count())); 123 | for (i = 0; i < 128; i++) 124 | if ((thres -= static_cast(bin0_[i])) < 0) return 0 + i * 1; 125 | for (i = 0; i < 128; i++) 126 | if ((thres -= static_cast(bin1_[i])) < 0) return 128 + i * 2; 127 | for (i = 0; i < 128; i++) 128 | if ((thres -= static_cast(bin2_[i])) < 0) return 384 + i * 4; 129 | for (i = 0; i < 128; i++) 130 | if ((thres -= static_cast(bin3_[i])) < 0) return 896 + i * 8; 131 | for (i = 0; i < 128; i++) 132 | if ((thres -= static_cast(bin4_[i])) < 0) return 1920 + i * 16; 133 | return 3968; 134 | } 135 | 136 | /// Print the distribution to a file 137 | void print(FILE* fp) const { 138 | size_t i; 139 | for (i = 0; i < 128; i++) 140 | if (bin0_[i] != 0) 141 | 
fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 0 + i * 1, bin0_[i]); 142 | for (i = 0; i < 128; i++) 143 | if (bin1_[i] != 0) 144 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 128 + i * 2, bin1_[i]); 145 | for (i = 0; i < 128; i++) 146 | if (bin2_[i] != 0) 147 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 384 + i * 4, bin2_[i]); 148 | for (i = 0; i < 128; i++) 149 | if (bin3_[i] != 0) 150 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 896 + i * 8, bin3_[i]); 151 | for (i = 0; i < 128; i++) 152 | if (bin4_[i] != 0) 153 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 1920 + i * 16, bin4_[i]); 154 | if (bin5_ != 0) fprintf(fp, "%4d %6" PRIu64 "\n", 3968, bin5_); 155 | } 156 | 157 | private: 158 | // [0, 128) us 159 | size_t bin0_[128]; 160 | // [128, 384) us 161 | size_t bin1_[128]; 162 | // [384, 896) us 163 | size_t bin2_[128]; 164 | // [896, 1920) us 165 | size_t bin3_[128]; 166 | // [1920, 3968) us 167 | size_t bin4_[128]; 168 | // [3968, inf) us 169 | size_t bin5_; 170 | }; 171 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "../libhrd_cpp/hrd.h" 14 | 15 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 16 | DEFINE_uint64(machine_id, 0, "Index among client machines (for clients)"); 17 | DEFINE_uint64(min_write_size, 0, "Client's min RDMA write size"); 18 | DEFINE_uint64(max_write_size, 0, "Client's max RDMA write size"); 19 | DEFINE_uint64(window_size, 0, "Number of writes outstanding at client"); 20 | 21 | static constexpr size_t kPmemFileSize = GB(4); 22 | 23 | // If true, server zeroes out its buffer and reports write throughput 24 | static constexpr bool kZeroServerBuf = true; 25 | 26 | // If true, we use a devdax-mapped buffer. 
If false, we use DRAM hugepages. 27 | static constexpr bool kUsePmem = true; 28 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 29 | 30 | // If true, we use read-after-write to force persistence 31 | static constexpr bool kReadAfterWrite = true; 32 | 33 | static constexpr bool kVerbose = false; 34 | 35 | // Map the devdax buffer at the server 36 | uint8_t* get_pmem_buf_server() { 37 | int fd = open(kPmemFile, O_RDWR); 38 | rt_assert(fd >= 0, "devdax open failed"); 39 | 40 | void* buf = 41 | mmap(nullptr, kPmemFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 42 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 43 | rt_assert(reinterpret_cast(buf) % 256 == 0); 44 | 45 | return reinterpret_cast(buf); 46 | } 47 | 48 | void server_func() { 49 | uint8_t* pmem_buf = nullptr; 50 | if (kUsePmem) { 51 | pmem_buf = get_pmem_buf_server(); 52 | 53 | // Fill in the persistent buffer, also sanity-check local write throughput 54 | if (kZeroServerBuf) { 55 | printf("main: Zero-ing pmem buffer\n"); 56 | struct timespec start; 57 | clock_gettime(CLOCK_REALTIME, &start); 58 | pmem_memset_persist(pmem_buf, 0, kPmemFileSize); 59 | printf("main: Zero-ed %f MB of pmem at %.1f GB/s\n", 60 | kPmemFileSize * 1.0 / MB(1), 61 | kPmemFileSize / (1000000000.0 * sec_since(start))); 62 | } 63 | } 64 | 65 | struct hrd_conn_config_t conn_config; 66 | conn_config.num_qps = 1; 67 | conn_config.use_uc = false; 68 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 69 | conn_config.buf_size = kPmemFileSize; 70 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 71 | 72 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 73 | &conn_config, nullptr /* dgram config */); 74 | 75 | // Publish server QP 76 | auto srv_qp_name = std::string("server"); 77 | hrd_publish_conn_qp(cb, 0, srv_qp_name.c_str()); 78 | 79 | printf("main: Server published. 
Waiting for client\n"); 80 | 81 | auto conn_name = std::string("client"); 82 | hrd_qp_attr_t* conn_qp = nullptr; 83 | while (conn_qp == nullptr) { 84 | conn_qp = hrd_get_published_qp(conn_name.c_str()); 85 | if (conn_qp == nullptr) { 86 | usleep(200000); 87 | continue; 88 | } 89 | 90 | printf("main: Server found client! Connecting..\n"); 91 | hrd_connect_qp(cb, 0, conn_qp); 92 | } 93 | 94 | hrd_publish_ready("server"); 95 | printf("main: Server ready. Going to sleep.\n"); 96 | 97 | while (true) sleep(1); 98 | } 99 | 100 | void client_func() { 101 | hrd_conn_config_t conn_config; 102 | 103 | conn_config.num_qps = 1; 104 | conn_config.use_uc = false; 105 | conn_config.prealloc_buf = nullptr; 106 | conn_config.buf_size = FLAGS_max_write_size; 107 | conn_config.buf_shm_key = 3185; 108 | 109 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 110 | &conn_config, nullptr /* dgram config */); 111 | memset(const_cast(cb->conn_buf), 31, FLAGS_max_write_size); 112 | 113 | hrd_publish_conn_qp(cb, 0, "client"); 114 | printf("main: Client published. Waiting for server.\n"); 115 | 116 | hrd_qp_attr_t* srv_qp = nullptr; 117 | while (srv_qp == nullptr) { 118 | srv_qp = hrd_get_published_qp("server"); 119 | if (srv_qp == nullptr) usleep(2000); 120 | } 121 | 122 | printf("main: Found server. 
Connecting..\n"); 123 | hrd_connect_qp(cb, 0, srv_qp); 124 | printf("main: Client connected!\n"); 125 | 126 | hrd_wait_till_ready("server"); 127 | 128 | struct timespec start; 129 | size_t total_bytes_written = 0; 130 | size_t pending_ops = 0; 131 | size_t remote_offset = 0; 132 | size_t cur_write_size = FLAGS_min_write_size; 133 | 134 | clock_gettime(CLOCK_REALTIME, &start); 135 | 136 | while (true) { 137 | if (pending_ops < FLAGS_window_size) { 138 | struct ibv_send_wr write_wr, read_wr, *bad_send_wr; 139 | struct ibv_sge write_sge, read_sge; 140 | 141 | // RDMA-write kClientWriteSize bytes 142 | write_sge.addr = reinterpret_cast(&cb->conn_buf[0]); 143 | write_sge.length = cur_write_size; 144 | write_sge.lkey = cb->conn_buf_mr->lkey; 145 | 146 | write_wr.opcode = IBV_WR_RDMA_WRITE; 147 | write_wr.num_sge = 1; 148 | write_wr.sg_list = &write_sge; 149 | write_wr.send_flags = kReadAfterWrite ? 0 : IBV_SEND_SIGNALED; 150 | 151 | if (remote_offset + cur_write_size > kPmemFileSize) remote_offset = 0; 152 | write_wr.wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 153 | write_wr.wr.rdma.rkey = srv_qp->rkey; 154 | write_wr.next = kReadAfterWrite ? &read_wr : nullptr; 155 | 156 | remote_offset += cur_write_size; 157 | 158 | if (kReadAfterWrite) { 159 | // RDMA-read 8 bytes from the end of the written buffer 160 | read_sge.addr = reinterpret_cast(&cb->conn_buf[0]); 161 | read_sge.length = sizeof(size_t); 162 | read_sge.lkey = cb->conn_buf_mr->lkey; 163 | 164 | read_wr.opcode = IBV_WR_RDMA_READ; 165 | read_wr.num_sge = 1; 166 | read_wr.sg_list = &read_sge; 167 | read_wr.send_flags = IBV_SEND_SIGNALED; 168 | read_wr.wr.rdma.remote_addr = 169 | write_wr.wr.rdma.remote_addr + cur_write_size - sizeof(size_t); 170 | read_wr.wr.rdma.rkey = srv_qp->rkey; 171 | read_wr.next = nullptr; 172 | } 173 | 174 | int ret = ibv_post_send(cb->conn_qp[0], &write_wr, &bad_send_wr); 175 | rt_assert(ret == 0); 176 | pending_ops++; 177 | 178 | if (kVerbose) printf("Client posted. 
Pending = %zu\n", pending_ops); 179 | } 180 | 181 | if (pending_ops == FLAGS_window_size) { 182 | struct ibv_wc wc; 183 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); 184 | pending_ops--; 185 | 186 | if (kVerbose) printf("Client polled. Pending = %zu\n", pending_ops); 187 | total_bytes_written += cur_write_size; 188 | } 189 | 190 | if (total_bytes_written >= GB(4)) { 191 | double secs = sec_since(start); 192 | 193 | printf("Client: size %zu, %.2f Gbps.\n", cur_write_size, 194 | total_bytes_written * 8 / (1000000000 * secs)); 195 | 196 | cur_write_size *= 2; 197 | printf("doubling to %zu\n", cur_write_size); 198 | if (cur_write_size > FLAGS_max_write_size) { 199 | cur_write_size = FLAGS_min_write_size; 200 | printf("back to %zu\n", cur_write_size); 201 | } 202 | 203 | total_bytes_written = 0; 204 | clock_gettime(CLOCK_REALTIME, &start); 205 | } 206 | } 207 | } 208 | 209 | int main(int argc, char* argv[]) { 210 | gflags::ParseCommandLineFlags(&argc, &argv, true); 211 | if (FLAGS_is_client == 1) { 212 | auto client_thread = std::thread(client_func); 213 | client_thread.join(); 214 | } else { 215 | auto t = std::thread(server_func); 216 | t.join(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /rdma/libhrd_cpp/hrd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../../common.h" 24 | 25 | static constexpr size_t kRoCE = false; ///< Use RoCE 26 | 27 | // Maximum inline data so that WQEs fit in two cache lines (max_sge = 1): 28 | // * mlx4, RC: 88 29 | // * mlx4, UD: 60 30 | static constexpr size_t kHrdMaxInline = 88; 31 | static constexpr size_t kHrdSQDepth = 128; ///< Depth of all 
SEND queues 32 | static constexpr size_t kHrdRQDepth = 2048; ///< Depth of all RECV queues 33 | 34 | static constexpr uint32_t kHrdInvalidNUMANode = 9; 35 | static constexpr uint32_t kHrdDefaultPSN = 3185; 36 | static constexpr uint32_t kHrdDefaultQKey = 0x11111111; 37 | static constexpr size_t kHrdMaxLID = 256; 38 | static constexpr size_t kHrdMaxUDQPs = 256; ///< Maximum number of UD QPs 39 | 40 | static constexpr size_t kHrdQPNameSize = 200; 41 | 42 | // This needs to be a macro because we don't have Mellanox OFED for Debian 43 | #define kHrdMlx5Atomics false 44 | #define kHrdReservedNamePrefix "__HRD_RESERVED_NAME_PREFIX" 45 | 46 | /// Registry info about a QP 47 | struct hrd_qp_attr_t { 48 | char name[kHrdQPNameSize]; 49 | uint16_t lid; 50 | uint32_t qpn; 51 | union ibv_gid gid; ///< GID, used for only RoCE 52 | 53 | // Info about the RDMA buffer associated with this QP 54 | uintptr_t buf_addr; 55 | uint32_t buf_size; 56 | uint32_t rkey; 57 | }; 58 | 59 | struct hrd_conn_config_t { 60 | // Required params 61 | size_t num_qps = 0; // num_qps > 0 is used as a validity check 62 | bool use_uc; 63 | volatile uint8_t* prealloc_buf; 64 | size_t buf_size; 65 | int buf_shm_key; 66 | 67 | // Optional params with their default values 68 | size_t sq_depth = kHrdSQDepth; 69 | size_t max_rd_atomic = 16; 70 | 71 | std::string to_string() { 72 | std::ostringstream ret; 73 | ret << "[num_qps " << std::to_string(num_qps) << ", use_uc " 74 | << std::to_string(use_uc) << ", buf size " << std::to_string(buf_size) 75 | << ", shm key " << std::to_string(buf_shm_key) << ", sq_depth " 76 | << std::to_string(sq_depth) << ", max_rd_atomic " 77 | << std::to_string(max_rd_atomic) << "]"; 78 | return ret.str(); 79 | } 80 | }; 81 | 82 | struct hrd_dgram_config_t { 83 | size_t num_qps; 84 | volatile uint8_t* prealloc_buf; 85 | size_t buf_size; 86 | int buf_shm_key; 87 | }; 88 | 89 | struct hrd_ctrl_blk_t { 90 | size_t local_hid; // Local ID on the machine this process runs on 91 | 92 | // 
Info about the device/port to use for this control block 93 | size_t port_index; // User-supplied. 0-based across all devices 94 | size_t numa_node; // NUMA node id 95 | 96 | /// InfiniBand info resolved from \p phy_port, must be filled by constructor. 97 | struct { 98 | int device_id; // Device index in list of verbs devices 99 | struct ibv_context* ib_ctx; // The verbs device context 100 | uint8_t dev_port_id; // 1-based port ID in device. 0 is invalid. 101 | uint16_t port_lid; // LID of phy_port. 0 is invalid. 102 | 103 | union ibv_gid gid; // GID, used only for RoCE 104 | } resolve; 105 | 106 | struct ibv_pd* pd; // A protection domain for this control block 107 | 108 | // Connected QPs 109 | hrd_conn_config_t conn_config; 110 | struct ibv_qp** conn_qp; 111 | struct ibv_cq** conn_cq; 112 | volatile uint8_t* conn_buf; // A buffer for RDMA over RC/UC QPs 113 | struct ibv_mr* conn_buf_mr; 114 | 115 | // Datagram QPs 116 | size_t num_dgram_qps; 117 | struct ibv_qp* dgram_qp[kHrdMaxUDQPs]; 118 | struct ibv_cq *dgram_send_cq[kHrdMaxUDQPs], *dgram_recv_cq[kHrdMaxUDQPs]; 119 | volatile uint8_t* dgram_buf; // A buffer for RECVs on dgram QPs 120 | size_t dgram_buf_size; 121 | int dgram_buf_shm_key; 122 | struct ibv_mr* dgram_buf_mr; 123 | 124 | uint8_t pad[64]; 125 | }; 126 | 127 | // Major initialzation functions 128 | hrd_ctrl_blk_t* hrd_ctrl_blk_init(size_t local_hid, size_t port_index, 129 | size_t numa_node, 130 | hrd_conn_config_t* conn_config, 131 | hrd_dgram_config_t* dgram_config); 132 | 133 | int hrd_ctrl_blk_destroy(hrd_ctrl_blk_t* cb); 134 | 135 | // Debug 136 | void hrd_ibv_devinfo(void); 137 | 138 | void hrd_resolve_port_index(hrd_ctrl_blk_t* cb, size_t port_index); 139 | void hrd_create_conn_qps(hrd_ctrl_blk_t* cb); 140 | void hrd_create_dgram_qps(hrd_ctrl_blk_t* cb); 141 | 142 | void hrd_connect_qp(hrd_ctrl_blk_t* cb, size_t conn_qp_idx, 143 | hrd_qp_attr_t* remote_qp_attr); 144 | 145 | // Post 1 RECV for this queue pair for this buffer. Low performance. 
146 | void hrd_post_dgram_recv(struct ibv_qp* qp, void* buf_addr, size_t len, 147 | uint32_t lkey); 148 | 149 | // Fill @wc with @num_comps comps from this @cq. Exit on error. 150 | static inline void hrd_poll_cq(struct ibv_cq* cq, int num_comps, 151 | struct ibv_wc* wc) { 152 | int comps = 0; 153 | while (comps < static_cast(num_comps)) { 154 | int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]); 155 | if (new_comps != 0) { 156 | // Ideally, we should check from comps -> new_comps - 1 157 | if (wc[comps].status != 0) { 158 | fprintf(stderr, "Bad wc status %d\n", wc[comps].status); 159 | exit(0); 160 | } 161 | 162 | comps += new_comps; 163 | } 164 | } 165 | } 166 | 167 | // Fill @wc with @num_comps comps from this @cq. Return -1 on error, else 0. 168 | static inline int hrd_poll_cq_ret(struct ibv_cq* cq, int num_comps, 169 | struct ibv_wc* wc) { 170 | int comps = 0; 171 | 172 | while (comps < num_comps) { 173 | int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]); 174 | if (new_comps != 0) { 175 | // Ideally, we should check from comps -> new_comps - 1 176 | if (wc[comps].status != 0) { 177 | fprintf(stderr, "Bad wc status %d\n", wc[comps].status); 178 | return -1; // Return an error so the caller can clean up 179 | } 180 | 181 | comps += new_comps; 182 | } 183 | } 184 | 185 | return 0; // Success 186 | } 187 | 188 | // Registry functions 189 | void hrd_publish(const char* key, void* value, size_t len); 190 | int hrd_get_published(const char* key, void** value); 191 | 192 | // Publish the nth connected queue pair from this cb with this name 193 | void hrd_publish_conn_qp(hrd_ctrl_blk_t* cb, size_t n, const char* qp_name); 194 | 195 | // Publish the nth datagram queue pair from this cb with this name 196 | void hrd_publish_dgram_qp(hrd_ctrl_blk_t* cb, size_t n, const char* qp_name); 197 | 198 | struct hrd_qp_attr_t* hrd_get_published_qp(const char* qp_name); 199 | 200 | void hrd_publish_ready(const char* qp_name); 201 | void 
hrd_wait_till_ready(const char* qp_name); 202 | 203 | void hrd_close_memcached(); 204 | 205 | // Utility functions 206 | static inline uint32_t hrd_fastrand(uint64_t* seed) { 207 | *seed = *seed * 1103515245 + 12345; 208 | return static_cast((*seed) >> 32); 209 | } 210 | 211 | static inline size_t hrd_get_cycles() { 212 | uint64_t rax; 213 | uint64_t rdx; 214 | asm volatile("rdtsc" : "=a"(rax), "=d"(rdx)); 215 | return static_cast((rdx << 32) | rax); 216 | } 217 | 218 | static inline int hrd_is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); } 219 | 220 | uint8_t* hrd_malloc_socket(int shm_key, size_t size, size_t socket_id); 221 | int hrd_free(int shm_key, void* shm_buf); 222 | void hrd_red_printf(const char* format, ...); 223 | void hrd_get_formatted_time(char* timebuf); 224 | void hrd_nano_sleep(size_t ns); 225 | char* hrd_getenv(const char* name); 226 | void hrd_bind_to_core(std::thread& thread, size_t n); 227 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file common.h 3 | * @brief Misc convenience functions and macros 4 | */ 5 | 6 | #pragma once 7 | #define likely(x) __builtin_expect(!!(x), 1) 8 | #define unlikely(x) __builtin_expect(!!(x), 0) 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define _unused(x) ((void)(x)) // Make production build happy 18 | 19 | #define KB(x) (static_cast(x) << 10) 20 | #define MB(x) (static_cast(x) << 20) 21 | #define GB(x) (static_cast(x) << 30) 22 | #define TB(x) (static_cast(x) << 40) 23 | 24 | static void memory_barrier() { asm volatile("" ::: "memory"); } 25 | static void lfence() { asm volatile("lfence" ::: "memory"); } 26 | static void sfence() { asm volatile("sfence" ::: "memory"); } 27 | static void mfence() { asm volatile("mfence" ::: "memory"); } 28 | 29 | #define pmem_clflushopt(addr) \ 30 | asm volatile(".byte 0x66; 
clflush %0" : "+m"(*(volatile char *)(addr))); 31 | 32 | #define pmem_clwb(addr) \ 33 | asm volatile(".byte 0x66; xsaveopt %0" : "+m"(*(volatile char *)(addr))); 34 | 35 | template 36 | static constexpr bool is_power_of_two(T x) { 37 | return x && ((x & T(x - 1)) == 0); 38 | } 39 | 40 | template 41 | static constexpr T roundup(T x) { 42 | static_assert(is_power_of_two(PowerOfTwoNumber), 43 | "PowerOfTwoNumber must be a power of 2"); 44 | return ((x) + T(PowerOfTwoNumber - 1)) & (~T(PowerOfTwoNumber - 1)); 45 | } 46 | 47 | class SlowRand { 48 | std::random_device rand_dev; // Non-pseudorandom seed for twister 49 | std::mt19937_64 mt; 50 | std::uniform_int_distribution dist; 51 | 52 | public: 53 | SlowRand() : mt(rand_dev()), dist(0, UINT64_MAX) {} 54 | 55 | inline uint64_t next_u64() { return dist(mt); } 56 | }; 57 | 58 | class FastRand { 59 | public: 60 | uint64_t seed; 61 | 62 | /// Create a FastRand using a seed from SlowRand 63 | FastRand() { 64 | SlowRand slow_rand; 65 | seed = slow_rand.next_u64(); 66 | } 67 | 68 | inline uint32_t next_u32() { 69 | seed = seed * 1103515245 + 12345; 70 | return static_cast(seed >> 32); 71 | } 72 | }; 73 | 74 | /// Check a condition at runtime. If the condition is false, throw exception. 75 | static inline void rt_assert(bool condition, std::string throw_str, char *s) { 76 | if (unlikely(!condition)) { 77 | throw std::runtime_error(throw_str + std::string(s)); 78 | } 79 | } 80 | 81 | /// Check a condition at runtime. If the condition is false, throw exception. 82 | static inline void rt_assert(bool condition, std::string throw_str) { 83 | if (unlikely(!condition)) throw std::runtime_error(throw_str); 84 | } 85 | 86 | /// Check a condition at runtime. If the condition is false, throw exception. 87 | /// This is faster than rt_assert(cond, str) as it avoids string construction. 
88 | static inline void rt_assert(bool condition) { 89 | if (unlikely(!condition)) throw std::runtime_error("Error"); 90 | } 91 | 92 | /// Return the TSC 93 | static inline size_t rdtsc() { 94 | uint64_t rax; 95 | uint64_t rdx; 96 | asm volatile("rdtsc" : "=a"(rax), "=d"(rdx)); 97 | return static_cast((rdx << 32) | rax); 98 | } 99 | 100 | static uint64_t rdtscp() { 101 | uint64_t rax; 102 | uint64_t rdx; 103 | uint32_t aux; 104 | asm volatile("rdtscp" : "=a"(rax), "=d"(rdx), "=c"(aux) : :); 105 | return (rdx << 32) | rax; 106 | } 107 | 108 | static void nano_sleep(size_t ns, double freq_ghz) { 109 | size_t start = rdtsc(); 110 | size_t end = start; 111 | size_t upp = static_cast(freq_ghz * ns); 112 | while (end - start < upp) end = rdtsc(); 113 | } 114 | 115 | static double measure_rdtsc_freq() { 116 | struct timespec start, end; 117 | clock_gettime(CLOCK_REALTIME, &start); 118 | uint64_t rdtsc_start = rdtsc(); 119 | 120 | // Do not change this loop! The hardcoded value below depends on this loop 121 | // and prevents it from being optimized out. 
122 | uint64_t sum = 5; 123 | for (uint64_t i = 0; i < 1000000; i++) { 124 | sum += i + (sum + i) * (i % sum); 125 | } 126 | rt_assert(sum == 13580802877818827968ull, "Error in RDTSC freq measurement"); 127 | 128 | clock_gettime(CLOCK_REALTIME, &end); 129 | uint64_t clock_ns = 130 | static_cast(end.tv_sec - start.tv_sec) * 1000000000 + 131 | static_cast(end.tv_nsec - start.tv_nsec); 132 | uint64_t rdtsc_cycles = rdtsc() - rdtsc_start; 133 | 134 | double _freq_ghz = rdtsc_cycles * 1.0 / clock_ns; 135 | rt_assert(_freq_ghz >= 0.5 && _freq_ghz <= 5.0, "Invalid RDTSC frequency"); 136 | 137 | return _freq_ghz; 138 | } 139 | 140 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to seconds 141 | static double to_sec(size_t cycles, double freq_ghz) { 142 | return (cycles / (freq_ghz * 1000000000)); 143 | } 144 | 145 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to msec 146 | static double to_msec(size_t cycles, double freq_ghz) { 147 | return (cycles / (freq_ghz * 1000000)); 148 | } 149 | 150 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to usec 151 | static double to_usec(size_t cycles, double freq_ghz) { 152 | return (cycles / (freq_ghz * 1000)); 153 | } 154 | 155 | static size_t ms_to_cycles(double ms, double freq_ghz) { 156 | return static_cast(ms * 1000 * 1000 * freq_ghz); 157 | } 158 | 159 | static size_t us_to_cycles(double us, double freq_ghz) { 160 | return static_cast(us * 1000 * freq_ghz); 161 | } 162 | 163 | static size_t ns_to_cycles(double ns, double freq_ghz) { 164 | return static_cast(ns * freq_ghz); 165 | } 166 | 167 | // Edit 168 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to nsec 169 | static double to_nsec(size_t cycles, double freq_ghz) { 170 | return (cycles / freq_ghz); 171 | } 172 | 173 | /// Return seconds elapsed since timestamp \p t0 174 | static double sec_since(const struct timespec &t0) { 175 | struct timespec t1; 176 | clock_gettime(CLOCK_REALTIME, &t1); 177 | 
return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; 178 | } 179 | 180 | /// Return nanoseconds elapsed since timestamp \p t0 181 | static double ns_since(const struct timespec &t0) { 182 | struct timespec t1; 183 | clock_gettime(CLOCK_REALTIME, &t1); 184 | return (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec); 185 | } 186 | 187 | /// Return the number of logical cores per NUMA node 188 | static size_t num_lcores_per_numa_node() { 189 | return static_cast(numa_num_configured_cpus() / 190 | numa_num_configured_nodes()); 191 | } 192 | 193 | /// Return a list of logical cores in \p numa_node 194 | static std::vector get_lcores_for_numa_node(size_t numa_node) { 195 | rt_assert(numa_node <= static_cast(numa_max_node())); 196 | 197 | std::vector ret; 198 | size_t num_lcores = static_cast(numa_num_configured_cpus()); 199 | 200 | for (size_t i = 0; i < num_lcores; i++) { 201 | if (numa_node == static_cast(numa_node_of_cpu(i))) { 202 | ret.push_back(i); 203 | } 204 | } 205 | 206 | return ret; 207 | } 208 | 209 | /// Bind \p thread to core with index \p numa_local_index on \p numa_node 210 | static void bind_to_core(std::thread &thread, size_t numa_node, 211 | size_t numa_local_index) { 212 | cpu_set_t cpuset; 213 | CPU_ZERO(&cpuset); 214 | 215 | auto lcore_vec = get_lcores_for_numa_node(numa_node); 216 | size_t global_index = lcore_vec.at(numa_local_index); 217 | 218 | CPU_SET(global_index, &cpuset); 219 | int rc = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), 220 | &cpuset); 221 | rt_assert(rc == 0, "Error setting thread affinity"); 222 | } 223 | 224 | /// Compute the standard deviation of a vector 225 | static double stddev(std::vector v) { 226 | if (unlikely(v.empty())) return 0; 227 | double sum = std::accumulate(v.begin(), v.end(), 0.0); 228 | double mean = sum / v.size(); 229 | double sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), 0.0); 230 | double var = sq_sum / v.size() - (mean * mean); 231 | 
if (unlikely(var < 0)) return 0.0; // This can happen when var ~ 0 232 | 233 | return std::sqrt(var); 234 | } 235 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "../libhrd_cpp/hrd.h" 13 | #include "latency.h" 14 | 15 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 16 | 17 | static constexpr size_t kBufSize = KB(128); // Registered buffer size 18 | static constexpr size_t kMinWriteSize = 64; 19 | static constexpr size_t kMaxWriteSize = 1024; 20 | 21 | // If true, we use a devdax-mapped buffer. If false, we use DRAM hugepages. 22 | static constexpr bool kUsePmem = true; 23 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 24 | 25 | // Number of writes to flush. The (WRITE+READ) combos for all writes are 26 | // issued in one postlist. Only the last READ in the postlist is signaled, so 27 | // kNumWrites cannot be too large. Else we'll run into signaling issues. 
28 | static constexpr size_t kNumWritesToFlush = 1; 29 | 30 | // If true, we issue only one signaled write and no reads 31 | static constexpr bool kJustAWrite = true; 32 | 33 | uint8_t* get_pmem_buf() { 34 | int fd = open(kPmemFile, O_RDWR); 35 | rt_assert(fd >= 0, "devdax open failed"); 36 | 37 | size_t pmem_size = round_up(kBufSize); // Smaller sizes may fail 38 | void* buf = 39 | mmap(nullptr, pmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 40 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 41 | rt_assert(reinterpret_cast(buf) % 256 == 0); 42 | memset(buf, 0, pmem_size); 43 | 44 | return reinterpret_cast(buf); 45 | } 46 | 47 | void run_server() { 48 | uint8_t* pmem_buf = nullptr; 49 | if (kUsePmem) pmem_buf = get_pmem_buf(); 50 | 51 | struct hrd_conn_config_t conn_config; 52 | conn_config.num_qps = 1; 53 | conn_config.use_uc = false; 54 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 55 | conn_config.buf_size = kBufSize; 56 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 57 | 58 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 59 | &conn_config, nullptr /* dgram config */); 60 | memset(const_cast(cb->conn_buf), 0, kBufSize); 61 | 62 | hrd_publish_conn_qp(cb, 0, "server"); 63 | printf("main: Server published. Waiting for client.\n"); 64 | 65 | hrd_qp_attr_t* clt_qp = nullptr; 66 | while (clt_qp == nullptr) { 67 | clt_qp = hrd_get_published_qp("client"); 68 | if (clt_qp == nullptr) usleep(200000); 69 | } 70 | 71 | printf("main: Server %s found client! Connecting..\n", "server"); 72 | hrd_connect_qp(cb, 0, clt_qp); 73 | hrd_publish_ready("server"); 74 | printf("main: Server ready. 
Going to sleep.\n"); 75 | 76 | while (true) sleep(1); 77 | } 78 | 79 | /// Get a random offset in the registered buffer with at least \p msg_size room 80 | size_t get_256_aligned_random_offset(pcg64_fast& pcg, size_t msg_size) { 81 | size_t iters = 0; 82 | while (true) { 83 | size_t rand_offset = (pcg() % kBufSize); 84 | if (likely(kBufSize - rand_offset > msg_size)) return rand_offset; 85 | iters++; 86 | if (unlikely(iters > 10)) printf("Random offset took over 10 iters\n"); 87 | } 88 | } 89 | 90 | void run_client() { 91 | Latency latency; 92 | hrd_conn_config_t conn_config; 93 | conn_config.num_qps = 1; 94 | conn_config.use_uc = false; 95 | conn_config.prealloc_buf = nullptr; 96 | conn_config.buf_size = kBufSize; 97 | conn_config.buf_shm_key = 3185; 98 | 99 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 100 | &conn_config, nullptr /* dgram config */); 101 | memset(const_cast(cb->conn_buf), 31, kBufSize); 102 | 103 | hrd_publish_conn_qp(cb, 0, "client"); 104 | printf("main: Client published. Waiting for server.\n"); 105 | 106 | hrd_qp_attr_t* srv_qp = nullptr; 107 | while (srv_qp == nullptr) { 108 | srv_qp = hrd_get_published_qp("server"); 109 | if (srv_qp == nullptr) usleep(2000); 110 | } 111 | 112 | printf("main: Client found server. Connecting..\n"); 113 | hrd_connect_qp(cb, 0, srv_qp); 114 | printf("main: Client connected!\n"); 115 | 116 | hrd_wait_till_ready("server"); 117 | 118 | // The +1s are for simpler postlist chain pointer math 119 | static constexpr size_t kArrSz = kNumWritesToFlush + 1; 120 | struct ibv_send_wr write_wr[kArrSz], read_wr[kArrSz]; 121 | struct ibv_send_wr* bad_send_wr; 122 | struct ibv_sge write_sge[kArrSz], read_sge[kArrSz]; 123 | struct ibv_wc wc; 124 | 125 | size_t write_size = kMinWriteSize; // Increases by powers of two 126 | size_t num_iters = 0; 127 | 128 | // Remote memory is divided into write_size chunks. The RDMA writes use these 129 | // chunks in order. 
130 | size_t write_chunk_idx = 0; 131 | 132 | // pcg64_fast pcg(pcg_extras::seed_seq_from{}); 133 | 134 | printf("#write_size median_us 5th_us 99th_us 999th_us\n"); // Stats header 135 | while (true) { 136 | if (num_iters == KB(256)) { 137 | printf("%zu %.1f %.1f %.1f %.1f\n", write_size, latency.perc(.50) / 10.0, 138 | latency.perc(.05) / 10.0, latency.perc(.99) / 10.0, 139 | latency.perc(.999) / 10.0); 140 | latency.reset(); 141 | 142 | write_size *= 2; 143 | if (write_size > kMaxWriteSize) write_size = kMinWriteSize; 144 | 145 | num_iters = 0; 146 | write_chunk_idx = 0; 147 | } 148 | 149 | struct timespec start; 150 | clock_gettime(CLOCK_REALTIME, &start); 151 | 152 | // Enter the loop below with room for at least (kNumWritesToFlush + 1) 153 | // chunks. We don't use the last chunk because we read from there. 154 | if (write_chunk_idx + 1 >= 155 | (kBufSize / write_size) - kNumWritesToFlush - 1) { 156 | write_chunk_idx = 0; 157 | } 158 | 159 | // WRITE 160 | for (size_t i = 0; i < kNumWritesToFlush; i++) { 161 | const size_t remote_offset = write_chunk_idx * write_size; 162 | write_chunk_idx++; 163 | 164 | write_sge[i].addr = 165 | reinterpret_cast(&cb->conn_buf[i * write_size]); 166 | write_sge[i].length = write_size; 167 | write_sge[i].lkey = cb->conn_buf_mr->lkey; 168 | 169 | write_wr[i].opcode = IBV_WR_RDMA_WRITE; 170 | write_wr[i].num_sge = 1; 171 | write_wr[i].sg_list = &write_sge[i]; 172 | write_wr[i].send_flags = 0 /* unsignaled */; 173 | if (write_size <= kHrdMaxInline) { 174 | write_wr[i].send_flags |= IBV_SEND_INLINE; 175 | } 176 | 177 | write_wr[i].wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 178 | write_wr[i].wr.rdma.rkey = srv_qp->rkey; 179 | 180 | // READ. We can read from any address. 
181 | read_sge[i].addr = 182 | reinterpret_cast(&cb->conn_buf[kBufSize - sizeof(size_t)]); 183 | read_sge[i].length = sizeof(size_t); // Indepenent of write size 184 | read_sge[i].lkey = cb->conn_buf_mr->lkey; 185 | 186 | read_wr[i].opcode = IBV_WR_RDMA_READ; 187 | read_wr[i].num_sge = 1; 188 | read_wr[i].sg_list = &read_sge[i]; 189 | read_wr[i].send_flags = 0; // Unsignaled. The last read is signaled. 190 | read_wr[i].wr.rdma.remote_addr = 191 | srv_qp->buf_addr + kBufSize - sizeof(size_t); 192 | read_wr[i].wr.rdma.rkey = srv_qp->rkey; 193 | 194 | // Make a chain 195 | write_wr[i].next = &read_wr[i]; 196 | read_wr[i].next = &write_wr[i + 1]; 197 | } 198 | 199 | if (!kJustAWrite) { 200 | read_wr[kNumWritesToFlush - 1].send_flags = IBV_SEND_SIGNALED; 201 | read_wr[kNumWritesToFlush - 1].next = nullptr; 202 | } else { 203 | write_wr[0].send_flags |= IBV_SEND_SIGNALED; 204 | write_wr[0].next = nullptr; 205 | } 206 | 207 | int ret = ibv_post_send(cb->conn_qp[0], &write_wr[0], &bad_send_wr); 208 | rt_assert(ret == 0); 209 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); // Block till the RDMA read completes 210 | num_iters++; 211 | 212 | double us = ns_since(start) / 1000.0; 213 | latency.update(us * 10); 214 | } 215 | } 216 | 217 | int main(int argc, char* argv[]) { 218 | gflags::ParseCommandLineFlags(&argc, &argv, true); 219 | FLAGS_is_client == 1 ? 
run_client() : run_server(); 220 | return 0; 221 | } 222 | -------------------------------------------------------------------------------- /microbench/bench.cc: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | #include "rand_read_latency.h" 4 | #include "rand_read_tput.h" 5 | #include "rand_write_latency.h" 6 | #include "rand_write_tput.h" 7 | #include "seq_read_tput.h" 8 | #include "seq_write_latency.h" 9 | #include "seq_write_tput.h" 10 | 11 | // Return true if kPmemFile is in devdax mode 12 | static bool is_pmem_file_devdax() { 13 | if (std::string(kPmemFile).find("dax") != std::string::npos) return true; 14 | return false; 15 | } 16 | 17 | // Write to the whole buffer to "map it in", whatever that means 18 | void map_in_buffer_whole(uint8_t *pbuf) { 19 | printf("Writing to the whole file for map-in...\n"); 20 | const size_t chunk_sz = GB(16); 21 | rt_assert(kPmemFileSize % chunk_sz == 0, "Invalid chunk size for map-in"); 22 | 23 | for (size_t i = 0; i < kPmemFileSize; i += chunk_sz) { 24 | struct timespec start; 25 | clock_gettime(CLOCK_REALTIME, &start); 26 | pmem_memset_persist(&pbuf[i], 3185, chunk_sz); // nodrain performs similar 27 | printf("Fraction complete = %.2f. Took %.3f sec for %zu GB.\n", 28 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), chunk_sz / GB(1)); 29 | } 30 | 31 | printf("Done writing.\n"); 32 | } 33 | 34 | // Write to a byte in each page of the buffer, to map the pages in 35 | void map_in_buffer_by_page(uint8_t *pbuf) { 36 | printf("Mapping-in file pages.\n"); 37 | struct timespec start; 38 | clock_gettime(CLOCK_REALTIME, &start); 39 | 40 | for (size_t i = 0; i < kPmemFileSize; i += KB(4)) { 41 | pmem_memset_nodrain(&pbuf[i], 3185, 1); 42 | if (i % GB(32) == 0 && i > 0) { 43 | printf("Fraction complete = %.2f. 
Took %.3f sec for %u GB.\n", 44 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), 32); 45 | clock_gettime(CLOCK_REALTIME, &start); 46 | } 47 | } 48 | 49 | printf("Done mapping-in.\n"); 50 | } 51 | 52 | // Map pmem file in devdax mode 53 | uint8_t *map_pmem_file_devdax() { 54 | int fd = open(kPmemFile, O_RDWR); 55 | rt_assert(fd >= 0, "devdax open failed"); 56 | rt_assert(kPmemFileSize % MB(2) == 0, "File size must be multiple of 2 MB"); 57 | 58 | void *buf = 59 | mmap(nullptr, kPmemFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 60 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 61 | rt_assert(reinterpret_cast(buf) % 256 == 0); 62 | 63 | return reinterpret_cast(buf); 64 | } 65 | 66 | // Map pmem file in fsdax mode 67 | uint8_t *map_pmem_file_fsdax() { 68 | uint8_t *pbuf; 69 | size_t mapped_len; 70 | int is_pmem; 71 | 72 | pbuf = reinterpret_cast(pmem_map_file( 73 | kPmemFile, 0 /* length */, 0 /* flags */, 0666, &mapped_len, &is_pmem)); 74 | 75 | rt_assert(pbuf != nullptr, 76 | "pmem_map_file() failed. " + std::string(strerror(errno))); 77 | rt_assert(mapped_len >= kPmemFileSize, 78 | "pmem file too small " + std::to_string(mapped_len)); 79 | rt_assert(reinterpret_cast(pbuf) % 4096 == 0, 80 | "Mapped buffer isn't page-aligned"); 81 | rt_assert(is_pmem == 1, "File is not pmem"); 82 | printf("Mapped file of length %.2f GB\n", mapped_len * 1.0 / GB(1)); 83 | 84 | return pbuf; 85 | } 86 | 87 | int main(int argc, char **argv) { 88 | gflags::ParseCommandLineFlags(&argc, &argv, true); 89 | uint8_t *pbuf; 90 | 91 | freq_ghz = measure_rdtsc_freq(); 92 | printf("RDTSC frequency = %.2f GHz\n", freq_ghz); 93 | 94 | pbuf = is_pmem_file_devdax() ? 
map_pmem_file_devdax() : map_pmem_file_fsdax(); 95 | 96 | // Print some random file samples to check it's full of random contents 97 | printf("File contents sample: "); 98 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 99 | for (size_t i = 0; i < 10; i++) { 100 | printf("%zu ", *reinterpret_cast(&pbuf[pcg() % kPmemFileSize])); 101 | } 102 | printf("\n"); 103 | 104 | // map_in_buffer_by_page(pbuf); 105 | // map_in_buffer_whole(pbuf); 106 | 107 | std::string bench_func; // Last one wins 108 | bench_func = "bench_seq_read_latency"; 109 | bench_func = "bench_rand_write_latency"; 110 | bench_func = "bench_rand_read_tput"; 111 | bench_func = "bench_seq_write_tput"; 112 | bench_func = "bench_seq_write_latency"; 113 | bench_func = "bench_rand_read_latency"; 114 | bench_func = "bench_seq_read_tput"; 115 | bench_func = "bench_rand_write_tput"; 116 | bench_func = "bench_seq_write_tput"; 117 | 118 | // Sequential write throughput 119 | if (bench_func == "bench_seq_write_tput") { 120 | printf("Sequential write throughput. 
%zu threads\n", FLAGS_num_threads); 121 | std::ostringstream dat_header; 122 | std::ostringstream dat_data; 123 | dat_header << "Threads "; 124 | dat_data << std::to_string(FLAGS_num_threads) << " "; 125 | 126 | for (size_t copy_sz = MB(2); copy_sz <= GB(1); copy_sz *= 2) { 127 | dat_header << std::to_string(copy_sz) << " "; 128 | std::vector avg_tput_GBps(FLAGS_num_threads); 129 | 130 | std::vector threads(FLAGS_num_threads); 131 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 132 | threads[i] = std::thread(bench_seq_write_tput, pbuf, i, copy_sz, 133 | &avg_tput_GBps[i]); 134 | bind_to_core(threads[i], kNumaNode, i); 135 | } 136 | for (auto &t : threads) t.join(); 137 | 138 | double total_tput = 0.0; 139 | for (size_t i = 0; i < FLAGS_num_threads; i++) 140 | total_tput += avg_tput_GBps[i]; 141 | dat_data << std::setprecision(2) << total_tput << " "; 142 | } 143 | 144 | printf("%s\n", dat_header.str().c_str()); 145 | printf("%s\n", dat_data.str().c_str()); 146 | } 147 | 148 | // Sequential write latency 149 | if (bench_func == "bench_seq_write_latency") { 150 | printf("Sequential write latency. One thread only!\n"); 151 | bench_seq_write_latency(pbuf); 152 | } 153 | 154 | // Random write latency 155 | if (bench_func == "bench_rand_write_latency") { 156 | printf("Random write latency. One thread only!\n"); 157 | bench_rand_write_latency(pbuf); 158 | } 159 | 160 | // Random read latency 161 | if (bench_func == "bench_rand_read_latency") { 162 | printf("Random read latency. 
One thread only!\n"); 163 | bench_rand_read_latency(pbuf); 164 | } 165 | 166 | // Random write tput 167 | if (bench_func == "bench_rand_write_tput") { 168 | std::vector thread_count = {1}; 169 | std::vector copy_sz_vec = {256}; 170 | 171 | for (size_t copy_sz : copy_sz_vec) { 172 | for (size_t num_threads : thread_count) { 173 | printf("Rand write tput with %zu threads, copy_sz %zu\n", num_threads, 174 | copy_sz); 175 | std::vector threads(num_threads); 176 | 177 | for (size_t i = 0; i < num_threads; i++) { 178 | threads[i] = 179 | std::thread(bench_rand_write_tput, pbuf, i, copy_sz, num_threads); 180 | } 181 | 182 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 183 | } 184 | } 185 | } 186 | 187 | // Random read throughput 188 | if (bench_func == "bench_rand_read_tput") { 189 | std::vector thread_count = {1, 2, 4, 8, 16, 24, 48}; 190 | std::vector copy_sz_vec = {64, 256, 512, 1024}; 191 | 192 | for (size_t copy_sz : copy_sz_vec) { 193 | for (size_t num_threads : thread_count) { 194 | printf("Rand read tput with %zu threads, copy_sz %zu\n", num_threads, 195 | copy_sz); 196 | std::vector threads(num_threads); 197 | 198 | for (size_t i = 0; i < num_threads; i++) { 199 | threads[i] = 200 | std::thread(bench_rand_read_tput, pbuf, i, copy_sz, num_threads); 201 | } 202 | 203 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 204 | } 205 | } 206 | } 207 | 208 | // Sequential read throughput 209 | if (bench_func == "bench_seq_read_tput") { 210 | std::vector thread_count = {1, 2, 4, 8, 16, 24, 48}; 211 | 212 | for (size_t num_threads : thread_count) { 213 | printf("Seq read tput with %zu threads\n", num_threads); 214 | std::vector threads(num_threads); 215 | 216 | for (size_t i = 0; i < num_threads; i++) { 217 | threads[i] = std::thread(bench_seq_read_tput, pbuf, i, num_threads); 218 | bind_to_core(threads[i], kNumaNode, i); 219 | } 220 | 221 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 222 | } 223 | } 224 | 225 | 
is_pmem_file_devdax() ? munmap(pbuf, kPmemFileSize) 226 | : pmem_unmap(pbuf, kPmemFileSize); 227 | exit(0); 228 | } 229 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "../libhrd_cpp/hrd.h" 11 | 12 | static constexpr size_t kServerBufSize = GB(8); 13 | static constexpr size_t kAppMaxPostlist = 64; 14 | static constexpr size_t kAppUnsigBatch = 64; 15 | static constexpr size_t kBaseSHMKey = 3185; 16 | 17 | // If true, we use a devdax-mapped buffer. If false, we use DRAM hugepages. 18 | static constexpr bool kUsePmem = true; 19 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 20 | 21 | // If true, server zeroes out its buffer and reports write throughput 22 | static constexpr bool kZeroServerBuf = false; 23 | 24 | DEFINE_uint64(num_client_processes, 1, "Number of client processes"); 25 | DEFINE_uint64(num_threads_per_client, 1, "Threads per client process"); 26 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 27 | DEFINE_uint64(use_uc, 0, "Use unreliable connected transport?"); 28 | DEFINE_uint64(do_read, 0, "Do RDMA reads?"); 29 | DEFINE_uint64(machine_id, 0, "Zero-based ID of this client machine"); 30 | DEFINE_uint64(size, 0, "RDMA size"); 31 | DEFINE_uint64(postlist, 0, "Postlist size"); 32 | 33 | // Parameters for a client thread 34 | struct clt_thread_params_t { 35 | size_t global_thread_id; 36 | double* tput; 37 | }; 38 | 39 | // Map the devdax buffer at the server 40 | uint8_t* get_pmem_buf_server() { 41 | int fd = open(kPmemFile, O_RDWR); 42 | rt_assert(fd >= 0, "devdax open failed"); 43 | 44 | size_t pmem_size = roundup(kServerBufSize); // Smaller sizes may fail 45 | void* buf = 46 | mmap(nullptr, pmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 47 | 
rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 48 | rt_assert(reinterpret_cast(buf) % 256 == 0); 49 | 50 | return reinterpret_cast(buf); 51 | } 52 | 53 | void run_server() { 54 | size_t num_client_connections = 55 | FLAGS_num_client_processes * FLAGS_num_threads_per_client; 56 | 57 | uint8_t* pmem_buf = nullptr; 58 | if (kUsePmem) { 59 | pmem_buf = get_pmem_buf_server(); 60 | 61 | // Fill in the persistent buffer, also sanity-check local write throughput 62 | if (kZeroServerBuf) { 63 | printf("main: Zero-ing pmem buffer\n"); 64 | struct timespec start; 65 | clock_gettime(CLOCK_REALTIME, &start); 66 | pmem_memset_persist(pmem_buf, 0, kServerBufSize); 67 | printf("main: Zero-ed %f MB of pmem at %.1f GB/s\n", 68 | kServerBufSize * 1.0 / MB(1), 69 | kServerBufSize / (1000000000.0 * sec_since(start))); 70 | } 71 | } 72 | 73 | struct hrd_conn_config_t conn_config; 74 | conn_config.num_qps = num_client_connections; 75 | conn_config.use_uc = (FLAGS_use_uc == 1); 76 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 77 | conn_config.buf_size = kServerBufSize; 78 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 79 | 80 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 81 | &conn_config, nullptr /* dgram config */); 82 | 83 | // Publish server QPs. Server i is for global connection ID i 84 | for (size_t i = 0; i < num_client_connections; i++) { 85 | auto srv_qp_name = std::string("server-") + std::to_string(i); 86 | hrd_publish_conn_qp(cb, i, srv_qp_name.c_str()); 87 | } 88 | 89 | for (size_t i = 0; i < num_client_connections; i++) { 90 | auto conn_name = std::string("conn-") + std::to_string(i); 91 | hrd_qp_attr_t* conn_qp = nullptr; 92 | while (conn_qp == nullptr) { 93 | conn_qp = hrd_get_published_qp(conn_name.c_str()); 94 | if (conn_qp == nullptr) { 95 | usleep(200000); 96 | continue; 97 | } 98 | 99 | printf("main: Server found client connection %zu! 
Connecting..\n", i); 100 | hrd_connect_qp(cb, i, conn_qp); 101 | } 102 | } 103 | 104 | hrd_publish_ready("server"); 105 | printf("main: Server ready. Going to sleep.\n"); 106 | 107 | while (true) sleep(1); 108 | } 109 | 110 | void run_client(clt_thread_params_t* params) { 111 | FastRand fast_rand; 112 | size_t clt_lid = params->global_thread_id % FLAGS_num_threads_per_client; 113 | 114 | hrd_conn_config_t conn_config; 115 | conn_config.num_qps = 1; 116 | conn_config.use_uc = (FLAGS_use_uc == 1); 117 | conn_config.prealloc_buf = nullptr; 118 | conn_config.buf_size = FLAGS_size; 119 | conn_config.buf_shm_key = kBaseSHMKey + clt_lid; 120 | 121 | auto* cb = hrd_ctrl_blk_init(params->global_thread_id, 0 /* port */, 122 | 0 /* numa */, &conn_config, nullptr); 123 | 124 | memset(const_cast(cb->conn_buf), 125 | static_cast(params->global_thread_id) + 1, 126 | conn_config.buf_size); 127 | 128 | size_t global_conn_id = params->global_thread_id; 129 | auto conn_name = std::string("conn-") + std::to_string(global_conn_id); 130 | hrd_publish_conn_qp(cb, 0, conn_name.c_str()); 131 | printf("main: Connection %s published. Waiting for server.\n", 132 | conn_name.c_str()); 133 | 134 | auto srv_qp_name = std::string("server-") + std::to_string(global_conn_id); 135 | hrd_qp_attr_t* srv_qp = nullptr; 136 | while (srv_qp == nullptr) { 137 | srv_qp = hrd_get_published_qp(srv_qp_name.c_str()); 138 | if (srv_qp == nullptr) usleep(2000); 139 | } 140 | 141 | rt_assert(srv_qp->buf_addr % FLAGS_size == 0, 142 | "Server buffer address not aligned to RDMA size"); 143 | 144 | printf("main: Found server for connection %s. 
Connecting..\n", 145 | conn_name.c_str()); 146 | hrd_connect_qp(cb, 0, srv_qp); 147 | printf("main: Client connected!\n"); 148 | 149 | hrd_wait_till_ready("server"); 150 | 151 | struct ibv_send_wr wr[kAppMaxPostlist], *bad_send_wr; 152 | struct ibv_sge sgl[kAppMaxPostlist]; 153 | struct ibv_wc wc; 154 | size_t rolling_iter = 0; // For performance measurement 155 | size_t nb_tx = 0; // For selective signaling 156 | int ret; 157 | 158 | struct timespec start, end; 159 | clock_gettime(CLOCK_REALTIME, &start); 160 | 161 | while (true) { 162 | if (rolling_iter >= KB(512)) { 163 | clock_gettime(CLOCK_REALTIME, &end); 164 | double seconds = (end.tv_sec - start.tv_sec) + 165 | (end.tv_nsec - start.tv_nsec) / 1000000000.0; 166 | double tput_mrps = rolling_iter / (seconds * 1000000); 167 | printf("main: Client %zu: %.2f M/s\n", params->global_thread_id, 168 | tput_mrps); 169 | rolling_iter = 0; 170 | 171 | // Per-machine stats 172 | params->tput[clt_lid] = tput_mrps; 173 | if (clt_lid == 0) { 174 | double tot = 0; 175 | for (size_t i = 0; i < FLAGS_num_threads_per_client; i++) 176 | tot += params->tput[i]; 177 | hrd_red_printf("main: Machine: %.2f M/s\n", tot); 178 | } 179 | 180 | clock_gettime(CLOCK_REALTIME, &start); 181 | } 182 | 183 | // Post a batch 184 | for (size_t w_i = 0; w_i < FLAGS_postlist; w_i++) { 185 | wr[w_i].opcode = 186 | FLAGS_do_read == 0 ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ; 187 | wr[w_i].num_sge = 1; 188 | wr[w_i].next = (w_i == FLAGS_postlist - 1) ? nullptr : &wr[w_i + 1]; 189 | wr[w_i].sg_list = &sgl[w_i]; 190 | 191 | wr[w_i].send_flags = nb_tx % kAppUnsigBatch == 0 ? IBV_SEND_SIGNALED : 0; 192 | if (nb_tx % kAppUnsigBatch == 0 && nb_tx > 0) { 193 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); 194 | } 195 | 196 | wr[w_i].send_flags |= FLAGS_do_read == 0 ? 
IBV_SEND_INLINE : 0; 197 | 198 | sgl[w_i].addr = reinterpret_cast(&cb->conn_buf); 199 | sgl[w_i].length = FLAGS_size; 200 | sgl[w_i].lkey = cb->conn_buf_mr->lkey; 201 | 202 | size_t remote_offset = 203 | (fast_rand.next_u32() % (kServerBufSize / FLAGS_size)) * FLAGS_size; 204 | 205 | wr[w_i].wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 206 | wr[w_i].wr.rdma.rkey = srv_qp->rkey; 207 | 208 | nb_tx++; 209 | } 210 | 211 | ret = ibv_post_send(cb->conn_qp[0], &wr[0], &bad_send_wr); 212 | rt_assert(ret == 0); 213 | 214 | rolling_iter += FLAGS_postlist; 215 | } 216 | } 217 | 218 | int main(int argc, char* argv[]) { 219 | gflags::ParseCommandLineFlags(&argc, &argv, true); 220 | 221 | if (FLAGS_is_client == 1) { 222 | if (FLAGS_do_read == 0) { 223 | rt_assert(FLAGS_size <= kHrdMaxInline, "Inline size too small"); 224 | } 225 | rt_assert(FLAGS_postlist <= kAppMaxPostlist, "Postlist too large"); 226 | rt_assert(kAppUnsigBatch >= FLAGS_postlist, "Postlist check failed"); 227 | rt_assert(kHrdSQDepth >= 2 * kAppUnsigBatch, "Queue capacity check failed"); 228 | } 229 | 230 | // Launch a single server thread or multiple client threads 231 | 232 | if (FLAGS_is_client == 1) { 233 | std::vector thread_arr(FLAGS_num_threads_per_client); 234 | auto* tput = new double[FLAGS_num_threads_per_client]; 235 | printf("main: Using %zu threads\n", FLAGS_num_threads_per_client); 236 | auto* param_arr = new clt_thread_params_t[FLAGS_num_threads_per_client]; 237 | for (size_t i = 0; i < FLAGS_num_threads_per_client; i++) { 238 | param_arr[i].global_thread_id = 239 | (FLAGS_machine_id * FLAGS_num_threads_per_client) + i; 240 | param_arr[i].tput = tput; 241 | 242 | thread_arr[i] = std::thread(run_client, ¶m_arr[i]); 243 | } 244 | 245 | for (auto& thread : thread_arr) thread.join(); 246 | } else { 247 | auto server_thread = std::thread(run_server); 248 | server_thread.join(); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- 
/ioat/bench.cc: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file bench.cc 4 | * 5 | * @brief Benchmark for IOAT DMA based on DPDK instead of SPDK. The benchmark 6 | * task is to paste small, cached source buffers sequentially into the large 7 | * destination buffer. 8 | * 9 | * Flexibility: use IOAT or memcpy for the copy. Use volatile or persistent 10 | * buffer for the large destination buffer. 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "../common.h" 31 | #include "huge_alloc.h" 32 | #include "virt2phy.h" 33 | 34 | static constexpr size_t kIoatDevID = 0; 35 | static constexpr size_t kIoatDoFence = 0; 36 | static constexpr size_t kIoatRingSize = 512; 37 | 38 | static constexpr size_t kDstBufferSize = GB(32); 39 | static constexpr bool kCheckCopyResults = true; 40 | 41 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 42 | 43 | DEFINE_uint64(num_prints, 3, "Number of measurements printed before exit"); 44 | DEFINE_uint64(size, KB(128), "Size of each copy"); 45 | DEFINE_uint64(window_size, 8, "Number of outstanding transfers"); 46 | DEFINE_uint64(numa_node, 0, "NUMA node for experiment"); 47 | DEFINE_uint64(use_ioat, 1, "Use IOAT DMA engines, else memcpy"); 48 | DEFINE_uint64(use_pmem, 1, "Use persistent memory for destination buffer"); 49 | 50 | // Initialize and start device 0 51 | void setup_ioat_device() { 52 | struct rte_rawdev_info info; 53 | info.dev_private = NULL; 54 | 55 | rt_assert(rte_rawdev_info_get(kIoatDevID, &info) == 0); 56 | rt_assert(std::string(info.driver_name).find("ioat") != std::string::npos); 57 | 58 | struct rte_ioat_rawdev_config p; 59 | memset(&info, 0, sizeof(info)); 60 | info.dev_private = &p; 61 | 62 | rte_rawdev_info_get(kIoatDevID, &info); 63 | 
rt_assert(p.ring_size == 0, "Initial ring size is non-zero"); 64 | 65 | p.ring_size = kIoatRingSize; 66 | rt_assert(rte_rawdev_configure(kIoatDevID, &info) == 0, 67 | "rte_rawdev_configure failed"); 68 | 69 | rte_rawdev_info_get(kIoatDevID, &info); 70 | rt_assert(p.ring_size == kIoatRingSize, "Wrong ring size"); 71 | 72 | rt_assert(rte_rawdev_start(kIoatDevID) == 0, "Rawdev start failed"); 73 | 74 | printf("Started device %zu\n", kIoatDevID); 75 | } 76 | 77 | void poll_one() { 78 | while (true) { 79 | uintptr_t _src, _dst; 80 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 81 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 82 | 83 | if (ret > 0) break; 84 | } 85 | } 86 | 87 | int main(int argc, char **argv) { 88 | if (getuid() != 0) { 89 | // Mapping devdax files needs root perms for now 90 | printf("You need to be root to run this benchmark\n"); 91 | exit(-1); 92 | } 93 | 94 | gflags::ParseCommandLineFlags(&argc, &argv, true); 95 | 96 | auto hugepage_caching_v2p = new HugepageCachingVirt2Phy(); 97 | double freq_ghz = measure_rdtsc_freq(); 98 | 99 | rt_assert(FLAGS_size <= KB(128), 100 | "Copy size must be small to reduce the likelihood of " 101 | "straddling 2 hugepages"); 102 | 103 | rt_assert(kDstBufferSize / FLAGS_size > 2 * FLAGS_window_size, 104 | "Copy size too large, pipelined copies might overlap"); 105 | 106 | // Init DPDK 107 | const char *rte_argv[] = {"-c", "1", "-n", "4", "--log-level", 108 | "5", "-m", "128", NULL}; 109 | 110 | int rte_argc = sizeof(rte_argv) / sizeof(rte_argv[0]) - 1; 111 | int ret = rte_eal_init(rte_argc, const_cast(rte_argv)); 112 | rt_assert(ret >= 0, "rte_eal_init failed"); 113 | 114 | if (FLAGS_use_ioat == 1) { 115 | size_t count = rte_rawdev_count(); 116 | printf("Fount %zu rawdev devices\n", count); 117 | rt_assert(count >= 1, "No rawdev devices available"); 118 | 119 | setup_ioat_device(); 120 | } 121 | 122 | // Create source and destination buffers 123 | auto huge_alloc = new 
hugealloc::HugeAlloc(MB(512), FLAGS_numa_node); 124 | std::vector src_bufs(FLAGS_window_size); 125 | for (size_t i = 0; i < FLAGS_window_size; i++) { 126 | src_bufs[i] = huge_alloc->alloc(FLAGS_size); 127 | rt_assert(src_bufs[i].buf != nullptr); 128 | 129 | memset(src_bufs[i].buf, i + 1, FLAGS_size); // Page-in 130 | } 131 | 132 | printf("Allocating %zu GB destination buffer...", kDstBufferSize / GB(1)); 133 | uint8_t *dst_buf = nullptr; 134 | 135 | if (FLAGS_use_pmem == 1) { 136 | // Map pmem buffer 137 | size_t mapped_len; 138 | int is_pmem; 139 | 140 | dst_buf = reinterpret_cast( 141 | pmem_map_file(kPmemFile, 0, 0, 0666, &mapped_len, &is_pmem)); 142 | 143 | rt_assert(dst_buf != nullptr); 144 | rt_assert(mapped_len >= kDstBufferSize); 145 | rt_assert(is_pmem == 1); 146 | 147 | } else { 148 | hugealloc::Buffer _dst_buf = huge_alloc->alloc_raw(kDstBufferSize); 149 | rt_assert(_dst_buf.buf != nullptr); 150 | rt_assert(reinterpret_cast(_dst_buf.buf) % MB(2) == 0); 151 | dst_buf = _dst_buf.buf; 152 | } 153 | 154 | for (size_t i = 0; i < kDstBufferSize; i += MB(2)) dst_buf[i] = i; // Page-in 155 | printf("done!\n"); 156 | 157 | // Start test 158 | printf("Flags: size %zu, window size %zu, use_ioat %zu, use_pmem %zu\n", 159 | FLAGS_size, FLAGS_window_size, FLAGS_use_ioat, FLAGS_use_pmem); 160 | 161 | size_t num_printed = 0; // Number of times we printed stats 162 | size_t num_completed_copies = 0; 163 | 164 | size_t src_bufs_i = 0; // Index among the source buffers for the next job 165 | size_t dst_buf_offset = 0; // Offset in the destination buffer 166 | 167 | size_t ioat_outstanding_jobs = 0; 168 | size_t timer_start = rdtsc(); 169 | FastRand fast_rand; 170 | 171 | while (true) { 172 | if (FLAGS_use_ioat == 1) { 173 | if (dst_buf_offset / MB(2) != (dst_buf_offset + FLAGS_size) / MB(2)) { 174 | // The copy operating will straddle two hugepages 175 | dst_buf_offset += FLAGS_size; 176 | continue; // Go back 177 | } 178 | 179 | uint8_t *dst_buf_ptr = 
&dst_buf[dst_buf_offset]; 180 | uint64_t dst_phys_addr = hugepage_caching_v2p->translate(dst_buf_ptr); 181 | 182 | uint8_t *src_buf_ptr = src_bufs[src_bufs_i].buf; 183 | uint64_t src_phys_addr = hugepage_caching_v2p->translate(src_buf_ptr); 184 | 185 | // Pass zeroes as callback args, we don't need them for now 186 | int ret = rte_ioat_enqueue_copy( 187 | kIoatDevID, src_phys_addr, dst_phys_addr, FLAGS_size, 188 | reinterpret_cast(src_buf_ptr), 189 | reinterpret_cast(dst_buf_ptr), kIoatDoFence); 190 | 191 | rt_assert(ret == 1, "Error with rte_ioat_enqueue_copy"); 192 | rte_ioat_do_copies(kIoatDevID); 193 | 194 | ioat_outstanding_jobs++; 195 | rt_assert(ioat_outstanding_jobs <= kIoatRingSize); 196 | 197 | if (ioat_outstanding_jobs == FLAGS_window_size) { 198 | // Poll for a completed copy 199 | while (true) { 200 | uintptr_t _src = 0, _dst = 0; 201 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 202 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 203 | 204 | if (ret == 1 && kCheckCopyResults) { 205 | // Check at a random offset 206 | size_t offset = fast_rand.next_u32() % FLAGS_size; 207 | uint8_t src_val = reinterpret_cast(_src)[offset]; 208 | uint8_t dst_val = reinterpret_cast(_dst)[offset]; 209 | if (unlikely(src_val != dst_val)) { 210 | fprintf(stderr, "Mismatch\n"); 211 | } 212 | } 213 | 214 | num_completed_copies += static_cast(ret); 215 | ioat_outstanding_jobs -= static_cast(ret); 216 | if (ret > 0) break; 217 | } 218 | } 219 | } else { // Use memcpy 220 | if (FLAGS_use_pmem == 0) { 221 | rte_memcpy(&dst_buf[dst_buf_offset], src_bufs[src_bufs_i].buf, 222 | FLAGS_size); 223 | } else { 224 | pmem_memcpy_persist(&dst_buf[dst_buf_offset], src_bufs[src_bufs_i].buf, 225 | FLAGS_size); 226 | } 227 | num_completed_copies++; 228 | } 229 | 230 | // If we're here, we did/enqueued a copy. Bump src and dst buffers. 
231 | src_bufs_i++; 232 | if (src_bufs_i == FLAGS_window_size) src_bufs_i = 0; 233 | 234 | dst_buf_offset += FLAGS_size; 235 | if (dst_buf_offset + FLAGS_size >= kDstBufferSize) { 236 | dst_buf_offset = 0; 237 | 238 | double ns_total = to_nsec(rdtsc() - timer_start, freq_ghz); 239 | printf("%.2f GB/s\n", num_completed_copies * FLAGS_size / ns_total); 240 | 241 | num_completed_copies = 0; 242 | num_printed++; 243 | timer_start = rdtsc(); 244 | } 245 | 246 | if (num_printed == FLAGS_num_prints) break; 247 | } 248 | 249 | // With IOAT, wait for outstanding copies before deleting hugepages 250 | printf("Waiting for outstanding copies to finish\n"); 251 | while (FLAGS_use_ioat == 1 && ioat_outstanding_jobs > 0) { 252 | uintptr_t _src, _dst; 253 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 254 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 255 | ioat_outstanding_jobs -= static_cast(ret); 256 | } 257 | 258 | delete huge_alloc; 259 | } 260 | -------------------------------------------------------------------------------- /hopscotch_pmem/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../common.h" 7 | #include "phopscotch.h" 8 | 9 | #define table phopscotch 10 | 11 | DEFINE_string(pmem_file, "/dev/dax12.0", "Persistent memory file name"); 12 | DEFINE_uint64(table_key_capacity, MB(1), "Number of keys in table per thread"); 13 | DEFINE_uint64(batch_size, table::kMaxBatchSize, "Batch size"); 14 | DEFINE_string(benchmark, "get", "Benchmark to run"); 15 | DEFINE_uint64(num_threads, 1, "Number of threads"); 16 | DEFINE_uint64(sweep_optimizations, 0, "Sweep optimizations"); 17 | 18 | // 19 | // Overhead to occupancy map: 20 | // 0.05 -> 0.56 21 | static constexpr double kDefaultOverhead = 0.05; 22 | static constexpr double kNumaNode = 0; 23 | 24 | class Key { 25 | public: 26 | size_t key_frag[2]; 27 | bool operator==(const Key 
// One-shot rendezvous used so worker threads start the timed phase together.
// https://stackoverflow.com/questions/24465533/implementing-boostbarrier-in-c11
class Barrier {
 public:
  /// Construct a barrier that releases once \p count threads call wait()
  explicit Barrier(std::size_t count) : count{count} {}

  /// Block until all participating threads have arrived
  void wait() {
    std::unique_lock<std::mutex> lk{mutex};
    count--;
    if (count == 0) {
      cv.notify_all();  // Last arrival wakes everyone else
      return;
    }
    cv.wait(lk, [this] { return count == 0; });
  }

 private:
  std::mutex mutex;
  std::condition_variable cv;
  std::size_t count;  // Threads yet to arrive; barrier is not reusable
};
Barrier *barrier;
Each partition hosts a contiguous 70 | /// range of keys {1, ..., max_key} 71 | static inline size_t gen_key(size_t offset_in_partition, size_t thread_id) { 72 | assert(thread_id <= 31); 73 | return ((offset_in_partition << 5) | thread_id); 74 | } 75 | 76 | typedef table::HashMap HashMap; 77 | 78 | size_t populate(HashMap *hashmap, size_t thread_id) { 79 | bool is_set_arr[table::kMaxBatchSize]; 80 | Key key_arr[table::kMaxBatchSize]; 81 | Value val_arr[table::kMaxBatchSize]; 82 | Key *key_ptr_arr[table::kMaxBatchSize]; 83 | Value *val_ptr_arr[table::kMaxBatchSize]; 84 | bool success_arr[table::kMaxBatchSize]; 85 | 86 | size_t num_success = 0; 87 | 88 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 89 | key_ptr_arr[i] = &key_arr[i]; 90 | val_ptr_arr[i] = &val_arr[i]; 91 | } 92 | 93 | const size_t num_keys_to_insert = 94 | roundup(FLAGS_table_key_capacity); 95 | size_t progress_console_lim = num_keys_to_insert / 10; 96 | 97 | for (size_t i = 1; i <= num_keys_to_insert; i += table::kMaxBatchSize) { 98 | for (size_t j = 0; j < table::kMaxBatchSize; j++) { 99 | is_set_arr[j] = true; 100 | size_t offset_in_partition = (i + j); 101 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 102 | val_arr[j].val_frag[0] = key_arr[j].key_frag[0]; 103 | } 104 | 105 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 106 | val_ptr_arr, success_arr, table::kMaxBatchSize); 107 | 108 | if (i >= progress_console_lim) { 109 | printf("thread %zu: %.2f percent done\n", thread_id, 110 | i * 1.0 / num_keys_to_insert); 111 | progress_console_lim += num_keys_to_insert / 10; 112 | } 113 | 114 | for (size_t j = 0; j < table::kMaxBatchSize; j++) { 115 | num_success += success_arr[j]; 116 | if (!success_arr[j]) { 117 | printf("thread %zu: populate() failed at key %zu of %zu keys\n", 118 | thread_id, i + j, num_keys_to_insert); 119 | return num_success; 120 | } 121 | } 122 | } 123 | 124 | return FLAGS_table_key_capacity; // All keys were added 125 | } 126 | 127 | 
enum class Workload { kGets, kSets, k5050 }; 128 | double batch_exp(HashMap *hashmap, size_t max_key, size_t batch_size, 129 | Workload workload, size_t thread_id) { 130 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 131 | constexpr size_t kNumIters = MB(1); 132 | 133 | struct timespec start; 134 | bool is_set_arr[table::kMaxBatchSize]; 135 | Key key_arr[table::kMaxBatchSize]; 136 | Value val_arr[table::kMaxBatchSize]; 137 | Key *key_ptr_arr[table::kMaxBatchSize]; 138 | Value *val_ptr_arr[table::kMaxBatchSize]; 139 | bool success_arr[table::kMaxBatchSize]; 140 | clock_gettime(CLOCK_REALTIME, &start); 141 | 142 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 143 | key_ptr_arr[i] = &key_arr[i]; 144 | val_ptr_arr[i] = &val_arr[i]; 145 | } 146 | 147 | size_t num_success = 0; 148 | for (size_t i = 1; i <= kNumIters; i += batch_size) { 149 | for (size_t j = 0; j < batch_size; j++) { 150 | switch (workload) { 151 | case Workload::kGets: is_set_arr[j] = false; break; 152 | case Workload::kSets: is_set_arr[j] = true; break; 153 | case Workload::k5050: is_set_arr[j] = pcg() % 2 == 0; break; 154 | } 155 | 156 | size_t offset_in_partition = 1 + fastrange64(pcg(), max_key - 1); 157 | 158 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 159 | val_arr[j].val_frag[0] = is_set_arr[j] ? 
key_arr[j].key_frag[0] : 0; 160 | } 161 | 162 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 163 | val_ptr_arr, success_arr, batch_size); 164 | 165 | for (size_t j = 0; j < batch_size; j++) { 166 | num_success += success_arr[j]; 167 | if (!is_set_arr[j] && val_arr[j].val_frag[0] != key_arr[j].key_frag[0]) { 168 | printf("invalid value %zu for key %zu\n", val_arr[j].val_frag[0], 169 | key_arr[j].key_frag[0]); 170 | } 171 | } 172 | } 173 | 174 | double seconds = sec_since(start); 175 | double tput = kNumIters / (seconds * 1000000); 176 | return tput; 177 | } 178 | 179 | void thread_func(size_t thread_id) { 180 | size_t bytes_per_map = HashMap::get_required_bytes(FLAGS_table_key_capacity); 181 | bytes_per_map = roundup<256>(bytes_per_map); 182 | 183 | auto *hashmap = new HashMap(FLAGS_pmem_file, thread_id * bytes_per_map, 184 | FLAGS_table_key_capacity); 185 | 186 | printf("thread %zu: Populating hashmap. Expected time = %.1f seconds\n", 187 | thread_id, FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 188 | 189 | size_t max_key = populate(hashmap, thread_id); 190 | printf("thread %zu: final occupancy = %.2f\n", thread_id, 191 | max_key * 1.0 / FLAGS_table_key_capacity); 192 | 193 | std::vector tput_vec; 194 | Workload workload; 195 | if (FLAGS_benchmark == "set") workload = Workload::kSets; 196 | if (FLAGS_benchmark == "get") workload = Workload::kGets; 197 | if (FLAGS_benchmark == "5050") workload = Workload::k5050; 198 | 199 | printf("thread %zu, done populating. 
waiting for others.\n", thread_id); 200 | barrier->wait(); 201 | printf("thread %zu, starting work.\n", thread_id); 202 | 203 | for (size_t i = 0; i < 10; i++) { 204 | double tput = 205 | batch_exp(hashmap, max_key, FLAGS_batch_size, workload, thread_id); 206 | printf("thread %zu, iter %zu: tput = %.2f\n", thread_id, i, tput); 207 | tput_vec.push_back(tput); 208 | } 209 | 210 | double avg_tput = 211 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 212 | double _stddev = stddev(tput_vec); 213 | 214 | printf("thread %zu of %zu final M/s: %.2f avg, %.2f stddev\n", thread_id, 215 | FLAGS_num_threads, avg_tput, _stddev); 216 | 217 | delete hashmap; 218 | } 219 | 220 | void sweep_do_one(HashMap *hashmap, size_t max_key, size_t batch_size, 221 | Workload workload) { 222 | std::vector tput_vec; 223 | 224 | for (size_t i = 0; i < 10; i++) { 225 | double tput; 226 | tput = batch_exp(hashmap, max_key, batch_size, workload, 0 /* thread_id */); 227 | tput_vec.push_back(tput); 228 | } 229 | 230 | double avg_tput = 231 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 232 | double _stddev = stddev(tput_vec); 233 | 234 | printf(" Tput (M/s) = %.2f avg, %.2f stddev\n", avg_tput, _stddev); 235 | } 236 | 237 | // Measure the effectiveness of optimizations with one thread 238 | void sweep_optimizations() { 239 | auto *hashmap = new HashMap(FLAGS_pmem_file, 0, FLAGS_table_key_capacity); 240 | 241 | printf("Populating hashmap. Expected time = %.1f seconds\n", 242 | FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 243 | 244 | size_t max_key = populate(hashmap, 0 /* thread_id */); 245 | printf("Final occupancy = %.2f\n", max_key * 1.0 / FLAGS_table_key_capacity); 246 | 247 | std::vector batch_size_vec = {1, 4, 8, 16}; 248 | 249 | // GET batch sizes 250 | for (auto &batch_size : batch_size_vec) { 251 | printf("get. 
Batch size %zu\n", batch_size); 252 | sweep_do_one(hashmap, max_key, batch_size, Workload::kGets); 253 | } 254 | 255 | // SET batch sizes 256 | for (auto &batch_size : batch_size_vec) { 257 | printf("set. Batch size %zu\n", batch_size); 258 | sweep_do_one(hashmap, max_key, batch_size, Workload::kSets); 259 | } 260 | 261 | // 50/50 batch sizes 262 | for (auto &batch_size : batch_size_vec) { 263 | printf("50/50. Batch size %zu\n", batch_size); 264 | sweep_do_one(hashmap, max_key, batch_size, Workload::k5050); 265 | } 266 | 267 | // Optimizations for GETs 268 | hashmap->opts.prefetch = false; 269 | printf("get. Batch size 16, no prefetch.\n"); 270 | sweep_do_one(hashmap, max_key, 16, Workload::kGets); 271 | hashmap->opts.reset(); 272 | 273 | // Optimizations for SETs, and 50/50 274 | hashmap->opts.redo_batch = false; 275 | printf("set. Batch size 16, only redo batch disabled\n"); 276 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 277 | printf("50/50. Batch size 16, only redo batch disabled\n"); 278 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 279 | hashmap->opts.reset(); 280 | 281 | hashmap->opts.prefetch = false; 282 | printf("set. Batch size 16, only prefetch disabled.\n"); 283 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 284 | printf("50/50. Batch size 16, only prefetch disabled\n"); 285 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 286 | hashmap->opts.reset(); 287 | 288 | hashmap->opts.async_drain = false; 289 | printf("set. Batch size 16, only async slot drain disabled.\n"); 290 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 291 | printf("50/50. 
Batch size 16, only async slot drain disabled.\n"); 292 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 293 | hashmap->opts.reset(); 294 | 295 | delete hashmap; 296 | } 297 | 298 | int main(int argc, char **argv) { 299 | gflags::ParseCommandLineFlags(&argc, &argv, true); 300 | 301 | if (FLAGS_sweep_optimizations == 1) { 302 | std::thread t = std::thread(sweep_optimizations); 303 | bind_to_core(t, kNumaNode, 0); 304 | t.join(); 305 | exit(0); 306 | } 307 | 308 | barrier = new Barrier(FLAGS_num_threads); 309 | std::vector threads(FLAGS_num_threads); 310 | 311 | printf("Launching %zu threads\n", FLAGS_num_threads); 312 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 313 | threads[i] = std::thread(thread_func, i); 314 | bind_to_core(threads[i], kNumaNode, i); 315 | } 316 | 317 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 318 | threads[i].join(); 319 | } 320 | 321 | delete barrier; 322 | } 323 | -------------------------------------------------------------------------------- /mica_pmem/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../common.h" 7 | #include "pmica.h" 8 | 9 | #define table pmica 10 | 11 | DEFINE_string(pmem_file, "/dev/dax12.0", "Persistent memory file name"); 12 | DEFINE_uint64(table_key_capacity, MB(1), "Number of keys in table per thread"); 13 | DEFINE_uint64(batch_size, table::kMaxBatchSize, "Batch size"); 14 | DEFINE_string(benchmark, "get", "Benchmark to run"); 15 | DEFINE_uint64(num_threads, 1, "Number of threads"); 16 | DEFINE_uint64(sweep_optimizations, 0, "Sweep optimizations"); 17 | 18 | // 19 | // Overhead to occupancy map: 20 | // 0.05 -> 0.56 21 | static constexpr double kDefaultOverhead = 0.05; 22 | static constexpr double kNumaNode = 0; 23 | 24 | class Key { 25 | public: 26 | size_t key_frag[2]; 27 | bool operator==(const Key &rhs) const { 28 | return memcmp(this, &rhs, sizeof(Key)) == 0; 29 | } 30 | 
/// Map the 64-bit random number \p rand to a uniform value in [0, n) using a
/// widening multiply and shift instead of a modulo (Lemire's fast reduction).
static inline uint64_t fastrange64(uint64_t rand, uint64_t n) {
  return static_cast<uint64_t>(
      static_cast<__uint128_t>(rand) * static_cast<__uint128_t>(n) >> 64);
}
/// Generate a key for a thread's partition. Each partition hosts a contiguous
/// range of keys {1, ..., max_key}; the low five bits carry the thread ID so
/// partitions never collide.
static inline size_t gen_key(size_t offset_in_partition, size_t thread_id) {
  assert(thread_id <= 31);  // Only five low bits are reserved for the ID
  size_t key = offset_in_partition << 5;
  key |= thread_id;
  return key;
}
enum class Workload { kGets, kSets, k5050 }; 128 | double batch_exp(HashMap *hashmap, size_t max_key, size_t batch_size, 129 | Workload workload, size_t thread_id) { 130 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 131 | constexpr size_t kNumIters = MB(1); 132 | 133 | struct timespec start; 134 | bool is_set_arr[table::kMaxBatchSize]; 135 | Key key_arr[table::kMaxBatchSize]; 136 | Value val_arr[table::kMaxBatchSize]; 137 | Key *key_ptr_arr[table::kMaxBatchSize]; 138 | Value *val_ptr_arr[table::kMaxBatchSize]; 139 | bool success_arr[table::kMaxBatchSize]; 140 | clock_gettime(CLOCK_REALTIME, &start); 141 | 142 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 143 | key_ptr_arr[i] = &key_arr[i]; 144 | val_ptr_arr[i] = &val_arr[i]; 145 | } 146 | 147 | size_t num_success = 0; 148 | for (size_t i = 1; i <= kNumIters; i += batch_size) { 149 | for (size_t j = 0; j < batch_size; j++) { 150 | switch (workload) { 151 | case Workload::kGets: is_set_arr[j] = false; break; 152 | case Workload::kSets: is_set_arr[j] = true; break; 153 | case Workload::k5050: is_set_arr[j] = pcg() % 2 == 0; break; 154 | } 155 | 156 | size_t offset_in_partition = 1 + fastrange64(pcg(), max_key - 1); 157 | 158 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 159 | val_arr[j].val_frag[0] = is_set_arr[j] ? 
key_arr[j].key_frag[0] : 0; 160 | } 161 | 162 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 163 | val_ptr_arr, success_arr, batch_size); 164 | 165 | for (size_t j = 0; j < batch_size; j++) { 166 | num_success += success_arr[j]; 167 | if (!is_set_arr[j] && val_arr[j].val_frag[0] != key_arr[j].key_frag[0]) { 168 | printf("invalid value %zu for key %zu\n", val_arr[j].val_frag[0], 169 | key_arr[j].key_frag[0]); 170 | } 171 | } 172 | } 173 | 174 | double seconds = sec_since(start); 175 | double tput = kNumIters / (seconds * 1000000); 176 | return tput; 177 | } 178 | 179 | void thread_func(size_t thread_id) { 180 | size_t bytes_per_map = 181 | HashMap::get_required_bytes(FLAGS_table_key_capacity, kDefaultOverhead); 182 | bytes_per_map = roundup<256>(bytes_per_map); 183 | 184 | auto *hashmap = new HashMap(FLAGS_pmem_file, thread_id * bytes_per_map, 185 | FLAGS_table_key_capacity, kDefaultOverhead); 186 | 187 | printf("thread %zu: Populating hashmap. Expected time = %.1f seconds\n", 188 | thread_id, FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 189 | 190 | size_t max_key = populate(hashmap, thread_id); 191 | printf("thread %zu: final occupancy = %.2f\n", thread_id, 192 | max_key * 1.0 / hashmap->get_key_capacity()); 193 | 194 | std::vector tput_vec; 195 | Workload workload; 196 | if (FLAGS_benchmark == "set") workload = Workload::kSets; 197 | if (FLAGS_benchmark == "get") workload = Workload::kGets; 198 | if (FLAGS_benchmark == "5050") workload = Workload::k5050; 199 | 200 | printf("thread %zu, done populating. 
waiting for others.\n", thread_id); 201 | barrier->wait(); 202 | printf("thread %zu, starting work.\n", thread_id); 203 | 204 | for (size_t i = 0; i < 10; i++) { 205 | double tput = 206 | batch_exp(hashmap, max_key, FLAGS_batch_size, workload, thread_id); 207 | printf("thread %zu, iter %zu: tput = %.2f\n", thread_id, i, tput); 208 | tput_vec.push_back(tput); 209 | } 210 | 211 | double avg_tput = 212 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 213 | double _stddev = stddev(tput_vec); 214 | 215 | printf("thread %zu of %zu final M/s: %.2f avg, %.2f stddev\n", thread_id, 216 | FLAGS_num_threads, avg_tput, _stddev); 217 | 218 | delete hashmap; 219 | } 220 | 221 | // Measure the effectiveness of optimizations with one thread, given a config 222 | void sweep_do_one(HashMap *hashmap, size_t max_key, size_t batch_size, 223 | Workload workload) { 224 | std::vector tput_vec; 225 | 226 | for (size_t msr = 0; msr < 3; msr++) { 227 | double tput; 228 | tput = batch_exp(hashmap, max_key, batch_size, workload, 0 /* thread_id */); 229 | tput_vec.push_back(tput); 230 | } 231 | 232 | double avg_tput = 233 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 234 | double _stddev = stddev(tput_vec); 235 | 236 | printf(" Tput (M/s) = %.2f avg, %.2f stddev\n", avg_tput, _stddev); 237 | } 238 | 239 | // Measure the effectiveness of optimizations with one thread 240 | void sweep_optimizations() { 241 | auto *hashmap = new HashMap(FLAGS_pmem_file, 0, FLAGS_table_key_capacity, 242 | kDefaultOverhead); 243 | 244 | printf("Populating hashmap. 
Expected time = %.1f seconds\n", 245 | FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 246 | 247 | size_t max_key = populate(hashmap, 0 /* thread_id */); 248 | printf("Final occupancy = %.2f\n", 249 | max_key * 1.0 / hashmap->get_key_capacity()); 250 | 251 | std::vector batch_size_vec = {1, 4, 8, 16}; 252 | 253 | // SET batch sizes 254 | for (auto &batch_size : batch_size_vec) { 255 | printf("set. Batch size %zu\n", batch_size); 256 | sweep_do_one(hashmap, max_key, batch_size, Workload::kSets); 257 | } 258 | 259 | // GET batch sizes 260 | for (auto &batch_size : batch_size_vec) { 261 | printf("get. Batch size %zu\n", batch_size); 262 | sweep_do_one(hashmap, max_key, batch_size, Workload::kGets); 263 | } 264 | 265 | // 50/50 batch sizes 266 | for (auto &batch_size : batch_size_vec) { 267 | printf("50/50. Batch size %zu\n", batch_size); 268 | sweep_do_one(hashmap, max_key, batch_size, Workload::k5050); 269 | } 270 | 271 | // Optimizations for GETs 272 | hashmap->opts.prefetch = false; 273 | printf("get. Batch size 16, no prefetch.\n"); 274 | sweep_do_one(hashmap, max_key, 16, Workload::kGets); 275 | hashmap->opts.reset(); 276 | 277 | // Optimizations for SETs, and 50/50 278 | hashmap->opts.redo_batch = false; 279 | printf("set. Batch size 16, only redo batch disabled\n"); 280 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 281 | printf("50/50. Batch size 16, only redo batch disabled\n"); 282 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 283 | hashmap->opts.reset(); 284 | 285 | hashmap->opts.prefetch = false; 286 | printf("set. Batch size 16, only prefetch disabled.\n"); 287 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 288 | printf("50/50. Batch size 16, only prefetch disabled\n"); 289 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 290 | hashmap->opts.reset(); 291 | 292 | hashmap->opts.async_drain = false; 293 | printf("set. 
Batch size 16, only async slot drain disabled.\n"); 294 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 295 | printf("50/50. Batch size 16, only async slot drain disabled.\n"); 296 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 297 | hashmap->opts.reset(); 298 | 299 | delete hashmap; 300 | } 301 | 302 | int main(int argc, char **argv) { 303 | gflags::ParseCommandLineFlags(&argc, &argv, true); 304 | 305 | if (FLAGS_sweep_optimizations == 1) { 306 | std::thread t = std::thread(sweep_optimizations); 307 | bind_to_core(t, kNumaNode, 0); 308 | t.join(); 309 | exit(0); 310 | } 311 | 312 | barrier = new Barrier(FLAGS_num_threads); 313 | std::vector threads(FLAGS_num_threads); 314 | 315 | printf("Launching %zu threads\n", FLAGS_num_threads); 316 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 317 | threads[i] = std::thread(thread_func, i); 318 | bind_to_core(threads[i], kNumaNode, i); 319 | } 320 | 321 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 322 | threads[i].join(); 323 | } 324 | 325 | delete barrier; 326 | } 327 | -------------------------------------------------------------------------------- /nvme_perf/c/hello.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * BSD LICENSE 3 | * 4 | * Copyright (c) Intel Corporation. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above copyright 14 | * notice, this list of conditions and the following disclaimer in 15 | * the documentation and/or other materials provided with the 16 | * distribution. 
/* Linked-list node tracking one attached NVMe controller. */
struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct ctrlr_entry *next;
	char name[1024];	/* "model (serial)" string built in attach_cb() */
};

/* Linked-list node tracking one active namespace and the qpair used for
 * submitting I/O to it.
 */
struct ns_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_ns *ns;
	struct ns_entry *next;
	struct spdk_nvme_qpair *qpair;
};

/* Global singly-linked lists, populated during controller probe/attach and
 * torn down in cleanup().
 */
static struct ctrlr_entry *g_controllers = NULL;
static struct ns_entry *g_namespaces = NULL;
During initialization, the IDENTIFY data for the 64 | * controller is read using an NVMe admin command, and that data 65 | * can be retrieved using spdk_nvme_ctrlr_get_data() to get 66 | * detailed information on the controller. Refer to the NVMe 67 | * specification for more details on IDENTIFY for NVMe controllers. 68 | */ 69 | cdata = spdk_nvme_ctrlr_get_data(ctrlr); 70 | 71 | if (!spdk_nvme_ns_is_active(ns)) { 72 | printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 73 | cdata->mn, cdata->sn, 74 | spdk_nvme_ns_get_id(ns)); 75 | return; 76 | } 77 | 78 | entry = malloc(sizeof(struct ns_entry)); 79 | if (entry == NULL) { 80 | perror("ns_entry malloc"); 81 | exit(1); 82 | } 83 | 84 | entry->ctrlr = ctrlr; 85 | entry->ns = ns; 86 | entry->next = g_namespaces; 87 | g_namespaces = entry; 88 | 89 | printf(" Namespace ID: %d size: %juGB\n", spdk_nvme_ns_get_id(ns), 90 | spdk_nvme_ns_get_size(ns) / 1000000000); 91 | } 92 | 93 | struct hello_world_sequence { 94 | struct ns_entry *ns_entry; 95 | char *buf; 96 | unsigned using_cmb_io; 97 | int is_completed; 98 | }; 99 | 100 | static void 101 | read_complete(void *arg, const struct spdk_nvme_cpl *) 102 | { 103 | struct hello_world_sequence *sequence = arg; 104 | 105 | /* 106 | * The read I/O has completed. Print the contents of the 107 | * buffer, free the buffer, then mark the sequence as 108 | * completed. This will trigger the hello_world() function 109 | * to exit its polling loop. 110 | */ 111 | printf("%s", sequence->buf); 112 | spdk_free(sequence->buf); 113 | sequence->is_completed = 1; 114 | } 115 | 116 | static void 117 | write_complete(void *arg, const struct spdk_nvme_cpl *) 118 | { 119 | struct hello_world_sequence *sequence = arg; 120 | struct ns_entry *ns_entry = sequence->ns_entry; 121 | int rc; 122 | 123 | /* 124 | * The write I/O has completed. Free the buffer associated with 125 | * the write I/O and allocate a new zeroed buffer for reading 126 | * the data back from the NVMe namespace. 
/*
 * For each registered namespace: allocate an I/O qpair, write "Hello world!"
 * to LBA 0, poll until the chained write->read sequence completes, then free
 * the qpair. Completion callbacks (write_complete/read_complete) drive the
 * read-back and set sequence.is_completed.
 */
static void
hello_world(void)
{
	struct ns_entry *ns_entry;
	struct hello_world_sequence sequence;
	int rc;

	ns_entry = g_namespaces;
	while (ns_entry != NULL) {
		/*
		 * Allocate an I/O qpair for submitting read/write requests. The
		 * SPDK NVMe driver provides no synchronization for qpair access:
		 * only a single thread may submit and poll a given qpair.
		 */
		ns_entry->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_entry->ctrlr, NULL, 0);
		if (ns_entry->qpair == NULL) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair() failed\n");
			return;
		}

		/*
		 * Prefer the controller memory buffer (CMB) for the 4KB write
		 * buffer; fall back to pinned host memory if the controller has
		 * no CMB. Pinned memory is required for SPDK NVMe data buffers.
		 */
		sequence.using_cmb_io = 1;
		sequence.buf = spdk_nvme_ctrlr_alloc_cmb_io_buffer(ns_entry->ctrlr, 0x1000);
		if (sequence.buf == NULL) {
			sequence.using_cmb_io = 0;
			sequence.buf = spdk_zmalloc(0x1000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		}
		if (sequence.buf == NULL) {
			printf("ERROR: write buffer allocation failed\n");
			return;
		}
		if (sequence.using_cmb_io) {
			printf("INFO: using controller memory buffer for IO\n");
		} else {
			printf("INFO: using host memory buffer for IO\n");
		}
		sequence.is_completed = 0;
		sequence.ns_entry = ns_entry;

		/* Payload that will be written to LBA 0 and later read back */
		snprintf(sequence.buf, 0x1000, "%s", "Hello world!\n");

		/*
		 * Write the buffer to LBA 0. write_complete() is invoked with
		 * &sequence when the I/O finishes; it then submits the read-back.
		 */
		rc = spdk_nvme_ns_cmd_write(ns_entry->ns, ns_entry->qpair, sequence.buf,
					    0, /* LBA start */
					    1, /* number of LBAs */
					    write_complete, &sequence, 0);
		if (rc != 0) {
			fprintf(stderr, "starting write I/O failed\n");
			exit(1);
		}

		/*
		 * Poll for completions (0 = process all available). The driver
		 * only checks completions inside this call; it never blocks.
		 * read_complete() sets is_completed = 1 to break the loop.
		 */
		while (!sequence.is_completed) {
			spdk_nvme_qpair_process_completions(ns_entry->qpair, 0);
		}

		/* All I/O on this qpair is done, so it is safe to free it */
		spdk_nvme_ctrlr_free_io_qpair(ns_entry->qpair);
		ns_entry = ns_entry->next;
	}
}
292 | */ 293 | num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 294 | printf("Using controller %s with %d namespaces.\n", entry->name, num_ns); 295 | for (nsid = 1; nsid <= num_ns; nsid++) { 296 | ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 297 | if (ns == NULL) { 298 | continue; 299 | } 300 | register_ns(ctrlr, ns); 301 | } 302 | } 303 | 304 | static void 305 | cleanup(void) 306 | { 307 | struct ns_entry *ns_entry = g_namespaces; 308 | struct ctrlr_entry *ctrlr_entry = g_controllers; 309 | 310 | while (ns_entry) { 311 | struct ns_entry *next = ns_entry->next; 312 | free(ns_entry); 313 | ns_entry = next; 314 | } 315 | 316 | while (ctrlr_entry) { 317 | struct ctrlr_entry *next = ctrlr_entry->next; 318 | 319 | spdk_nvme_detach(ctrlr_entry->ctrlr); 320 | free(ctrlr_entry); 321 | ctrlr_entry = next; 322 | } 323 | } 324 | 325 | int main(int argc, char **argv) 326 | { 327 | int rc; 328 | struct spdk_env_opts opts; 329 | 330 | /* 331 | * SPDK relies on an abstraction around the local environment 332 | * named env that handles memory allocation and PCI device operations. 333 | * This library must be initialized first. 334 | * 335 | */ 336 | spdk_env_opts_init(&opts); 337 | opts.name = "hello_world"; 338 | opts.shm_id = 0; 339 | if (spdk_env_init(&opts) < 0) { 340 | fprintf(stderr, "Unable to initialize SPDK env\n"); 341 | return 1; 342 | } 343 | 344 | printf("Initializing NVMe Controllers\n"); 345 | 346 | /* 347 | * Start the SPDK NVMe enumeration process. probe_cb will be called 348 | * for each NVMe controller found, giving our application a choice on 349 | * whether to attach to each controller. attach_cb will then be 350 | * called for each controller after the SPDK NVMe driver has completed 351 | * initializing the controller we chose to attach. 
352 | */ 353 | rc = spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL); 354 | if (rc != 0) { 355 | fprintf(stderr, "spdk_nvme_probe() failed\n"); 356 | cleanup(); 357 | return 1; 358 | } 359 | 360 | if (g_controllers == NULL) { 361 | fprintf(stderr, "no NVMe controllers found\n"); 362 | cleanup(); 363 | return 1; 364 | } 365 | 366 | printf("Initialization complete.\n"); 367 | hello_world(); 368 | cleanup(); 369 | return 0; 370 | } 371 | -------------------------------------------------------------------------------- /ioat/huge_alloc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file huge_alloc.h 3 | * @brief A header-only fast hugepage allocator with no dependencies 4 | * @author Anuj Kalia 5 | * @date 2018-09-25 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | namespace hugealloc { 29 | 30 | static constexpr size_t kHugepageSize = (2 * 1024 * 1024); ///< Hugepage size 31 | 32 | // 33 | // Utility classes for HugeAlloc 34 | // 35 | 36 | template 37 | static constexpr inline bool is_power_of_two(T x) { 38 | return x && ((x & T(x - 1)) == 0); 39 | } 40 | template 41 | static constexpr inline T round_up(T x) { 42 | static_assert(is_power_of_two(power_of_two_number), 43 | "PowerOfTwoNumber must be a power of 2"); 44 | return ((x) + T(power_of_two_number - 1)) & (~T(power_of_two_number - 1)); 45 | } 46 | 47 | static inline void rt_assert(bool condition, std::string throw_str, char *s) { 48 | if (!condition) { 49 | throw std::runtime_error(throw_str + std::string(s)); 50 | } 51 | } 52 | 53 | /// Check a condition at runtime. If the condition is false, throw exception. 
54 | static inline void rt_assert(bool condition, std::string throw_str) { 55 | if (!condition) throw std::runtime_error(throw_str); 56 | } 57 | 58 | /// Check a condition at runtime. If the condition is false, throw exception. 59 | /// This is faster than rt_assert(cond, str) as it avoids string construction. 60 | static inline void rt_assert(bool condition) { 61 | if (!condition) throw std::runtime_error("Error"); 62 | } 63 | 64 | /// High-quality but slow RNG 65 | class SlowRand { 66 | std::random_device rand_dev; // Non-pseudorandom seed for twister 67 | std::mt19937_64 mt; 68 | std::uniform_int_distribution dist; 69 | 70 | public: 71 | SlowRand() : mt(rand_dev()), dist(0, UINT64_MAX) {} 72 | 73 | inline uint64_t next_u64() { return dist(mt); } 74 | }; 75 | 76 | // 77 | // Definitions for HugeAlloc 78 | // 79 | 80 | /// Information about an SHM region 81 | struct shm_region_t { 82 | // Constructor args 83 | const int shm_key; /// The key used to create the SHM region 84 | const uint8_t *buf; /// The start address of the allocated SHM buffer 85 | const size_t size; /// The size in bytes of the allocated SHM buffer 86 | 87 | shm_region_t(int shm_key, uint8_t *buf, size_t size) 88 | : shm_key(shm_key), buf(buf), size(size) { 89 | assert(size % kHugepageSize == 0); 90 | } 91 | }; 92 | 93 | /// The hugepage allocator returns Buffers 94 | class Buffer { 95 | public: 96 | Buffer(uint8_t *buf, size_t class_size) : buf(buf), class_size(class_size) {} 97 | 98 | Buffer() {} 99 | 100 | ~Buffer() { 101 | // The hugepage allocator frees up memory for its Buffers 102 | } 103 | 104 | /// Return a string representation of this Buffer (excluding lkey) 105 | std::string to_string() const { 106 | char ret[100]; 107 | sprintf(ret, "[buf %p, class sz %zu]", buf, class_size); 108 | return std::string(ret); 109 | } 110 | 111 | /// The backing memory of this Buffer. The Buffer is invalid if this is null. 
112 | uint8_t *buf; 113 | size_t class_size; ///< The size of the hugealloc class used for this Buffer 114 | }; 115 | 116 | /// Return the index of the most significant bit of x. The index of the 2^0 117 | /// bit is 1. (x = 0 returns 0, x = 1 returns 1.) 118 | static inline size_t msb_index(int x) { 119 | assert(x < INT32_MAX / 2); 120 | int index; 121 | asm("bsrl %1, %0" : "=r"(index) : "r"(x << 1)); 122 | return static_cast(index); 123 | } 124 | 125 | /** 126 | * A hugepage allocator that uses per-class freelists. The minimum class size 127 | * is kMinClassSize, and class size increases by a factor of 2 until 128 | * kMaxClassSize. 129 | * 130 | * Large Buffers split into smaller Buffers when needed. Small Buffers never 131 | * merge into larger Buffers. 132 | * 133 | * When a new SHM region is added to the allocator, it is split into Buffers of 134 | * size kMaxClassSize and added to that class. These Buffers are later split to 135 | * fill up smaller classes. 136 | * 137 | * The allocator uses randomly generated positive SHM keys, and deallocates the 138 | * SHM regions it creates when deleted. 
139 | */ 140 | class HugeAlloc { 141 | public: 142 | static constexpr const char *alloc_fail_help_str = 143 | "This could be due to insufficient huge pages or SHM limits."; 144 | static const size_t kMinClassSize = 64; /// Min allocation size 145 | static const size_t kMinClassBitShift = 6; /// For division by kMinClassSize 146 | static_assert((kMinClassSize >> kMinClassBitShift) == 1, ""); 147 | 148 | static const size_t kMaxClassSize = 8 * 1024 * 1024; /// Max allocation size 149 | 150 | // We fill-in physical addresses only when splitting larger Buffers into 151 | // hugepage-sized buffers 152 | static_assert(kMaxClassSize >= 2 * kHugepageSize, ""); 153 | 154 | static const size_t kNumClasses = 18; /// 64 B (2^6), ..., 8 MB (2^23) 155 | static_assert(kMaxClassSize == kMinClassSize << (kNumClasses - 1), ""); 156 | 157 | /// Return the maximum size of a class 158 | static constexpr size_t class_max_size(size_t class_i) { 159 | return kMinClassSize * (1ull << class_i); 160 | } 161 | 162 | /** 163 | * @brief Construct the hugepage allocator 164 | * @throw runtime_error if construction fails 165 | */ 166 | HugeAlloc(size_t initial_size, size_t numa_node) : numa_node(numa_node) { 167 | if (initial_size < kMaxClassSize) initial_size = kMaxClassSize; 168 | prev_allocation_size = initial_size; 169 | reserve_hugepages(prev_allocation_size); 170 | } 171 | 172 | HugeAlloc(size_t numa_node) : numa_node(numa_node) { 173 | prev_allocation_size = kMaxClassSize; 174 | reserve_hugepages(prev_allocation_size); 175 | } 176 | 177 | ~HugeAlloc() { 178 | for (shm_region_t &shm_region : shm_list) { 179 | int ret = 180 | shmdt(static_cast(const_cast(shm_region.buf))); 181 | if (ret != 0) { 182 | fprintf(stderr, "HugeAlloc: Error freeing SHM buf for key %d.\n", 183 | shm_region.shm_key); 184 | exit(-1); 185 | } 186 | } 187 | } 188 | 189 | /** 190 | * @brief Reserve size bytes as huge pages by adding hugepage-backed Buffers 191 | * to freelists. 
192 | * 193 | * @return True if the allocation succeeds. False if the allocation fails 194 | * because no more hugepages are available. 195 | */ 196 | bool reserve_hugepages(size_t size) { 197 | Buffer buffer = alloc_raw(size); 198 | if (buffer.buf == nullptr) return false; 199 | 200 | // Add Buffers to the largest class 201 | size_t num_buffers = size / kMaxClassSize; 202 | assert(num_buffers >= 1); 203 | for (size_t i = 0; i < num_buffers; i++) { 204 | uint8_t *buf = buffer.buf + (i * kMaxClassSize); 205 | freelist[kNumClasses - 1].push_back(Buffer(buf, kMaxClassSize)); 206 | } 207 | return true; 208 | } 209 | 210 | /** 211 | * @brief Allocate memory using raw SHM operations, always bypassing the 212 | * allocator's freelists. Unlike alloc(), the size of the allocated memory 213 | * need not fit in the allocator's max class size. 214 | * 215 | * Allocated memory can be freed only when this allocator is destroyed, i.e., 216 | * free_buf() cannot be used. Use alloc() if freeing is needed. 217 | * 218 | * @param size The minimum size of the allocated memory 219 | * 220 | * @return The allocated hugepage-backed Buffer. buffer.buf is nullptr if we 221 | * ran out of memory. buffer.class_size is set to SIZE_MAX to indicate that 222 | * allocator classes were not used. 223 | * 224 | * @throw runtime_error if hugepage reservation failure is catastrophic 225 | */ 226 | Buffer alloc_raw(size_t size) { 227 | std::ostringstream xmsg; // The exception message 228 | size = round_up(size); 229 | int shm_key, shm_id; 230 | 231 | while (true) { 232 | // Choose a positive SHM key. Negative is fine but it looks scary in the 233 | // error message. 234 | shm_key = static_cast(slow_rand.next_u64()); 235 | shm_key = std::abs(shm_key); 236 | 237 | // Try to get an SHM region 238 | shm_id = shmget(shm_key, size, IPC_CREAT | IPC_EXCL | 0666 | SHM_HUGETLB); 239 | 240 | if (shm_id == -1) { 241 | switch (errno) { 242 | case EEXIST: 243 | continue; // shm_key already exists. Try again. 
244 | 245 | case EACCES: 246 | xmsg << "HugeAlloc: SHM allocation error. " 247 | << "Insufficient permissions."; 248 | throw std::runtime_error(xmsg.str()); 249 | 250 | case EINVAL: 251 | xmsg << "HugeAlloc: SHM allocation error: SHMMAX/SHMIN " 252 | << "mismatch. size = " << std::to_string(size) << " (" 253 | << std::to_string(size / (1024 * 1024)) << " MB)."; 254 | throw std::runtime_error(xmsg.str()); 255 | 256 | case ENOMEM: 257 | // Out of memory - this is OK 258 | return Buffer(nullptr, 0); 259 | 260 | default: 261 | xmsg << "HugeAlloc: Unexpected SHM malloc error " 262 | << strerror(errno); 263 | throw std::runtime_error(xmsg.str()); 264 | } 265 | } else { 266 | // shm_key worked. Break out of the while loop. 267 | break; 268 | } 269 | } 270 | 271 | uint8_t *shm_buf = static_cast(shmat(shm_id, nullptr, 0)); 272 | rt_assert(shm_buf != nullptr, 273 | "HugeAlloc: shmat() failed. Key = " + std::to_string(shm_key)); 274 | 275 | rt_assert(reinterpret_cast(shm_buf) % kHugepageSize == 0, 276 | "SHM buffer isn't aligned to hugepage size"); 277 | 278 | // Mark the SHM region for deletion when this process exits 279 | shmctl(shm_id, IPC_RMID, nullptr); 280 | 281 | // Bind the buffer to the NUMA node 282 | const unsigned long nodemask = 283 | (1ul << static_cast(numa_node)); 284 | long ret = mbind(shm_buf, size, MPOL_BIND, &nodemask, 32, 0); 285 | rt_assert(ret == 0, 286 | "HugeAlloc: mbind() failed. Key " + std::to_string(shm_key)); 287 | 288 | // Save the SHM region so we can free it later 289 | shm_list.push_back(shm_region_t(shm_key, shm_buf, size)); 290 | stats.shm_reserved += size; 291 | 292 | // buffer.class_size is invalid because we didn't allocate from a class 293 | return Buffer(shm_buf, SIZE_MAX); 294 | } 295 | 296 | /** 297 | * @brief Allocate a Buffer using the allocator's freelists, i.e., the max 298 | * size that can be allocated is the max freelist class size. 299 | * 300 | * @param size The minimum size of the allocated Buffer. 
size need not 301 | * equal a class size. 302 | * 303 | * @return The allocated buffer. The buffer is invalid if we ran out of 304 | * memory. The class_size of the allocated Buffer is equal to a 305 | * HugeAlloc class size. 306 | * 307 | * @throw runtime_error if size is too large for the allocator, or if 308 | * hugepage reservation failure is catastrophic 309 | */ 310 | Buffer alloc(size_t size) { 311 | assert(size <= kMaxClassSize); 312 | const size_t size_class = get_class(size); 313 | 314 | if (freelist[size_class].empty()) { 315 | // There is no free Buffer in this class. Find the first larger class with 316 | // free Buffers. 317 | size_t next_class = size_class + 1; 318 | for (; next_class < kNumClasses; next_class++) { 319 | if (!freelist[next_class].empty()) break; 320 | } 321 | 322 | if (next_class == kNumClasses) { 323 | // There's no larger size class with free pages, we need to allocate 324 | // more hugepages. This adds some Buffers to the largest class. 325 | prev_allocation_size *= 2; 326 | bool success = reserve_hugepages(prev_allocation_size); 327 | 328 | if (!success) { 329 | prev_allocation_size /= 2; // Restore the previous allocation 330 | return Buffer(nullptr, 0); 331 | } 332 | 333 | next_class = kNumClasses - 1; 334 | } 335 | 336 | while (next_class != size_class) { 337 | split(next_class); 338 | next_class--; 339 | } 340 | } 341 | 342 | assert(!freelist[size_class].empty()); 343 | 344 | Buffer buffer = freelist[size_class].back(); 345 | freelist[size_class].pop_back(); 346 | stats.user_alloc_tot += buffer.class_size; 347 | return buffer; 348 | } 349 | 350 | /// Free a Buffer 351 | inline void free_buf(Buffer buffer) { 352 | assert(buffer.buf != nullptr); 353 | 354 | size_t size_class = get_class(buffer.class_size); 355 | assert(class_max_size(size_class) == buffer.class_size); 356 | 357 | freelist[size_class].push_back(buffer); 358 | stats.user_alloc_tot -= buffer.class_size; 359 | } 360 | 361 | inline size_t get_numa_node() { return 
numa_node; } 362 | 363 | /// Return the total amount of memory reserved as hugepages 364 | inline size_t get_stat_shm_reserved() const { 365 | assert(stats.shm_reserved % kHugepageSize == 0); 366 | return stats.shm_reserved; 367 | } 368 | 369 | /// Return the total amoung of memory allocated to the user 370 | inline size_t get_stat_user_alloc_tot() const { 371 | assert(stats.user_alloc_tot % kMinClassSize == 0); 372 | return stats.user_alloc_tot; 373 | } 374 | 375 | private: 376 | /** 377 | * @brief Get the class index for a Buffer size 378 | * @param size The size of the buffer, which may or may not be a class size 379 | */ 380 | inline size_t get_class(size_t size) { 381 | assert(size >= 1 && size <= kMaxClassSize); 382 | // Use bit shift instead of division to make debug-mode code a faster 383 | return msb_index(static_cast((size - 1) >> kMinClassBitShift)); 384 | } 385 | 386 | /// Reference function for the optimized get_class function above 387 | inline size_t get_class_slow(size_t size) { 388 | assert(size >= 1 && size <= kMaxClassSize); 389 | 390 | size_t size_class = 0; // The size class for size 391 | size_t class_lim = kMinClassSize; // The max size for size_class 392 | while (size > class_lim) { 393 | size_class++; 394 | class_lim *= 2; 395 | } 396 | 397 | return size_class; 398 | } 399 | 400 | /// Split one Buffers from class size_class into two Buffers of the previous 401 | /// class. 
402 | inline void split(size_t size_class) { 403 | Buffer buffer = freelist[size_class].back(); 404 | freelist[size_class].pop_back(); 405 | 406 | Buffer buffer_0 = Buffer(buffer.buf, buffer.class_size / 2); 407 | Buffer buffer_1 = 408 | Buffer(buffer.buf + buffer.class_size / 2, buffer.class_size / 2); 409 | 410 | freelist[size_class - 1].push_back(buffer_0); 411 | freelist[size_class - 1].push_back(buffer_1); 412 | } 413 | 414 | std::vector shm_list; /// SHM regions by increasing alloc size 415 | std::vector freelist[kNumClasses]; /// Per-class freelist 416 | 417 | SlowRand slow_rand; /// RNG to generate SHM keys 418 | const size_t numa_node; /// NUMA node on which all memory is allocated 419 | size_t prev_allocation_size; /// Size of previous hugepage reservation 420 | 421 | // Stats 422 | struct { 423 | size_t shm_reserved = 0; /// Total hugepage memory reserved by allocator 424 | size_t user_alloc_tot = 0; /// Total memory allocated to user 425 | } stats; 426 | }; 427 | 428 | } // namespace hugealloc 429 | --------------------------------------------------------------------------------