├── mica_pmem ├── .gitignore ├── Makefile ├── run.sh ├── test.cc └── bench.cc ├── scripts ├── .gitignore ├── clean.sh ├── mlx_env.sh ├── reconfigure_fsdax_to_devdax.sh ├── utils.sh └── ipmctl_watch.sh ├── README.md ├── nvme_perf ├── c │ ├── .gitignore │ ├── Makefile │ └── hello.c └── latency.sh ├── rdma ├── rdma-write-bw │ ├── .gitignore │ ├── kill.sh │ ├── config │ ├── README.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ └── main.cc ├── rw-tput-receiver │ ├── .gitignore │ ├── kill.sh │ ├── config │ ├── notes.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ └── main.cc ├── rdma-write-flush-lat │ ├── .gitignore │ ├── kill.sh │ ├── README.md │ ├── Makefile │ ├── run-servers.sh │ ├── run-machine.sh │ ├── latency.h │ └── main.cc └── libhrd_cpp │ └── hrd.h ├── hopscotch_pmem ├── .gitignore ├── Makefile ├── run.sh ├── LICENSE ├── test.cc └── bench.cc ├── hog ├── hog ├── Makefile ├── run.sh └── hog.cc ├── .clang-format ├── log_store ├── Makefile ├── run.sh ├── rotating_counter.h └── bench.cc ├── ioat ├── .clang-format ├── Makefile ├── setup_dpdk.sh ├── run.sh ├── virt2phy.h ├── bench.cc └── huge_alloc.h ├── cacheline_versions ├── Makefile ├── sweep.sh ├── run.sh └── bench.cc ├── microbench ├── read_latency │ ├── Makefile │ ├── run.sh │ └── bench.cc ├── write_latency │ ├── Makefile │ ├── run.sh │ └── bench.cc ├── Makefile ├── run.sh ├── README ├── seq_read_tput.h ├── rand_write_tput.h ├── bench.h ├── rand_read_tput.h ├── seq_write_tput.h ├── rand_write_latency.h ├── rand_read_latency.h ├── seq_write_latency.h └── bench.cc ├── pmemkv_perf ├── run.sh └── bench.cc ├── circular_writes_tput ├── dram_only │ ├── Makefile │ ├── sweep.sh │ ├── run.sh │ └── bench.cc ├── Makefile ├── sweep.sh ├── run.sh └── bench.cc ├── randomizer ├── Makefile └── main.cc ├── .gitignore ├── utils ├── hdr_histogram_wrapper.h └── timer.h ├── .ycm_extra_conf.py └── common.h /mica_pmem/.gitignore: 
-------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /scripts/.gitignore: -------------------------------------------------------------------------------- 1 | watch_out 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Persistent memory benchmarks 2 | -------------------------------------------------------------------------------- /nvme_perf/c/.gitignore: -------------------------------------------------------------------------------- 1 | perf 2 | hello 3 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/.gitignore: -------------------------------------------------------------------------------- 1 | write-bw 2 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/.gitignore: -------------------------------------------------------------------------------- 1 | main 2 | -------------------------------------------------------------------------------- /hopscotch_pmem/.gitignore: -------------------------------------------------------------------------------- 1 | test 2 | bench 3 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/.gitignore: -------------------------------------------------------------------------------- 1 | write-flush 2 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /hog/hog: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anujkaliaiitd/pmem-bench/HEAD/hog/hog -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/kill.sh: -------------------------------------------------------------------------------- 1 | sudo pkill main 2 | sudo pkill memcached 3 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | AllowShortCaseLabelsOnASingleLine: true 3 | -------------------------------------------------------------------------------- /hog/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o hog hog.cc -lpmem -march=native 3 | clean: 4 | rm hog 5 | -------------------------------------------------------------------------------- /hog/run.sh: -------------------------------------------------------------------------------- 1 | make 2 | rm -f /tmp/hogout 3 | sudo -E taskset -c 23 ./hog > /tmp/hogout & 4 | -------------------------------------------------------------------------------- /log_store/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/config: -------------------------------------------------------------------------------- 1 | --min_write_size 1024 2 | --max_write_size 131072 3 | --window_size 32 4 | -------------------------------------------------------------------------------- /ioat/.clang-format: 
-------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | AllowShortCaseLabelsOnASingleLine: true 3 | SortIncludes: false 4 | -------------------------------------------------------------------------------- /cacheline_versions/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native -lgflags 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /microbench/read_latency/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -g -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /microbench/write_latency/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -g -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /pmemkv_perf/run.sh: -------------------------------------------------------------------------------- 1 | pmempool rm --verbose /dev/dax0.0 2 | pmempool create --layout pmemkv obj /dev/dax0.0 3 | ./bench 4 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /circular_writes_tput/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o bench bench.cc -lpmem -march=native -lgflags -lpthread -lnuma 3 | clean: 4 | rm main 5 | 
-------------------------------------------------------------------------------- /rdma/rw-tput-receiver/config: -------------------------------------------------------------------------------- 1 | --num_client_processes 2 2 | --num_threads_per_client 4 3 | --use_uc 0 4 | --size 64 5 | --postlist 4 6 | --do_read 0 7 | -------------------------------------------------------------------------------- /randomizer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -O3 -o main main.cc -lpmem -lhdr_histogram_static -lpthread -lgflags -march=native -lnuma -lcityhash 3 | clean: 4 | rm main 5 | -------------------------------------------------------------------------------- /scripts/clean.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Clean temporary files generated by CMake for eRPC 3 | rm -rf build CMakeFiles CMakeCache.txt cmake_install.cmake CTestTestfile.cmake Makefile Testing 4 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/notes.md: -------------------------------------------------------------------------------- 1 | # Oct 27, with commit 32b029c: 2 | * One Optane DIMM at server, DDIO disabled 3 | * 64-byte random writes from 8 clients spread over two machines get 9 M/s 4 | total 5 | -------------------------------------------------------------------------------- /hopscotch_pmem/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -g -o test test.cc -lcityhash -lgtest -lpmem 3 | g++ -g -O3 -DNDEBUG bench.cc -o bench -lpmem -lcityhash -lpthread -lgtest -lnuma -lgflags -march=native 4 | clean: 5 | rm test bench 6 | -------------------------------------------------------------------------------- /mica_pmem/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ -g test.cc -o test -lpmem 
-lcityhash -lpthread -lgtest -lnuma 3 | g++ -g -O3 -DNDEBUG bench.cc -o bench -lpmem -lcityhash -lpthread -lgtest -lnuma -lgflags -march=native 4 | clean: 5 | rm test 6 | -------------------------------------------------------------------------------- /scripts/mlx_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Config for Mellanox userspace driver 3 | 4 | export MLX4_SINGLE_THREADED=1 5 | export MLX5_SINGLE_THREADED=1 6 | export MLX_QP_ALLOC_TYPE="HUGE" 7 | export MLX_CQ_ALLOC_TYPE="HUGE" 8 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/sweep.sh: -------------------------------------------------------------------------------- 1 | for num_counters in `seq 1 32`; do 2 | make 1>/dev/null 2>/dev/null 3 | t=`/usr/bin/time -f "%e" numactl --physcpubind=3 --membind=0 ./bench $num_counters` 4 | 5 | #echo "$kNumCounters;$t" 6 | done 7 | -------------------------------------------------------------------------------- /nvme_perf/c/Makefile: -------------------------------------------------------------------------------- 1 | # On the Intel AEP servers, SPDK is installed at system-level, but DPDK isn't. 
2 | all: 3 | gcc -o hello hello.c -L /home/akalia/sandbox/spdk/dpdk/build/lib \ 4 | -lspdk_nvme -lspdk_util -lspdk_env_dpdk -lspdk_log \ 5 | -lpthread -ldpdk -lnuma -ldl -luuid 6 | -------------------------------------------------------------------------------- /circular_writes_tput/sweep.sh: -------------------------------------------------------------------------------- 1 | for num_counters in 1 2 3 4 5 8 16; do 2 | for stride_size in 64 256; do 3 | rm config.h 4 | touch config.h 5 | 6 | sudo -E env numactl --physcpubind=3 --membind=0 ./bench \ 7 | --num_counters=$num_counters \ 8 | --stride_size=$stride_size 9 | done 10 | done 11 | -------------------------------------------------------------------------------- /microbench/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=bench.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o bench ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm bench 10 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/README.md: -------------------------------------------------------------------------------- 1 | Latency to flush a write from remote NIC cache 2 | 3 | DRAM writes: 4 | * 16-byte inline WRITE latency = 1.3 us (median and 99th percentile) 5 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.6 us 50%, 2.9 us 99% 6 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.4 us 50%, 2.6 us 99% 7 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/README.md: -------------------------------------------------------------------------------- 1 | Latency to flush a write from remote NIC cache 2 | 3 | DRAM writes: 4 | * 16-byte inline WRITE latency = 1.3 us (median and 
99th percentile) 5 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.6 us 50%, 2.9 us 99% 6 | * 16-byte inline WRITE latency with a READ flush, without DDIO = 2.4 us 50%, 2.6 us 99% 7 | -------------------------------------------------------------------------------- /cacheline_versions/sweep.sh: -------------------------------------------------------------------------------- 1 | for kNumCounters in 1 2 3 4 5 8 16; do 2 | for kStrideSize in 256 4096; do 3 | rm config.h 4 | touch config.h 5 | 6 | echo "static constexpr size_t kNumCounters = $kNumCounters;" >> config.h 7 | echo "static constexpr size_t kStrideSize = $kStrideSize;" >> config.h 8 | 9 | make 10 | ./run.sh 11 | done 12 | done 13 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o write-bw ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-bw 10 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/Makefile: -------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma -lpmem 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o main ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-bw 10 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/Makefile: 
-------------------------------------------------------------------------------- 1 | CPP_FLAGS=-Wall -Wextra -Werror -pedantic -Wsign-conversion -Wold-style-cast -Wno-unused-function -march=native 2 | SOURCES=../libhrd_cpp/hrd_conn.cc ../libhrd_cpp/hrd_util.cc main.cc 3 | LIBS=-libverbs -lgtest -lpthread -lmemcached -lgflags -lnuma 4 | 5 | all: 6 | g++ -std=c++11 -O3 ${CPP_FLAGS} -o write-flush ${SOURCES} ${LIBS} 7 | 8 | clean: 9 | rm write-flush 10 | -------------------------------------------------------------------------------- /circular_writes_tput/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | if [ "$#" -gt 1 ]; then 5 | blue "Illegal number of arguments." 6 | blue "Usage: ./run.sh, or ./run.sh gdb" 7 | exit 8 | fi 9 | 10 | # Check for non-gdb mode 11 | if [ "$#" -eq 0 ]; then 12 | sudo -E env numactl --physcpubind=3 --membind=0 $exe 13 | fi 14 | 15 | # Check for gdb mode 16 | if [ "$#" -eq 1 ]; then 17 | gdb -ex run --args $exe --num_threads=$num_threads 18 | fi 19 | -------------------------------------------------------------------------------- /ioat/Makefile: -------------------------------------------------------------------------------- 1 | DPDK_HOME=/usr 2 | 3 | CFLAGS=-Wall -Wextra -Werror -pedantic -fpermissive -march=native \ 4 | -Wold-style-cast -Wsign-conversion \ 5 | -Wno-unused-function 6 | 7 | all: 8 | g++ -O3 -std=c++11 ${CFLAGS} -o bench bench.cc -isystem ${DPDK_HOME}/include/dpdk/ -march=native -L ${DPDK_HOME}/lib/ \ 9 | -Wl,--whole-archive \ 10 | -ldpdk -lnuma -lpthread -ldl -lm -lgflags -lpmem \ 11 | -Wl,--no-whole-archive \ 12 | 13 | clean: 14 | rm bench 15 | -------------------------------------------------------------------------------- /log_store/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of 
arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/read_latency/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /cacheline_versions/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | sudo -E numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 
8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | sudo -E numactl --cpunodebind=0 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | sudo -E gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /microbench/write_latency/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | num_threads=1 5 | 6 | if [ "$#" -gt 1 ]; then 7 | blue "Illegal number of arguments." 8 | blue "Usage: ./run.sh, or ./run.sh gdb" 9 | exit 10 | fi 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 0 ]; then 14 | numactl --physcpubind=3 --membind=0 $exe --num_threads=$num_threads 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 1 ]; then 19 | gdb -ex run --args $exe --num_threads=$num_threads 20 | fi 21 | -------------------------------------------------------------------------------- /scripts/reconfigure_fsdax_to_devdax.sh: -------------------------------------------------------------------------------- 1 | # Destroy fsdax namespaces 2 | 3 | # This can fail even when /mnt/pmem0 and /mnt/pmem1 are empty. Rebooting solves it. 4 | echo "Unmounting" 5 | sudo umount /mnt/pmem0 6 | sudo umount /mnt/pmem1 7 | 8 | echo "Destroying namespaces via ndctl. This takes a while." 9 | sudo ndctl destroy-namespace -f all 10 | 11 | echo "Recreating devdax namespaces" 12 | sudo ndctl create-namespace --mode devdax --region 0 13 | sudo ndctl create-namespace --mode devdax --region 1 14 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/run.sh: -------------------------------------------------------------------------------- 1 | exe="./bench" 2 | chmod +x $exe 3 | 4 | if [ "$#" -gt 2 ]; then 5 | blue "Illegal number of arguments." 
6 | blue "Usage: ./run.sh , or ./run.sh " 7 | exit 8 | fi 9 | 10 | num_counters=$1 11 | 12 | # Check for non-gdb mode 13 | if [ "$#" -eq 1 ]; then 14 | sudo numactl --physcpubind=3 --membind=0 $exe $num_counters 15 | fi 16 | 17 | # Check for gdb mode 18 | if [ "$#" -eq 2 ]; then 19 | gdb -ex run --args $exe $num_counters 20 | fi 21 | -------------------------------------------------------------------------------- /ioat/setup_dpdk.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | dpdk=~/sandbox/dpdk-19.08/ 3 | 4 | sudo modprobe uio 5 | sudo insmod $dpdk/x86_64-native-linux-gcc/kmod/igb_uio.ko 6 | 7 | # Create hugepage mount 8 | sudo mkdir -p /mnt/huge 9 | grep -s /mnt/huge /proc/mounts > /dev/null 10 | 11 | if [ $? -ne 0 ] ; then 12 | sudo mount -t hugetlbfs nodev /mnt/huge 13 | fi 14 | 15 | # Bind IOAT devices on NUMA node 0, choose igb_uio (userspace) or ioatdma (kernel) 16 | for i in `seq 0 7`; do 17 | sudo ${dpdk}/usertools/dpdk-devbind.py -b igb_uio 0000:00:04.$i 18 | done 19 | 20 | # Bind IOAT devices on NUMA node 1, choose igb_uio (userspace) or ioatdma (kernel) 21 | for i in `seq 0 7`; do 22 | sudo ${dpdk}/usertools/dpdk-devbind.py -b ioatdma 0000:80:04.$i 23 | done 24 | -------------------------------------------------------------------------------- /scripts/utils.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Utilities for other scripts 3 | 4 | # Echo in blue color 5 | function blue() { 6 | es=`tput setaf 4` 7 | ee=`tput sgr0` 8 | echo "${es}$1${ee}" 9 | } 10 | 11 | # Drop all SHM 12 | function drop_shm() { 13 | for i in $(ipcs -m | awk '{ print $2; }'); do 14 | sudo ipcrm -m $i 2>/dev/null 15 | done 16 | } 17 | 18 | # Check if an environment variable is set. If it is not, exit. 19 | function check_env() { 20 | if [ -z "$1" ]; then 21 | echo "utils: Environment variable $1 not set. Exiting." 
22 | exit 23 | fi 24 | } 25 | 26 | # Check if a file ($1) exists. If it does not, exit. 27 | function assert_file_exists() { 28 | if [ ! -f $1 ]; then 29 | echo "utils: File $1 not found! Exiting." 30 | exit 0 31 | fi 32 | } 33 | -------------------------------------------------------------------------------- /circular_writes_tput/dram_only/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define clwb(addr) \ 7 | asm volatile(".byte 0x66; xsaveopt %0" : "+m"(*(volatile char *)(addr))); 8 | 9 | int main(int argc, char **argv) { 10 | if (argc <= 1) { 11 | printf("Usage ./bench [num_counters]\n"); 12 | exit(0); 13 | } 14 | 15 | size_t num_counters = static_cast(atoi(argv[1])); 16 | uint8_t *buf = reinterpret_cast(memalign(num_counters * 64, 4096)); 17 | 18 | size_t data = 0; 19 | for (size_t i = 0; i < 10000000; i++) { 20 | size_t buf_offset = (i % num_counters) * 64; 21 | buf[buf_offset] = data++; 22 | clwb(&buf[buf_offset]); 23 | asm volatile("sfence" ::: "memory"); 24 | } 25 | 26 | free(buf); 27 | } 28 | -------------------------------------------------------------------------------- /microbench/README: -------------------------------------------------------------------------------- 1 | Persistent Memory Development Kit 2 | 3 | This is examples/libpmem/README. 4 | 5 | This directory contains examples for libpmem, the library containing 6 | low-level persistent memory support. A detailed explanation of these 7 | examples can be found here: http://pmem.io/pmdk/libpmem 8 | 9 | manpage.c is the example used in the libpmem man page. 10 | 11 | simple_copy.c is a simple pmem_memcpy() example. 12 | 13 | full_copy.c shows how to use pmem_memcpy_nodrain(). 
14 | 15 | To build these examples: 16 | make 17 | 18 | These examples can be built against an installed system using: 19 | make LIBDIR=/usr/lib INCDIR=/usr/include 20 | 21 | If you're looking for documentation to get you started using PMDK, 22 | start here: http://pmem.io/pmdk and follow the links to examples and 23 | man pages. Developers new to PMDK are probably looking for libpmemobj. 24 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bench 2 | # Object files 3 | *.o 4 | *.ko 5 | *.obj 6 | *.elf 7 | main 8 | tags 9 | 1 10 | 11 | # Precompiled Headers 12 | *.gch 13 | *.pch 14 | 15 | # Libraries 16 | *.lib 17 | *.a 18 | *.la 19 | *.lo 20 | 21 | # Shared objects (inc. Windows DLLs) 22 | *.dll 23 | *.so 24 | *.so.* 25 | *.dylib 26 | 27 | # Executables 28 | *.exe 29 | *.out 30 | *.app 31 | *.i*86 32 | *.x86_64 33 | *.hex 34 | 35 | # Debug files 36 | *.dSYM/ 37 | 38 | # Apt NFS files 39 | .nfs000* 40 | 41 | # CMake 42 | build 43 | CMakeCache.txt 44 | CMakeFiles 45 | cmake_install.cmake 46 | CMakeScripts 47 | Testing 48 | CTestTestfile.cmake 49 | 50 | # gdb 51 | .gdb_history 52 | 53 | # Mac 54 | .DS_Store 55 | 56 | # Doxygen 57 | html 58 | latex 59 | 60 | # Common temp files 61 | ibnet_out 62 | sweep_out 63 | scripts/autorun_process_file 64 | scripts/autorun_app_file 65 | gdb_history 66 | *.swp 67 | src/config.h 68 | .ycm_extra_conf.py 69 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | drop_shm 9 | 
exe="./write-flush" 10 | chmod +x $exe 11 | 12 | blue "Reset server QP registry" 13 | sudo pkill memcached 14 | 15 | # Spawn memcached, but wait for it to start 16 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 17 | while ! nc -z localhost 11211; do sleep .1; done 18 | echo "Server: memcached server is open for business on port 11211" 19 | 20 | # Check for non-gdb mode 21 | if [ "$#" -eq 0 ]; then 22 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 0 23 | fi 24 | 25 | # Check for gdb mode 26 | if [ "$#" -eq 1 ]; then 27 | sudo -E gdb -ex run --args $exe --is_client 0 28 | fi 29 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | blue "Dropping SHM regions" 9 | drop_shm 10 | exe="./main" 11 | chmod +x $exe 12 | 13 | blue "Reset server QP registry" 14 | sudo pkill memcached 15 | 16 | # Spawn memcached, but wait for it to start 17 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 18 | while ! 
nc -z localhost 11211; do sleep .1; done 19 | echo "Server: memcached server is open for business on port 11211" 20 | 21 | # Check for non-gdb mode 22 | if [ "$#" -eq 0 ]; then 23 | sudo -E numactl --cpunodebind=0 --membind=0 $exe \ 24 | --is_client 0 $(cat config) 25 | fi 26 | 27 | # Check for gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E gdb -ex run --args $exe --is_client 0 $(cat config) 30 | fi 31 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/run-servers.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | blue "Dropping SHM regions" 9 | drop_shm 10 | exe="./write-bw" 11 | chmod +x $exe 12 | 13 | blue "Reset server QP registry" 14 | sudo pkill memcached 15 | 16 | # Spawn memcached, but wait for it to start 17 | memcached -l 0.0.0.0 1>/dev/null 2>/dev/null & 18 | while ! 
nc -z localhost 11211; do sleep .1; done 19 | echo "Server: memcached server is open for business on port 11211" 20 | 21 | # Check for non-gdb mode 22 | if [ "$#" -eq 0 ]; then 23 | sudo -E numactl --cpunodebind=0 --membind=0 $exe \ 24 | --is_client 0 $(cat config) 25 | fi 26 | 27 | # Check for gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E gdb -ex run --args $exe --is_client 0 $(cat config) 30 | fi 31 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | drop_shm 9 | exe="./write-flush" 10 | chmod +x $exe 11 | 12 | # Check number of arguments 13 | if [ "$#" -gt 2 ]; then 14 | blue "Illegal number of arguments." 15 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 16 | exit 17 | fi 18 | 19 | if [ "$#" -eq 0 ]; then 20 | blue "Illegal number of arguments." 
21 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 22 | exit 23 | fi 24 | 25 | # Check for non-gdb mode 26 | if [ "$#" -eq 1 ]; then 27 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 1 28 | fi 29 | 30 | # Check for gdb mode 31 | if [ "$#" -eq 2 ]; then 32 | sudo -E gdb -ex run --args $exe --is_client 1 33 | fi 34 | -------------------------------------------------------------------------------- /microbench/seq_read_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_read_tput(uint8_t *pbuf, size_t thread_id, size_t num_threads) { 4 | static constexpr size_t kReadSize = MB(256); 5 | auto *buf = new uint8_t[kReadSize]; 6 | 7 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 8 | struct timespec start; 9 | size_t sum = 0; 10 | 11 | for (size_t iter = 0; iter < 20; iter++) { 12 | clock_gettime(CLOCK_REALTIME, &start); 13 | 14 | // Generate a 64-byte aligned address to read kReadSize bytes from 15 | size_t start_address = roundup<64>(pcg() % kPmemFileSize); 16 | if (start_address + kReadSize >= kPmemFileSize) { 17 | iter--; 18 | continue; 19 | } 20 | 21 | memcpy(buf, &pbuf[start_address], kReadSize); 22 | sum += buf[pcg() % kReadSize]; 23 | 24 | double tot_sec = sec_since(start); 25 | printf("Thread %zu of %zu, seq read tput = %.2f GB/sec, sum = %zu\n", 26 | thread_id, num_threads, kReadSize * 1.0 / (GB(1) * tot_sec), sum); 27 | } 28 | 29 | delete[] buf; 30 | } 31 | -------------------------------------------------------------------------------- /mica_pmem/run.sh: -------------------------------------------------------------------------------- 1 | batch_size=16 2 | benchmark=5050 3 | sweep_optimizations=1 4 | pmem_file="/mnt/pmem12/raft_log" 5 | 6 | one_million=1048576 # Just a constant to adjust keys_total below 7 | keys_total=`expr 1024 \* $one_million` 8 | 9 | rm -rf /tmp/mica_bench* 10 | 11 | for num_threads in 1 2 4 8 16 24; do 12 | keys_per_thread=`expr 
$keys_total / $num_threads` 13 | 14 | # Non-GDB mode 15 | if [ "$#" -eq 0 ]; then 16 | numactl --cpunodebind=0 --membind=0 ./bench \ 17 | --table_key_capacity $keys_per_thread \ 18 | --batch_size $batch_size \ 19 | --benchmark $benchmark \ 20 | --pmem_file $pmem_file \ 21 | --sweep_optimizations $sweep_optimizations \ 22 | --num_threads $num_threads 23 | fi 24 | printf "\n\n" 25 | done 26 | 27 | num_threads=1 28 | # GDB mode 29 | if [ "$#" -eq 1 ]; then 30 | echo "do.sh: Launching process with GDB" 31 | num_keys=65536 32 | gdb -ex run --args ./bench \ 33 | --table_key_capacity $num_keys \ 34 | --batch_size $batch_size \ 35 | --benchmark $benchmark \ 36 | --pmem_file $pmem_file \ 37 | --num_threads $num_threads 38 | fi 39 | -------------------------------------------------------------------------------- /hopscotch_pmem/run.sh: -------------------------------------------------------------------------------- 1 | batch_size=16 2 | benchmark=get 3 | sweep_optimizations=1 4 | pmem_file="/mnt/pmem12/raft_log" 5 | 6 | one_million=1048576 # Just a constant to adjust keys_total below 7 | keys_total=`expr 64 \* $one_million` 8 | 9 | rm -rf /tmp/mica_bench* 10 | 11 | for num_threads in 1 2 4 8 16 24; do 12 | keys_per_thread=`expr $keys_total / $num_threads` 13 | 14 | # Non-GDB mode 15 | if [ "$#" -eq 0 ]; then 16 | numactl --cpunodebind=0 --membind=0 ./bench \ 17 | --table_key_capacity $keys_per_thread \ 18 | --batch_size $batch_size \ 19 | --benchmark $benchmark \ 20 | --pmem_file $pmem_file \ 21 | --sweep_optimizations $sweep_optimizations \ 22 | --num_threads $num_threads 23 | fi 24 | printf "\n\n" 25 | done 26 | 27 | num_threads=1 28 | # GDB mode 29 | if [ "$#" -eq 1 ]; then 30 | echo "do.sh: Launching process with GDB" 31 | num_keys=65536 32 | gdb -ex run --args ./bench \ 33 | --table_key_capacity $num_keys \ 34 | --batch_size $batch_size \ 35 | --benchmark $benchmark \ 36 | --pmem_file $pmem_file \ 37 | --num_threads $num_threads 38 | fi 39 | 
-------------------------------------------------------------------------------- /rdma/rdma-write-bw/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | # Check number of arguments 9 | if [ "$#" -gt 2 ]; then 10 | blue "Illegal number of arguments." 11 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 12 | exit 13 | fi 14 | 15 | if [ "$#" -eq 0 ]; then 16 | blue "Illegal number of arguments." 17 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 18 | exit 19 | fi 20 | 21 | machine_id=$1 22 | 23 | drop_shm 24 | exe="./write-bw" 25 | chmod +x $exe 26 | 27 | # Check for non-gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E numactl --physcpubind=0 --membind=0 $exe --is_client 1 \ 30 | --machine_id $machine_id $(cat config) 31 | fi 32 | 33 | # Check for gdb mode 34 | if [ "$#" -eq 2 ]; then 35 | sudo -E gdb -ex run --args $exe --is_client 1 \ 36 | --machine_id $machine_id $(cat config) 37 | fi 38 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/run-machine.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | source $(dirname $0)/../../scripts/utils.sh 3 | source $(dirname $0)/../../scripts/mlx_env.sh 4 | #export HRD_REGISTRY_IP="fawn-pluto0" 5 | #export HRD_REGISTRY_IP="akalianode-1.rdma.fawn.apt.emulab.net" 6 | export HRD_REGISTRY_IP="192.168.18.2" 7 | 8 | # Check number of arguments 9 | if [ "$#" -gt 2 ]; then 10 | blue "Illegal number of arguments." 11 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 12 | exit 13 | fi 14 | 15 | if [ "$#" -eq 0 ]; then 16 | blue "Illegal number of arguments." 
17 | blue "Usage: ./run-machine.sh , or ./run-machine.sh gdb" 18 | exit 19 | fi 20 | 21 | machine_id=$1 22 | 23 | drop_shm 24 | exe="./main" 25 | chmod +x $exe 26 | 27 | # Check for non-gdb mode 28 | if [ "$#" -eq 1 ]; then 29 | sudo -E numactl --cpunodebind=0 --membind=0 $exe --is_client 1 \ 30 | --machine_id $machine_id $(cat config) 31 | fi 32 | 33 | # Check for gdb mode 34 | if [ "$#" -eq 2 ]; then 35 | sudo -E gdb -ex run --args $exe --is_client 1 \ 36 | --machine_id $machine_id $(cat config) 37 | fi 38 | -------------------------------------------------------------------------------- /hog/hog.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "../common.h" 9 | 10 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 11 | static constexpr const char *kFileName = "/dev/dax0.0"; 12 | static constexpr size_t kPmemFileSize = GB(8); 13 | 14 | int main() { 15 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 16 | uint8_t *pbuf; 17 | size_t mapped_len; 18 | 19 | int is_pmem; 20 | pbuf = reinterpret_cast( 21 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 22 | 23 | rt_assert(pbuf != nullptr); 24 | rt_assert(mapped_len >= kPmemFileSize); 25 | 26 | size_t iter = 0; 27 | auto *buf = reinterpret_cast(malloc(kPmemFileSize)); 28 | 29 | while (true) { 30 | struct timespec start; 31 | clock_gettime(CLOCK_REALTIME, &start); 32 | pmem_memcpy_persist(pbuf, buf, kPmemFileSize); 33 | printf("Hog: iter = %zu, bandwidth = %.2f GB/s\n", iter, 34 | (kPmemFileSize * 1.0 / GB(1)) / sec_since(start)); 35 | iter++; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /hopscotch_pmem/LICENSE: -------------------------------------------------------------------------------- 1 | /*_ 2 | * Copyright (c) 2016 Hirochika Asai 3 | * All rights reserved. 
4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | * SOFTWARE. 
22 | */ 23 | 24 | -------------------------------------------------------------------------------- /nvme_perf/latency.sh: -------------------------------------------------------------------------------- 1 | perf_exe="/home/akalia/sandbox/spdk/examples/nvme/perf/perf" 2 | 3 | rm -f tmpout_* 4 | rm -rf final_out 5 | touch final_out 6 | 7 | # Last one wins 8 | bench=read # Sequential reads 9 | bench=randwrite # Random writes 10 | bench=write # Sequential writes 11 | bench=randread # Random reads 12 | 13 | echo "size us_avg us_median us_999 us_99" >> final_out 14 | 15 | for ((size = 512; size <= 65536; size *= 2)); do 16 | tmpfile="tmpout_$size" 17 | 18 | # -q: queue depth 19 | # -o: object size to write 20 | # -t: time in seconds 21 | # -c: core mask (core 24) 22 | # -L: generate histogram 23 | sudo numactl --cpunodebind=1 --membind=1 $perf_exe \ 24 | -q 1 -o $size -w $bench -t 2 -c 0x1000000 -L > $tmpfile 25 | 26 | us_avg=`cat $tmpfile | grep Total | sed -s 's/ */ /g' | cut -d ' ' -f 5` 27 | us_median=`cat $tmpfile | grep "50\.00000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 28 | us_99=`cat $tmpfile | grep "99\.00000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 29 | us_999=`cat $tmpfile | grep "99\.90000" | tr -d ' ' | cut -d ":" -f 2 | sed 's/us//g'` 30 | 31 | echo $size $us_avg $us_median $us_999 $us_99 32 | echo $size $us_avg $us_median $us_999 $us_99 >> final_out 33 | done 34 | 35 | cat final_out 36 | rm -f tmpout_* 37 | rm -rf final_out 38 | -------------------------------------------------------------------------------- /ioat/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | num_prints=3 4 | use_ioat=1 5 | use_pmem=1 6 | numa_node=0 7 | 8 | stat_file=$(mktemp) 9 | out_file=$(mktemp) 10 | 11 | function sweep_num_ioat_engines() { 12 | window_sizes="1 8" 13 | echo "size $window_sizes" > ${stat_file} # Stats file header 14 | 15 | for size in 1024 2048 4096 8192 16384 32768 65536 131072; do 16 | 
stat_str="$size" # Saved in stat_file at the end of a window 17 | for window_size in $window_sizes; do 18 | sudo -E env numactl --cpunodebind=$numa_node --membind=$numa_node ./bench \ 19 | --num_prints $num_prints \ 20 | --use_ioat $use_ioat \ 21 | --use_pmem $use_pmem \ 22 | --numa_node $numa_node \ 23 | --size $size \ 24 | --window_size $window_size 1>${out_file} 2>${out_file} 25 | 26 | # The last num_prints lines of out_file are formatted like: 27 | # 10.2 GB/s 28 | avg=`cat ${out_file} | tail -$num_prints | cut -d' ' -f 1 | avg.awk` 29 | echo "size $size, window size $window_size, tput $avg GB/s" 30 | 31 | stat_str="$stat_str $avg" 32 | done 33 | 34 | echo "Saving $stat_str to ${stat_file}" 35 | echo $stat_str >> ${stat_file} 36 | done 37 | 38 | echo "Results for: use_ioat $use_ioat, use_pmem $use_pmem" 39 | cat ${stat_file} 40 | } 41 | 42 | sweep_num_ioat_engines 43 | -------------------------------------------------------------------------------- /pmemkv_perf/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../common.h" 3 | #include "/home/akalia/sandbox/pmemkv/src/pmemkv.h" 4 | 5 | #define LOG(msg) std::cout << msg << "\n" 6 | 7 | using namespace pmemkv; 8 | 9 | int main() { 10 | LOG("Opening datastore"); 11 | KVEngine* kv = 12 | KVEngine::Open("kvtree3", "/dev/dax0.0", 1073741824); // 1 GB pool 13 | assert(kv != nullptr); 14 | 15 | struct timespec start; 16 | clock_gettime(CLOCK_REALTIME, &start); 17 | for (size_t i = 0; i < 100000; i++) { 18 | std::string k = std::to_string(i); 19 | std::string v = std::to_string(i); 20 | kv->Put(k, v); 21 | kv->Put(k, v); 22 | kv->Put(k, v); 23 | } 24 | 25 | double seconds = sec_since(start); 26 | printf("seconds = %.2f\n", seconds); 27 | 28 | LOG("Putting new key"); 29 | KVStatus s = kv->Put("key1", "value1"); 30 | assert(s == OK && kv->Count() == 1); 31 | 32 | LOG("Reading key back"); 33 | string value; 34 | s = kv->Get("key1", &value); 35 | assert(s 
== OK && value == "value1"); 36 | 37 | LOG("Iterating existing keys"); 38 | kv->Put("key2", "value2"); 39 | kv->Put("key3", "value3"); 40 | kv->All([](int, const char* k) { LOG(" visited: " << k); }); 41 | 42 | LOG("Removing existing key"); 43 | s = kv->Remove("key1"); 44 | assert(s == OK && !kv->Exists("key1")); 45 | 46 | LOG("Closing datastore"); 47 | delete kv; 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /utils/hdr_histogram_wrapper.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // A wrapper for hdr_histogram that supports floating point values with 4 | // magnified precision. A floating point record x is inserted as x * AMP. 5 | template 6 | class HdrHistogramAmp { 7 | public: 8 | HdrHistogramAmp(int64_t min, int64_t max, uint32_t precision) { 9 | int ret = hdr_init(min * AMP, max * AMP, precision, &hist); 10 | rt_assert(ret == 0); 11 | } 12 | 13 | ~HdrHistogramAmp() { hdr_close(hist); } 14 | 15 | inline void record_value(double v) { hdr_record_value(hist, v * AMP); } 16 | 17 | double percentile(double p) { 18 | return hdr_value_at_percentile(hist, p) / (AMP * 1.0); 19 | } 20 | 21 | void reset() { hdr_reset(hist); } 22 | 23 | hdr_histogram *get_raw_hist() { return hist; } 24 | 25 | private: 26 | hdr_histogram *hist = nullptr; 27 | }; 28 | 29 | // A conveinince wrapper for hdr_histogram 30 | class HdrHistogram { 31 | public: 32 | HdrHistogram(int64_t min, int64_t max, int precision) { 33 | int ret = hdr_init(min, max, precision, &hist); 34 | rt_assert(ret == 0); 35 | } 36 | 37 | ~HdrHistogram() { hdr_close(hist); } 38 | 39 | inline void record_value(size_t v) { 40 | hdr_record_value(hist, static_cast(v)); 41 | } 42 | 43 | size_t percentile(double p) const { 44 | return static_cast(hdr_value_at_percentile(hist, p)); 45 | } 46 | 47 | void reset() { hdr_reset(hist); } 48 | 49 | hdr_histogram *get_raw_hist() { return hist; } 50 | 51 | private: 52 
| hdr_histogram *hist = nullptr; 53 | } 54 | -------------------------------------------------------------------------------- /microbench/rand_write_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_write_tput(uint8_t *pbuf, size_t thread_id, size_t copy_sz, 4 | size_t num_threads) { 5 | static constexpr size_t kBatchSize = 8; 6 | static constexpr size_t kNumIters = GB(64); 7 | 8 | // Write to non-overlapping addresses 9 | const size_t bytes_per_thread = kPmemFileSize / num_threads; 10 | const size_t base_addr = thread_id * bytes_per_thread; 11 | 12 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 13 | struct timespec start; 14 | 15 | auto *copy_arr = new uint8_t[copy_sz]; 16 | for (size_t i = 0; i < copy_sz; i++) copy_arr[i] = pcg(); 17 | 18 | for (size_t iter = 0; iter < 1; iter++) { 19 | clock_gettime(CLOCK_REALTIME, &start); 20 | 21 | for (size_t i = 0; i < kNumIters / kBatchSize; i++) { 22 | size_t offset[kBatchSize]; 23 | for (size_t j = 0; j < kBatchSize; j++) { 24 | offset[j] = base_addr + (pcg() % bytes_per_thread); 25 | offset[j] = roundup<256>(offset[j]); 26 | if (offset[j] + copy_sz >= kPmemFileSize) { 27 | j--; 28 | continue; 29 | } 30 | pmem_memcpy_nodrain(&pbuf[offset[j]], copy_arr, copy_sz); 31 | } 32 | pmem_drain(); 33 | } 34 | 35 | double tot_sec = sec_since(start); 36 | double rate = kNumIters / tot_sec; 37 | double tput_GBps = kNumIters * copy_sz / (1000000000 * tot_sec); 38 | 39 | printf("Thread %zu of %zu, size %zu: rand writes: (%.2f M/s, %.2f GB/s)\n", 40 | thread_id, num_threads, copy_sz, rate / 1000000, tput_GBps); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /microbench/bench.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file bench.h 3 | * @brief Common code shared by benchmark implementations in header files 4 | */ 5 | 6 | #pragma once 7 | 8 | 
#include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "../common.h" 25 | #include "../utils/timer.h" 26 | 27 | DEFINE_uint64(num_threads, 0, "Number of threads"); 28 | 29 | // static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 30 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 31 | 32 | static constexpr size_t kPmemFileSize = GB(32); 33 | 34 | static constexpr bool kMeasureLatency = false; 35 | double freq_ghz = 0.0; 36 | static size_t align64(size_t x) { return x - x % 64; } 37 | 38 | static constexpr int kHdrPrecision = 2; // Precision for hdr histograms 39 | static constexpr int kMinPmemLatCycles = 1; // Min pmem latency in cycles 40 | static constexpr int kMaxPmemLatCycles = MB(1); // Max pmem latency in cycles 41 | 42 | static constexpr size_t kNumaNode = 0; 43 | 44 | /// Get a random offset in the file with at least \p space after it 45 | size_t get_random_offset_with_space(pcg64_fast &pcg, size_t space) { 46 | size_t iters = 0; 47 | while (true) { 48 | size_t rand_offset = pcg() % kPmemFileSize; 49 | if (kPmemFileSize - rand_offset > space) return rand_offset; 50 | iters++; 51 | if (iters > 2) printf("Random offset took over 2 iters\n"); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /microbench/rand_read_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_read_tput(uint8_t *pbuf, size_t thread_id, const size_t copy_sz, 4 | size_t num_threads) { 5 | static constexpr size_t kNumIters = MB(4); 6 | assert(copy_sz == 64 || copy_sz == 256 || copy_sz == 512 || copy_sz == 1024); 7 | 8 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 9 | struct timespec start; 10 | size_t sum = 0; 11 | 12 | for (size_t iter = 0; iter < 5; iter++) { 13 | 
clock_gettime(CLOCK_REALTIME, &start); 14 | 15 | if (copy_sz == 64) { 16 | for (size_t i = 0; i < kNumIters; i++) { 17 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 18 | sum += pbuf[offset]; 19 | } 20 | } else if (copy_sz == 256) { 21 | for (size_t i = 0; i < kNumIters; i++) { 22 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 23 | for (size_t cl = 0; cl < 4; cl++) { 24 | sum += pbuf[offset + cl * 64]; 25 | } 26 | } 27 | } else if (copy_sz == 512) { 28 | for (size_t i = 0; i < kNumIters; i++) { 29 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 30 | for (size_t cl = 0; cl < 8; cl++) { 31 | sum += pbuf[offset + cl * 64]; 32 | } 33 | } 34 | } else if (copy_sz == 1024) { 35 | for (size_t i = 0; i < kNumIters; i++) { 36 | size_t offset = roundup<64>(pcg() % kPmemFileSize); 37 | for (size_t cl = 0; cl < 16; cl++) { 38 | sum += pbuf[offset + cl * 64]; 39 | } 40 | } 41 | } 42 | 43 | double tot_sec = sec_since(start); 44 | double rate = kNumIters / tot_sec; 45 | printf( 46 | "Thread %zu of %zu, copy sz %zu: random read tput = %.2f M/sec, " 47 | "sum = %zu\n", 48 | thread_id, num_threads, copy_sz, rate / 1000000, sum); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /microbench/write_latency/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../../common.h" 12 | 13 | static constexpr size_t kWriteSize = 512; 14 | static constexpr size_t kNumIters = 1000000; 15 | 16 | int main() { 17 | uint8_t *data = reinterpret_cast(memalign(4096, kWriteSize)); 18 | 19 | size_t mapped_len; 20 | int is_pmem; 21 | uint8_t *pbuf = reinterpret_cast( 22 | pmem_map_file("/dev/dax0.0", 0, 0, 0666, &mapped_len, &is_pmem)); 23 | assert(pbuf != nullptr); 24 | assert(mapped_len >= kWriteSize * kNumIters); 25 | 26 | size_t file_offset = 0; 27 | 
std::vector latency_vec; 28 | latency_vec.reserve(kNumIters); 29 | 30 | for (size_t msr = 0; msr < 10; msr++) { 31 | // Initialize measurement 32 | latency_vec.clear(); 33 | struct timespec bench_start; 34 | clock_gettime(CLOCK_REALTIME, &bench_start); 35 | 36 | // Real work 37 | for (size_t i = 0; i < kNumIters; i++) { 38 | size_t start_tsc = rdtsc(); 39 | mfence(); 40 | pmem_memmove_persist(&pbuf[file_offset], data, kWriteSize); 41 | mfence(); 42 | 43 | latency_vec.push_back(rdtsc() - start_tsc); 44 | 45 | file_offset += kWriteSize; 46 | if (file_offset + kWriteSize >= mapped_len) file_offset = 0; 47 | } 48 | 49 | double bench_seconds = sec_since(bench_start); 50 | printf("Throughput of writes = %.2f M ops/s, %.2f GB/s\n", 51 | kNumIters / (bench_seconds * 1000000), 52 | kNumIters * kWriteSize / (bench_seconds * 1000000000)); 53 | 54 | std::sort(latency_vec.begin(), latency_vec.end()); 55 | printf("Latency (cycles): median %zu, 99%% %zu, 99.9%% %zu\n", 56 | latency_vec.at(kNumIters * .5), latency_vec.at(kNumIters * .99), 57 | latency_vec.at(kNumIters * .999)); 58 | } 59 | 60 | pmem_unmap(pbuf, mapped_len); 61 | exit(0); 62 | } 63 | -------------------------------------------------------------------------------- /microbench/seq_write_tput.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_write_tput(uint8_t *pbuf, size_t thread_id, size_t copy_sz, 4 | double *avg_tput_GBps) { 5 | // We perform multiple measurements. In each measurement, a thread writes 6 | // kCopyPerThreadPerMsr bytes in copy_sz chunks. 
7 | static constexpr size_t kNumMsr = 1; 8 | static constexpr size_t kCopyPerThreadPerMsr = GB(1); 9 | rt_assert(kCopyPerThreadPerMsr % copy_sz == 0, "Unaligned copy size"); 10 | 11 | void *dram_src_buf = memalign(4096, copy_sz); 12 | memset(dram_src_buf, 0, copy_sz); 13 | 14 | // Each thread write to non-overlapping addresses 15 | const size_t excl_bytes_per_thread = kPmemFileSize / FLAGS_num_threads; 16 | const size_t base_offset = roundup<256>(thread_id * excl_bytes_per_thread); 17 | 18 | // We begin copies from a random aligned offset in the file. This prevents 19 | // multiple calls from writing to the same file region. std::random_device 20 | // produces a non-deterministic seed. 21 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 22 | size_t offset = base_offset + (pcg() % excl_bytes_per_thread); 23 | offset = roundup<256>(offset); 24 | 25 | double tput_sum_GBps = 0; // Used to compute average througput at the end 26 | 27 | for (size_t msr = 0; msr < kNumMsr; msr++) { 28 | struct timespec start; 29 | clock_gettime(CLOCK_REALTIME, &start); 30 | 31 | for (size_t i = 0; i < kCopyPerThreadPerMsr / copy_sz; i++) { 32 | pmem_memmove_persist(&pbuf[offset], dram_src_buf, copy_sz); 33 | offset += copy_sz; 34 | if (offset + copy_sz >= base_offset + excl_bytes_per_thread) { 35 | offset = base_offset; 36 | } 37 | } 38 | 39 | double tot_sec = sec_since(start); 40 | double tput_GBps = kCopyPerThreadPerMsr / (tot_sec * 1000000000); 41 | printf("Thread %zu: copy_sz %zu, %.2f GB/s. 
Offset = %zu\n", thread_id, 42 | copy_sz, tput_GBps, offset); 43 | tput_sum_GBps += tput_GBps; 44 | } 45 | 46 | *avg_tput_GBps = tput_sum_GBps / kNumMsr; 47 | free(dram_src_buf); 48 | } 49 | -------------------------------------------------------------------------------- /randomizer/main.cc: -------------------------------------------------------------------------------- 1 | // This can be used to write random contents to a pmem file so that later 2 | // experiments don't benefit from any crazy value prediction of a zeroed file. 3 | 4 | #include 5 | #include 6 | #include "../common.h" 7 | 8 | static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 9 | static constexpr size_t kPmemFileSizeGB = 512; // The expected file size 10 | static constexpr size_t kPmemFileSize = kPmemFileSizeGB * GB(1); 11 | static constexpr size_t kRandTemplateSz = GB(32); 12 | 13 | int main(int, char **) { 14 | uint8_t *pbuf; 15 | size_t mapped_len; 16 | int is_pmem; 17 | 18 | pbuf = reinterpret_cast(pmem_map_file( 19 | kPmemFile, 0 /* length */, 0 /* flags */, 0666, &mapped_len, &is_pmem)); 20 | 21 | rt_assert(pbuf != nullptr, 22 | "pmem_map_file() failed. 
" + std::string(strerror(errno))); 23 | rt_assert(mapped_len >= kPmemFileSize, 24 | "pmem file too small " + std::to_string(mapped_len)); 25 | rt_assert(reinterpret_cast(pbuf) % 4096 == 0, 26 | "Mapped buffer isn't page-aligned"); 27 | rt_assert(is_pmem == 1, "File is not pmem"); 28 | 29 | printf("Generating random contents\n"); 30 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 31 | size_t *rand_buf = reinterpret_cast(malloc(kRandTemplateSz)); 32 | for (size_t i = 0; i < kRandTemplateSz / sizeof(size_t); i++) { 33 | rand_buf[i] = pcg(); 34 | } 35 | 36 | printf("Writing random contents to the whole file.\n"); 37 | rt_assert(kPmemFileSize % kRandTemplateSz == 0); 38 | 39 | for (size_t i = 0; i < kPmemFileSize; i += kRandTemplateSz) { 40 | struct timespec start; 41 | clock_gettime(CLOCK_REALTIME, &start); 42 | pmem_memcpy_persist(&pbuf[i], rand_buf, kRandTemplateSz); 43 | printf("Fraction complete = %.2f. Took %.3f sec for %zu GB.\n", 44 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), 45 | kRandTemplateSz / GB(1)); 46 | } 47 | 48 | printf("Done writing.\n"); 49 | 50 | pmem_unmap(pbuf, mapped_len); 51 | exit(0); 52 | } 53 | -------------------------------------------------------------------------------- /log_store/rotating_counter.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | class Counter { 9 | public: 10 | static constexpr size_t kNumBuffers = 16; 11 | static constexpr size_t kBufferSize = 256; 12 | 13 | /** 14 | * @brief Construct a counter 15 | * 16 | * @param pbuf The start address of the counter on persistent memory 17 | * 18 | * @param create_new If true, the counter is reset to zero. If false, the 19 | * counter is initialized using the prior pmem contents. 
20 | */ 21 | Counter(uint8_t *pbuf, bool create_new) : ctr_base_addr(pbuf) { 22 | if (create_new) { 23 | pmem_memset_persist(pbuf, 0, kNumBuffers * kBufferSize); 24 | } else { 25 | size_t cur_max = 0; // Maximum value among the counters 26 | size_t cur_max_i = 0; // Index of the maximum value 27 | for (size_t i = 0; i < kNumBuffers; i++) { 28 | size_t *counter_i = reinterpret_cast(&pbuf[i * kBufferSize]); 29 | if (*counter_i > cur_max) { 30 | cur_max = *counter_i; 31 | cur_max_i = i; 32 | } 33 | } 34 | 35 | v_value = cur_max; 36 | buffer_idx = (cur_max_i + 1) % kNumBuffers; 37 | } 38 | } 39 | 40 | Counter() {} 41 | 42 | /// The amount of contiguous pmem needed for this counter 43 | static size_t get_reqd_space() { return kNumBuffers * kBufferSize; } 44 | 45 | // Increment by always writing to the same location 46 | inline void increment_naive(size_t increment) { 47 | v_value += increment; 48 | pmem_memcpy_persist(&ctr_base_addr[0], &v_value, sizeof(v_value)); 49 | } 50 | 51 | // Increment by writing to rotating locations, but don't do full-cacheline 52 | // writes 53 | inline void increment_rotate(size_t increment) { 54 | v_value += increment; 55 | pmem_memcpy_persist(&ctr_base_addr[buffer_idx * kBufferSize], &v_value, 56 | sizeof(v_value)); 57 | buffer_idx = (buffer_idx + 1) % kNumBuffers; 58 | } 59 | 60 | size_t v_value = 0; // Volatile value of the counter 61 | 62 | size_t buffer_idx = 0; 63 | uint8_t *ctr_base_addr = nullptr; // Starting address of the counter on pmem 64 | }; 65 | -------------------------------------------------------------------------------- /microbench/rand_write_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_write_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr size_t kWriteBytes = MB(64); 7 | static constexpr size_t kMinIters = 50000; 8 | static constexpr size_t kMinWriteSz = 64; 9 | static constexpr 
size_t kMaxWriteSz = KB(64); 10 | 11 | size_t file_offset = 0; 12 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 13 | 14 | static_assert(kWriteBytes / kMinWriteSz >= kMinIters, ""); 15 | std::vector latency_vec; 16 | latency_vec.reserve(kWriteBytes / kMinWriteSz); 17 | 18 | uint8_t *data = reinterpret_cast(memalign(4096, kMaxWriteSz)); 19 | 20 | for (size_t msr = 0; msr < 10; msr++) { 21 | printf("size avg_ns 50_ns 999_ns\n"); 22 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 23 | 24 | for (size_t size = kMinWriteSz; size <= kMaxWriteSz; size *= 2) { 25 | struct timespec start_time; 26 | clock_gettime(CLOCK_REALTIME, &start_time); 27 | 28 | latency_vec.clear(); 29 | const size_t num_iters = 30 | kWriteBytes / size <= kMinIters ? kMinIters : kWriteBytes / size; 31 | 32 | for (size_t i = 0; i < num_iters; i++) { 33 | file_offset = roundup<64>(pcg() % kPmemFileSize); 34 | 35 | size_t start_tsc = timer::Start(); 36 | pmem_memmove_persist(&pbuf[file_offset], data, size); 37 | 38 | latency_vec.push_back(timer::Stop() - start_tsc); 39 | } 40 | 41 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 42 | size_t ns_avg_rdtsc = 43 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 44 | (latency_vec.size() * freq_ghz); 45 | verify_tsc_str << size << ": Average latency (ns) " << ns_avg_realtime 46 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 47 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta) " 48 | << "\n"; 49 | 50 | std::sort(latency_vec.begin(), latency_vec.end()); 51 | printf("%zu %zu %.1f %.1f\n", size, ns_avg_realtime, 52 | latency_vec.at(num_iters * .50) / freq_ghz, 53 | latency_vec.at(num_iters * .999) / freq_ghz); 54 | } 55 | 56 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /microbench/read_latency/bench.cc: -------------------------------------------------------------------------------- 1 
| #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | static constexpr size_t kNumIters = 1000000; 13 | static constexpr size_t kFileSizeGB = 512; 14 | static constexpr size_t kFileSizeBytes = (1ull << 30) * kFileSizeGB; 15 | // static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 16 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 17 | 18 | inline uint32_t fastrand(uint64_t &seed) { 19 | seed = seed * 1103515245 + 12345; 20 | return static_cast(seed >> 32); 21 | } 22 | 23 | /// Return nanoseconds elapsed since timestamp \p t0 24 | static double ns_since(const struct timespec &t0) { 25 | struct timespec t1; 26 | clock_gettime(CLOCK_REALTIME, &t1); 27 | return (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec); 28 | } 29 | 30 | // Used for shuffle-based pointer chain measurement 31 | struct cacheline_t { 32 | cacheline_t *ptr; 33 | size_t pad[7]; 34 | }; 35 | static_assert(sizeof(cacheline_t) == 64, ""); 36 | 37 | int main() { 38 | if (getuid() != 0) { 39 | // Mapping devdax files needs root perms for now 40 | printf("You need to be root to run this benchmark\n"); 41 | exit(-1); 42 | } 43 | 44 | printf("Measuring random read latency with buffer size = %zu GB\n", 45 | kFileSizeGB); 46 | 47 | size_t mapped_len; 48 | int is_pmem; 49 | uint8_t *pbuf = reinterpret_cast( 50 | pmem_map_file(kPmemFile, 0, 0, 0666, &mapped_len, &is_pmem)); 51 | assert(pbuf != nullptr); 52 | assert(mapped_len >= kFileSizeBytes); 53 | assert(is_pmem == 1); 54 | 55 | size_t sum = 0; 56 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 57 | 58 | for (size_t msr = 0; msr < 10; msr++) { 59 | // Initialize measurement 60 | struct timespec bench_start; 61 | clock_gettime(CLOCK_REALTIME, &bench_start); 62 | 63 | // Real work 64 | for (size_t i = 0; i < kNumIters; i++) { 65 | size_t file_offset = (sum + pcg()) % kFileSizeBytes; 66 | sum += pbuf[file_offset]; // Make the next read 
dependent 67 | } 68 | 69 | double bench_ns = ns_since(bench_start); 70 | printf("Average read latency = %.1f ns, sum = %zu\n", bench_ns / kNumIters, 71 | sum); 72 | } 73 | 74 | pmem_unmap(pbuf, mapped_len); 75 | exit(0); 76 | } 77 | -------------------------------------------------------------------------------- /cacheline_versions/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../common.h" 12 | #include "../utils/timer.h" 13 | #include "config.h" 14 | 15 | DEFINE_uint64(use_pmem, 1, "Use persistent memory"); 16 | DEFINE_uint64(object_size, KB(4), "Size of objects"); 17 | 18 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 19 | static constexpr const char *kFileName = "/dev/dax0.0"; 20 | static constexpr bool kUsePmem = true; 21 | static constexpr size_t kFileSize = GB(32); 22 | 23 | int main(int argc, char **argv) { 24 | gflags::ParseCommandLineFlags(&argc, &argv, true); 25 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 26 | 27 | uint8_t *pbuf; 28 | size_t mapped_len; 29 | 30 | if (FLAGS_use_pmem == 1) { 31 | printf("Using persistent memory buffer, size %zu\n", FLAGS_object_size); 32 | int is_pmem; 33 | pbuf = reinterpret_cast( 34 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 35 | 36 | rt_assert(pbuf != nullptr); 37 | rt_assert(mapped_len >= kFileSize); 38 | } else { 39 | printf("Using volatile memory buffer\n"); 40 | pbuf = reinterpret_cast(malloc(kFileSize)); 41 | } 42 | 43 | size_t iter = 0; 44 | size_t sum = 0; 45 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 46 | 47 | while (true) { 48 | size_t rand = pcg(); 49 | size_t offset = roundup<64>(rand % kFileSize); 50 | if (offset + FLAGS_object_size >= kFileSize) continue; 51 | 52 | uint8_t *obj = &pbuf[offset]; 53 | for (size_t i = 0; i < FLAGS_object_size / 64; 
i++) sum += obj[i * 64]; 54 | 55 | struct timespec bench_start; 56 | clock_gettime(CLOCK_REALTIME, &bench_start); 57 | 58 | for (size_t i = 0; i < FLAGS_object_size / 64; i++) obj[i * 64] = iter % 2; 59 | memset(obj, iter, FLAGS_object_size); 60 | for (size_t i = 0; i < FLAGS_object_size / 64; i++) obj[i * 64] = iter % 3; 61 | 62 | printf("Object size %zu, time = %.2f us, bw = %.2f GB/s, size %zu\n", 63 | FLAGS_object_size, sec_since(bench_start) * 1000000, 64 | FLAGS_object_size / (1024 * 1024 * 1024.0 * sec_since(bench_start)), 65 | FLAGS_object_size); 66 | 67 | iter++; 68 | } 69 | 70 | if (kUsePmem) pmem_unmap(pbuf, mapped_len); 71 | } 72 | -------------------------------------------------------------------------------- /microbench/rand_read_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_rand_read_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr bool kMeasurePercentiles = false; 7 | static constexpr size_t kReadBytes = MB(128); 8 | static constexpr size_t kMinIters = 50000; 9 | static constexpr size_t kMinReadSz = 64; 10 | static constexpr size_t kMaxReadSz = KB(64); 11 | 12 | size_t file_offset = 0; 13 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 14 | 15 | static_assert(kReadBytes / kMinReadSz >= kMinIters, ""); 16 | std::vector latency_vec; 17 | latency_vec.reserve(kReadBytes / kMinReadSz); 18 | 19 | size_t sum = 0; 20 | 21 | for (size_t msr = 0; msr < 10; msr++) { 22 | printf("size avg_ns 50_ns 999_ns\n"); 23 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 24 | 25 | for (size_t size = kMinReadSz; size <= kMaxReadSz; size *= 2) { 26 | struct timespec start_time; 27 | clock_gettime(CLOCK_REALTIME, &start_time); 28 | 29 | latency_vec.clear(); 30 | const size_t num_iters = 31 | kReadBytes / size <= kMinIters ? 
kMinIters : kReadBytes / size; 32 | 33 | for (size_t i = 0; i < num_iters; i++) { 34 | size_t rand = sum + pcg(); 35 | file_offset = roundup<64>(rand % kPmemFileSize); 36 | 37 | size_t start_tsc; 38 | if (kMeasurePercentiles) start_tsc = timer::Start(); 39 | for (size_t j = 0; j < size; j += 64) { 40 | sum += pbuf[file_offset + j]; 41 | } 42 | 43 | if (kMeasurePercentiles) { 44 | latency_vec.push_back(timer::Stop() - start_tsc); 45 | } 46 | } 47 | 48 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 49 | 50 | if (kMeasurePercentiles) { 51 | std::sort(latency_vec.begin(), latency_vec.end()); 52 | printf("%zu %zu %.1f %.1f\n", size, ns_avg_realtime, 53 | latency_vec.at(num_iters * .50) / freq_ghz, 54 | latency_vec.at(num_iters * .999) / freq_ghz); 55 | 56 | size_t ns_avg_rdtsc = 57 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 58 | (latency_vec.size() * freq_ghz); 59 | verify_tsc_str << size << ": Average latency (ns) " << ns_avg_realtime 60 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 61 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta) " 62 | << "\n"; 63 | } else { 64 | printf("%zu %zu -1.0 -1.0\n", size, ns_avg_realtime); 65 | } 66 | } 67 | 68 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 69 | printf("sum = %zu\n", sum); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /circular_writes_tput/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "../common.h" 13 | #include "../utils/timer.h" 14 | #include "config.h" 15 | 16 | // Config parameters: 17 | // FLAGS_num_counters: Number of counters emulating one counter 18 | // FLAGS_stride_size: Distance between counters 19 | 20 | // static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 21 | static 
constexpr const char *kFileName = "/dev/dax0.0"; 22 | static constexpr size_t kNumIters = 1000000; 23 | static constexpr bool kUsePmem = true; 24 | static constexpr bool kUseNtStore = true; 25 | 26 | DEFINE_uint64(num_counters, 16, "Number of counters to rotate on"); 27 | DEFINE_uint64(stride_size, 256, "Stride size"); 28 | 29 | int main(int argc, char **argv) { 30 | gflags::ParseCommandLineFlags(&argc, &argv, true); 31 | rt_assert(getuid() == 0, "You need to be root to run this benchmark"); 32 | rt_assert(FLAGS_stride_size >= sizeof(size_t), ""); 33 | rt_assert(FLAGS_stride_size % sizeof(size_t) == 0, ""); 34 | 35 | uint8_t *pbuf; 36 | size_t mapped_len; 37 | 38 | if (kUsePmem) { 39 | printf("Using persistent memory buffer\n"); 40 | int is_pmem; 41 | pbuf = reinterpret_cast( 42 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 43 | 44 | rt_assert(pbuf != nullptr); 45 | rt_assert(mapped_len >= FLAGS_num_counters * FLAGS_stride_size); 46 | } else { 47 | printf("Using DRAM buffer\n"); 48 | pbuf = reinterpret_cast( 49 | malloc(FLAGS_num_counters * FLAGS_stride_size)); 50 | } 51 | 52 | size_t counter_val = 1; 53 | size_t counter_idx = 0; 54 | for (size_t msr = 0; msr < 5; msr++) { 55 | struct timespec bench_start; 56 | clock_gettime(CLOCK_REALTIME, &bench_start); 57 | 58 | for (size_t i = 0; i < kNumIters; i++) { 59 | size_t buffer_offset = counter_idx * FLAGS_stride_size; 60 | 61 | if (kUseNtStore) { 62 | pmem_memcpy_persist(&pbuf[buffer_offset], &counter_val, sizeof(size_t)); 63 | } else { 64 | *reinterpret_cast(&pbuf[buffer_offset]) = counter_val; 65 | pmem_clwb(&pbuf[buffer_offset]); 66 | sfence(); 67 | } 68 | 69 | counter_idx++; 70 | if (counter_idx == FLAGS_num_counters) counter_idx = 0; 71 | counter_val++; 72 | } 73 | 74 | printf("num_counters %zu, stride size %zu: %.2f M/s.\n", FLAGS_num_counters, 75 | FLAGS_stride_size, kNumIters / (sec_since(bench_start) * 1000000)); 76 | } 77 | 78 | if (kUsePmem) pmem_unmap(pbuf, mapped_len); 79 | } 80 | 
-------------------------------------------------------------------------------- /ioat/virt2phy.h: -------------------------------------------------------------------------------- 1 | // Credits: DPDK 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../common.h" 15 | 16 | /** 17 | * @brief A class to translate any mapped virtual address in the current process 18 | * to its physical address. 19 | * 20 | * Requires root access. 21 | */ 22 | class Virt2Phy { 23 | static constexpr size_t kPfnMaskSize = 8; 24 | 25 | public: 26 | Virt2Phy() { 27 | fd = open("/proc/self/pagemap", O_RDONLY); 28 | if (fd < 0) { 29 | printf("%s(): cannot open /proc/self/pagemap\n", strerror(errno)); 30 | exit(-1); 31 | } 32 | 33 | page_size = static_cast(getpagesize()); // Standard page size 34 | } 35 | 36 | ~Virt2Phy() { close(fd); } 37 | 38 | /** 39 | * @brief Return the physical address of this virtual address 40 | * @return The physical address on success, zero on failure 41 | */ 42 | uint64_t translate(const void *virtaddr) { 43 | auto virt_pfn = static_cast( 44 | reinterpret_cast(virtaddr) / page_size); 45 | size_t offset = sizeof(uint64_t) * virt_pfn; 46 | 47 | uint64_t page; 48 | int ret = pread(fd, &page, kPfnMaskSize, static_cast(offset)); 49 | 50 | if (ret < 0) { 51 | fprintf(stderr, "cannot read /proc/self/pagemap: %s\n", strerror(errno)); 52 | return 0; 53 | } else if (ret != static_cast(kPfnMaskSize)) { 54 | fprintf(stderr, 55 | "read %d bytes from /proc/self/pagemap but expected %zu:\n", ret, 56 | kPfnMaskSize); 57 | return 0; 58 | } 59 | 60 | // The pfn (page frame number) are bits 0-54 (see pagemap.txt in linux 61 | // Documentation) 62 | if ((page & 0x7fffffffffffffULL) == 0) return 0; 63 | 64 | uint64_t physaddr = ((page & 0x7fffffffffffffULL) * page_size) + 65 | (reinterpret_cast(virtaddr) % page_size); 66 | 67 | return physaddr; 68 | } 69 | 70 | private: 71 | 
int fd; 72 | size_t page_size; 73 | }; 74 | 75 | class HugepageCachingVirt2Phy { 76 | public: 77 | uint64_t translate(void *_va) { 78 | uint64_t va = reinterpret_cast(_va); 79 | uint64_t va_2MB = (va & ~(MB(2) - 1)); 80 | 81 | auto result = v2p_cache.find(va_2MB); 82 | if (likely(result != v2p_cache.end())) { 83 | return result->second + (va % MB(2)); 84 | } 85 | 86 | // Here, we have a cache miss 87 | uint64_t phy_addr = v2p.translate(reinterpret_cast(va_2MB)); 88 | v2p_cache.emplace(va_2MB, phy_addr); 89 | 90 | return phy_addr + (va % MB(2)); 91 | } 92 | 93 | private: 94 | Virt2Phy v2p; 95 | std::unordered_map v2p_cache; 96 | }; 97 | -------------------------------------------------------------------------------- /mica_pmem/test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "pmica.h" 5 | 6 | static constexpr size_t kDefaultFileOffset = 1024; 7 | static constexpr const char* kPmemFile = "/mnt/pmem12/raft_log"; 8 | 9 | TEST(Basic, Simple) { 10 | size_t num_keys = 32; 11 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 12 | num_keys, 1.0); 13 | 14 | size_t key, value; 15 | 16 | key = 1; 17 | value = 1; 18 | bool success = hashmap.set_nodrain(&key, &value); 19 | assert(success); 20 | 21 | key = 2; 22 | value = 2; 23 | success = hashmap.set_nodrain(&key, &value); 24 | assert(success); 25 | 26 | success = hashmap.set_nodrain(&key, &value); 27 | assert(success); 28 | 29 | key = 3; 30 | value = 3; 31 | success = hashmap.set_nodrain(&key, &value); 32 | assert(success); 33 | 34 | key = 1; 35 | value = 0; 36 | success = hashmap.get(&key, &value); 37 | assert(value == 1); 38 | assert(success); 39 | 40 | key = 2; 41 | value = 0; 42 | success = hashmap.get(&key, &value); 43 | assert(value == 2); 44 | assert(success); 45 | 46 | key = 4; 47 | value = 0; 48 | success = hashmap.get(&key, &value); 49 | assert(value == 0); 50 | assert(!success); 51 | } 52 | 53 | TEST(Basic, Overload) { 
54 | size_t num_keys = 32; 55 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 56 | num_keys, 1.0); 57 | 58 | std::map insert_success_map; 59 | size_t num_success = 0; 60 | 61 | for (size_t i = 1; i <= num_keys; i++) { 62 | bool success = hashmap.set_nodrain(&i, &i); 63 | insert_success_map[i] = success; 64 | 65 | if (success) num_success++; 66 | } 67 | 68 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 69 | 70 | for (size_t i = 1; i <= num_keys; i++) { 71 | size_t v; 72 | bool success = hashmap.get(&i, &v); 73 | assert(success == insert_success_map[i]); 74 | if (success) assert(v == i); 75 | } 76 | } 77 | 78 | TEST(Basic, Large) { 79 | pmica::HashMap hashmap(kPmemFile, kDefaultFileOffset, 80 | (1ull << 30), 0.2); 81 | 82 | size_t num_keys = 32; 83 | std::map insert_success_map; 84 | size_t num_success = 0; 85 | 86 | for (size_t i = 1; i <= num_keys; i++) { 87 | bool success = hashmap.set_nodrain(&i, &i); 88 | insert_success_map[i] = success; 89 | 90 | if (success) num_success++; 91 | } 92 | 93 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 94 | 95 | for (size_t i = 1; i <= num_keys; i++) { 96 | size_t v; 97 | bool success = hashmap.get(&i, &v); 98 | assert(success == insert_success_map[i]); 99 | if (success) assert(v == i); 100 | } 101 | } 102 | 103 | int main(int argc, char** argv) { 104 | testing::InitGoogleTest(&argc, argv); 105 | return RUN_ALL_TESTS(); 106 | } 107 | -------------------------------------------------------------------------------- /microbench/seq_write_latency.h: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | void bench_seq_write_latency(uint8_t *pbuf) { 4 | double freq_ghz = measure_rdtsc_freq(); 5 | 6 | static constexpr bool kMeasurePercentiles = true; 7 | 8 | // Update the source data for every write. Not doing so decreases latency. 
9 | static constexpr bool kChangeWriteSource = false; 10 | 11 | static constexpr size_t kWriteBytes = MB(64); 12 | static constexpr size_t kMinIters = 50000; 13 | static constexpr size_t kMinWriteSz = 64; 14 | static constexpr size_t kMaxWriteSz = KB(64); 15 | 16 | size_t file_offset = 0; 17 | 18 | static_assert(kWriteBytes / kMinWriteSz >= kMinIters, ""); 19 | std::vector latency_vec; 20 | latency_vec.reserve(kWriteBytes / kMinWriteSz); 21 | 22 | size_t *data = reinterpret_cast(memalign(4096, kMaxWriteSz)); 23 | memset(data, 31, kMaxWriteSz); 24 | 25 | for (size_t msr = 0; msr < 100; msr++) { 26 | printf("size avg_ns 50_ns 999_ns\n"); 27 | std::ostringstream verify_tsc_str; // Compare tsc results with realtime 28 | 29 | for (size_t wr_size = kMinWriteSz; wr_size <= kMaxWriteSz; wr_size *= 2) { 30 | struct timespec start_time; 31 | clock_gettime(CLOCK_REALTIME, &start_time); 32 | 33 | latency_vec.clear(); 34 | file_offset = roundup<256>(file_offset); 35 | const size_t num_iters = kWriteBytes / wr_size <= kMinIters 36 | ? 
kMinIters 37 | : kWriteBytes / wr_size; 38 | 39 | for (size_t i = 0; i < num_iters; i++) { 40 | if (kChangeWriteSource) { 41 | for (size_t cl = 0; cl < wr_size / 64; cl++) data[cl * 8]++; 42 | } 43 | 44 | size_t start_tsc; 45 | if (kMeasurePercentiles) start_tsc = timer::Start(); 46 | pmem_memmove_persist(&pbuf[file_offset], data, wr_size); 47 | 48 | if (kMeasurePercentiles) { 49 | latency_vec.push_back(timer::Stop() - start_tsc); 50 | } 51 | 52 | file_offset += wr_size; 53 | if (file_offset + wr_size >= kPmemFileSize) file_offset = 0; 54 | } 55 | 56 | size_t ns_avg_realtime = ns_since(start_time) / num_iters; 57 | 58 | if (kMeasurePercentiles) { 59 | std::sort(latency_vec.begin(), latency_vec.end()); 60 | printf("%zu %zu %.1f %.1f\n", wr_size, ns_avg_realtime, 61 | latency_vec.at(num_iters * .50) / freq_ghz, 62 | latency_vec.at(num_iters * .999) / freq_ghz); 63 | 64 | size_t ns_avg_rdtsc = 65 | std::accumulate(latency_vec.begin(), latency_vec.end(), 0.0) / 66 | (latency_vec.size() * freq_ghz); 67 | verify_tsc_str << wr_size << ": Avg latency (ns) " << ns_avg_realtime 68 | << " (realtime) " << ns_avg_rdtsc << " (rdtsc) " 69 | << (ns_avg_realtime - ns_avg_rdtsc) << " (delta). 
offst " 70 | << file_offset << "\n"; 71 | } else { 72 | printf("%zu %zu -1.0 -1.0\n", wr_size, ns_avg_realtime); 73 | } 74 | } 75 | 76 | printf("Fences verification:\n%s\n", verify_tsc_str.str().c_str()); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /hopscotch_pmem/test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "phopscotch.h" 5 | 6 | static constexpr size_t kDefaultFileOffset = 1024; 7 | static constexpr const char *kPmemFile = "/mnt/pmem12/raft_log"; 8 | 9 | TEST(Basic, Simple) { 10 | size_t num_keys = 32; 11 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 12 | num_keys); 13 | 14 | size_t key, value; 15 | 16 | key = 1; 17 | value = 1; 18 | bool success = hashmap.set_nodrain(&key, &value); 19 | assert(success); 20 | 21 | key = 2; 22 | value = 2; 23 | success = hashmap.set_nodrain(&key, &value); 24 | assert(success); 25 | 26 | success = hashmap.set_nodrain(&key, &value); 27 | assert(success); 28 | 29 | key = 3; 30 | value = 3; 31 | success = hashmap.set_nodrain(&key, &value); 32 | assert(success); 33 | 34 | key = 1; 35 | value = 0; 36 | success = hashmap.get(&key, &value); 37 | assert(value == 1); 38 | assert(success); 39 | 40 | key = 2; 41 | value = 0; 42 | success = hashmap.get(&key, &value); 43 | assert(value == 2); 44 | assert(success); 45 | 46 | key = 4; 47 | value = 0; 48 | success = hashmap.get(&key, &value); 49 | assert(value == 0); 50 | assert(!success); 51 | } 52 | 53 | TEST(Basic, Overload) { 54 | size_t num_keys = 1 * 1024 * 1024; 55 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 56 | num_keys); 57 | 58 | size_t max_key_inserted = 0; 59 | for (size_t i = 1; i <= num_keys; i++) { 60 | bool success = hashmap.set_nodrain(&i, &i); 61 | if (!success) { 62 | size_t hash = hashmap.get_hash(&i); 63 | printf("Failed for key %zu, bucket %zuu\n", i, 64 | hash % hashmap.num_buckets); 65 | 
break; 66 | } 67 | 68 | max_key_inserted = i; 69 | } 70 | 71 | printf("Loaded fraction = %.2f\n", max_key_inserted * 1.0 / num_keys); 72 | hashmap.print_stats(); 73 | 74 | for (size_t i = 1; i <= num_keys; i++) { 75 | size_t v; 76 | bool success = hashmap.get(&i, &v); 77 | assert(success == (i <= max_key_inserted)); 78 | if (success) assert(v == i); 79 | } 80 | } 81 | 82 | TEST(Basic, Large) { 83 | phopscotch::HashMap hashmap(kPmemFile, kDefaultFileOffset, 84 | (1ull << 30)); 85 | 86 | size_t num_keys = 32; 87 | std::map insert_success_map; 88 | size_t num_success = 0; 89 | 90 | for (size_t i = 1; i <= num_keys; i++) { 91 | bool success = hashmap.set_nodrain(&i, &i); 92 | insert_success_map[i] = success; 93 | 94 | if (success) num_success++; 95 | } 96 | 97 | printf("Loaded fraction = %.2f\n", num_success * 1.0 / num_keys); 98 | 99 | for (size_t i = 1; i <= num_keys; i++) { 100 | size_t v; 101 | bool success = hashmap.get(&i, &v); 102 | assert(success == insert_success_map[i]); 103 | if (success) assert(v == i); 104 | } 105 | } 106 | 107 | int main(int argc, char **argv) { 108 | testing::InitGoogleTest(&argc, argv); 109 | return RUN_ALL_TESTS(); 110 | } 111 | -------------------------------------------------------------------------------- /scripts/ipmctl_watch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # README: 4 | # 5 | # This script monitors write amplification for DIMM 0 using ipmctl 6 | # 7 | # MediaWrites = Number of 64-byte writes to NVM. The NVM controller issues 8 | # 256-byte writes internally, but ipmctl multiplies it by four 9 | # 10 | # WriteRequests = Number of 64-byte write requests received on the DDR bus 11 | # 12 | # Notes: 13 | # * This script uses `printf` to convert hex to decimal, and `xargs` to 14 | # trim surrounding whitespaces. 15 | # * During idle period, the NVM controller may write to NVM even when no DDR 16 | # commands are received. 
This causes write amplification to be ~100. 17 | # 18 | 19 | dimms=0x0001 # Single non-interleaved 20 | dimms=0x0001,0x0011,0x0021,0x0101,0x0111,0x0121 # All DIMMs at socket 0 21 | 22 | # Sum metric arg #1 from file watch_out 23 | sum_from_watch_out() { 24 | temp_file=$(mktemp) 25 | cat watch_out | grep $1 | cut -d'=' -f 2 > $temp_file 26 | 27 | sum=0 28 | while read hex; do 29 | dec=`printf "%d\n" $hex` 30 | sum=`expr $sum + $dec` 31 | done < ${temp_file} 32 | 33 | echo $sum 34 | rm ${temp_file} 35 | } 36 | 37 | # Regenerate watch_out 38 | refresh_watch_out() { 39 | rm -f watch_out 40 | touch watch_out 41 | sudo ipmctl show -dimm $dimms -performance MediaWrites,WriteRequests,MediaReads,ReadRequests > watch_out 42 | } 43 | 44 | refresh_watch_out 45 | media_writes_0=`sum_from_watch_out MediaWrites` 46 | ddr_writes_0=`sum_from_watch_out WriteRequests` 47 | media_reads_0=`sum_from_watch_out MediaReads` 48 | ddr_reads_0=`sum_from_watch_out ReadRequests` 49 | 50 | sleep_seconds=1 51 | while true; do 52 | sleep $sleep_seconds 53 | 54 | refresh_watch_out 55 | media_writes_1=`sum_from_watch_out MediaWrites` 56 | ddr_writes_1=`sum_from_watch_out WriteRequests` 57 | media_reads_1=`sum_from_watch_out MediaReads` 58 | ddr_reads_1=`sum_from_watch_out ReadRequests` 59 | 60 | media_writes_delta=`calc $media_writes_1 - $media_writes_0 | xargs` 61 | ddr_writes_delta=`calc $ddr_writes_1 - $ddr_writes_0 | xargs` 62 | media_reads_delta=`calc $media_reads_1 - $media_reads_0 | xargs` 63 | ddr_reads_delta=`calc $ddr_reads_1 - $ddr_reads_0 | xargs` 64 | 65 | media_writes_GBs=`python -c "print $media_writes_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 66 | ddr_writes_GBs=`python -c "print $ddr_writes_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 67 | media_reads_GBs=`python -c "print $media_reads_delta * 64.0 / (1024 * 1024 * 1024 * $sleep_seconds)" | xargs` 68 | ddr_reads_GBs=`python -c "print $ddr_reads_delta * 64.0 / (1024 * 1024 * 1024 * 
$sleep_seconds)" | xargs` 69 | write_amp=`calc $media_writes_delta / $ddr_writes_delta | xargs` 70 | read_amp=`calc $media_reads_delta / $ddr_reads_delta | xargs` 71 | 72 | echo "Media writes = $media_writes_delta ($media_writes_GBs GB/s), DDR writes = $ddr_writes_delta ($ddr_writes_GBs GB/s), amplification = $write_amp" 73 | echo "Media reads = $media_reads_delta ($media_reads_GBs GB/s), DDR reads = $ddr_reads_delta ($ddr_reads_GBs GB/s), amplification = $read_amp" 74 | echo "" 75 | 76 | media_writes_0=$media_writes_1 77 | ddr_writes_0=$ddr_writes_1 78 | media_reads_0=$media_reads_1 79 | ddr_reads_0=$ddr_reads_1 80 | done 81 | -------------------------------------------------------------------------------- /log_store/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../common.h" 8 | #include "rotating_counter.h" 9 | 10 | static constexpr const char *kFileName = "/mnt/pmem12/raft_log"; 11 | static constexpr size_t kNumMeasurements = 2; 12 | static constexpr size_t kNumIters = 1000000; 13 | 14 | // Amount of data appended to the log in on iteration 15 | static constexpr size_t kMaxLogDataSize = 4096; 16 | 17 | void counter_only_bench(uint8_t *pbuf) { 18 | Counter ctr(pbuf, true /* create a new counter */); 19 | 20 | for (size_t msr = 0; msr < kNumMeasurements; msr++) { 21 | struct timespec bench_start; 22 | clock_gettime(CLOCK_REALTIME, &bench_start); 23 | 24 | for (size_t i = 0; i < kNumIters; i++) ctr.increment_naive(1); 25 | 26 | double bench_seconds = sec_since(bench_start); 27 | printf("Naive counter: %.2f M increments/s\n", 28 | kNumIters / (bench_seconds * 1000000)); 29 | } 30 | 31 | for (size_t msr = 0; msr < kNumMeasurements; msr++) { 32 | struct timespec bench_start; 33 | clock_gettime(CLOCK_REALTIME, &bench_start); 34 | 35 | for (size_t i = 0; i < kNumIters; i++) ctr.increment_rotate(1); 36 | 37 | double bench_seconds = 
sec_since(bench_start); 38 | printf("Rotating counter: %.2f M increments/s\n", 39 | kNumIters / (bench_seconds * 1000000)); 40 | } 41 | } 42 | 43 | class Log { 44 | public: 45 | // Assume pbuf is large enough to never overflow 46 | Log(uint8_t *pbuf) { 47 | ctr = Counter(pbuf, true /* create_new */); 48 | log_base_addr = pbuf + Counter::get_reqd_space(); 49 | } 50 | 51 | // Append with naive counter incrementing 52 | void append_naive(uint8_t *data, size_t data_size) { 53 | pmem_memcpy_persist(log_base_addr + ctr.v_value, data, data_size); 54 | ctr.increment_naive(data_size); 55 | } 56 | 57 | // Append with rotating counter incrementing 58 | void append_rotating(uint8_t *data, size_t data_size) { 59 | pmem_memcpy_persist(log_base_addr + ctr.v_value, data, data_size); 60 | ctr.increment_rotate(data_size); 61 | } 62 | 63 | Counter ctr; 64 | uint8_t *log_base_addr = nullptr; // Starting address of log contents on pmem 65 | }; 66 | 67 | void log_bench(uint8_t *pbuf) { 68 | uint8_t source[kMaxLogDataSize] = {0}; 69 | 70 | printf("write_bytes naive_GBps rotating_GBps\n"); 71 | 72 | // Sweep over write sizes 73 | for (size_t write_sz = 64; write_sz <= kMaxLogDataSize; write_sz *= 2) { 74 | double naive_GBps, rotating_GBps; 75 | 76 | { 77 | // Naive log 78 | Log log(pbuf); 79 | struct timespec bench_start; 80 | clock_gettime(CLOCK_REALTIME, &bench_start); 81 | 82 | for (size_t i = 0; i < kNumIters; i++) { 83 | // Modify the source 84 | for (size_t j = 0; j < write_sz / 64; j += 64) source[j]++; 85 | log.append_naive(source, write_sz); 86 | } 87 | 88 | double bench_seconds = sec_since(bench_start); 89 | naive_GBps = kNumIters * write_sz / (bench_seconds * GB(1)); 90 | } 91 | 92 | { 93 | // Rotating log 94 | Log log(pbuf); 95 | struct timespec bench_start; 96 | clock_gettime(CLOCK_REALTIME, &bench_start); 97 | 98 | for (size_t i = 0; i < kNumIters; i++) { 99 | // Modify the source 100 | for (size_t j = 0; j < write_sz / 64; j += 64) source[j]++; 101 | 
log.append_rotating(source, write_sz); 102 | } 103 | 104 | double bench_seconds = sec_since(bench_start); 105 | rotating_GBps = kNumIters * write_sz / (bench_seconds * GB(1)); 106 | } 107 | 108 | printf("%zu %.2f %.2f\n", write_sz, naive_GBps, rotating_GBps); 109 | } 110 | } 111 | 112 | int main() { 113 | size_t mapped_len; 114 | int is_pmem; 115 | uint8_t *pbuf = reinterpret_cast( 116 | pmem_map_file(kFileName, 0, 0, 0666, &mapped_len, &is_pmem)); 117 | 118 | assert(pbuf != nullptr); 119 | assert(mapped_len >= Counter::get_reqd_space()); 120 | 121 | counter_only_bench(pbuf); 122 | for (size_t msr = 0; msr < kNumMeasurements; msr++) log_bench(pbuf); 123 | 124 | pmem_unmap(pbuf, mapped_len); 125 | exit(0); 126 | } 127 | -------------------------------------------------------------------------------- /utils/timer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | 17 | // High-resolution (~10 ns) timestamps, using fences to prevent reordering and 18 | // ensure exactly the desired regions are measured. 19 | 20 | #include 21 | 22 | namespace timer { 23 | 24 | // Start/Stop return absolute timestamps and must be placed immediately before 25 | // and after the region to measure. We provide separate Start/Stop functions 26 | // because they use different fences. 
27 | // 28 | // Background: RDTSC is not 'serializing'; earlier instructions may complete 29 | // after it, and/or later instructions may complete before it. 'Fences' ensure 30 | // regions' elapsed times are independent of such reordering. The only 31 | // documented unprivileged serializing instruction is CPUID, which acts as a 32 | // full fence (no reordering across it in either direction). Unfortunately 33 | // the latency of CPUID varies wildly (perhaps made worse by not initializing 34 | // its EAX input). Because it cannot reliably be deducted from the region's 35 | // elapsed time, it must not be included in the region to measure (i.e. 36 | // between the two RDTSC). 37 | // 38 | // The newer RDTSCP is sometimes described as serializing, but it actually 39 | // only serves as a half-fence with release semantics. Although all 40 | // instructions in the region will complete before the final timestamp is 41 | // captured, subsequent instructions may leak into the region and increase the 42 | // elapsed time. Inserting another fence after the final RDTSCP would prevent 43 | // such reordering without affecting the measured region. 44 | // 45 | // Fortunately, such a fence exists. The LFENCE instruction is only documented 46 | // to delay later loads until earlier loads are visible. However, Intel's 47 | // reference manual says it acts as a full fence (waiting until all earlier 48 | // instructions have completed, and delaying later instructions until it 49 | // completes). AMD assigns the same behavior to MFENCE. 50 | // 51 | // We need a fence before the initial RDTSC to prevent earlier instructions 52 | // from leaking into the region, and arguably another after RDTSC to avoid 53 | // region instructions from completing before the timestamp is recorded. 
namespace timer {

// Returns a 64-bit timestamp in units of TSC 'ticks'; to convert to seconds,
// divide by the invariant TSC frequency. Place immediately BEFORE the region
// to measure. The LFENCE before RDTSC keeps earlier instructions out of the
// measured region; the LFENCE after keeps region instructions from starting
// before the timestamp is captured.
inline uint64_t Start() {
  uint64_t t;
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
  return t;
}

// Returns a 64-bit timestamp; place immediately AFTER the region to measure.
// RDTSCP waits until all earlier instructions have completed; the trailing
// LFENCE stops later instructions from leaking back into the region.
//
// Fix: marked inline. This function is defined in a header; without inline
// it has one definition per including translation unit, causing
// multiple-definition errors at link time (Start() was already inline).
inline uint64_t Stop() {
  uint64_t t;
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
  return t;
}

}  // namespace timer
26 | # IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 27 | # OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 28 | # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 29 | # OTHER DEALINGS IN THE SOFTWARE. 30 | # 31 | # For more information, please refer to 32 | 33 | import os 34 | import ycm_core 35 | 36 | from os.path import expanduser 37 | home_dir = expanduser("~") 38 | 39 | flags = [ 40 | '-x', 41 | 'c++', 42 | '-I' + str(home_dir) + '/rdma_bench', 43 | '-Wall', 44 | '-Werror', 45 | '-Wextra', 46 | '-Wno-inline', 47 | '-Wno-unused-function', 48 | '-Wold-style-cast', 49 | '-Wsign-conversion', 50 | '-std=c++11', 51 | ] 52 | 53 | 54 | # Set this to the absolute path to the folder (NOT the file!) containing the 55 | # compile_commands.json file to use that instead of 'flags'. See here for 56 | # more details: http://clang.llvm.org/docs/JSONCompilationDatabase.html 57 | # 58 | # You can get CMake to generate this file for you by adding: 59 | # set( CMAKE_EXPORT_COMPILE_COMMANDS 1 ) 60 | # to your CMakeLists.txt file. 61 | # 62 | # Most projects will NOT need to set this to anything; you can just change the 63 | # 'flags' list of compilation flags. Notice that YCM itself uses that approach. 
64 | compilation_database_folder = '' 65 | 66 | if os.path.exists( compilation_database_folder ): 67 | database = ycm_core.CompilationDatabase( compilation_database_folder ) 68 | else: 69 | database = None 70 | 71 | SOURCE_EXTENSIONS = [ '.C', '.cpp', '.cxx', '.cc', '.c', '.m', '.mm' ] 72 | 73 | def DirectoryOfThisScript(): 74 | return os.path.dirname( os.path.abspath( __file__ ) ) 75 | 76 | 77 | def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): 78 | if not working_directory: 79 | return list( flags ) 80 | new_flags = [] 81 | make_next_absolute = False 82 | path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] 83 | for flag in flags: 84 | new_flag = flag 85 | 86 | if make_next_absolute: 87 | make_next_absolute = False 88 | if not flag.startswith( '/' ): 89 | new_flag = os.path.join( working_directory, flag ) 90 | 91 | for path_flag in path_flags: 92 | if flag == path_flag: 93 | make_next_absolute = True 94 | break 95 | 96 | if flag.startswith( path_flag ): 97 | path = flag[ len( path_flag ): ] 98 | new_flag = path_flag + os.path.join( working_directory, path ) 99 | break 100 | 101 | if new_flag: 102 | new_flags.append( new_flag ) 103 | return new_flags 104 | 105 | 106 | def IsHeaderFile( filename ): 107 | extension = os.path.splitext( filename )[ 1 ] 108 | return extension in [ '.H', '.h', '.hxx', '.hpp', '.hh' ] 109 | 110 | 111 | def GetCompilationInfoForFile( filename ): 112 | # The compilation_commands.json file generated by CMake does not have entries 113 | # for header files. So we do our best by asking the db for flags for a 114 | # corresponding source file, if any. If one exists, the flags for that file 115 | # should be good enough. 
116 | if IsHeaderFile( filename ): 117 | basename = os.path.splitext( filename )[ 0 ] 118 | for extension in SOURCE_EXTENSIONS: 119 | replacement_file = basename + extension 120 | if os.path.exists( replacement_file ): 121 | compilation_info = database.GetCompilationInfoForFile( 122 | replacement_file ) 123 | if compilation_info.compiler_flags_: 124 | return compilation_info 125 | return None 126 | return database.GetCompilationInfoForFile( filename ) 127 | 128 | 129 | def FlagsForFile( filename, **kwargs ): 130 | if database: 131 | # Bear in mind that compilation_info.compiler_flags_ does NOT return a 132 | # python list, but a "list-like" StringVec object 133 | compilation_info = GetCompilationInfoForFile( filename ) 134 | if not compilation_info: 135 | return None 136 | 137 | final_flags = MakeRelativePathsInFlagsAbsolute( 138 | compilation_info.compiler_flags_, 139 | compilation_info.compiler_working_dir_ ) 140 | 141 | else: 142 | relative_to = DirectoryOfThisScript() 143 | final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) 144 | 145 | return { 146 | 'flags': final_flags, 147 | 'do_cache': True 148 | } 149 | 150 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/latency.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file latency.h 3 | * @author MICA authors, akalia 4 | */ 5 | 6 | #pragma once 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | /* 15 | * @brief Fast but approximate latency distribution measurement for latency 16 | * values up to 4000 microseconds (i.e., 4 ms). Adding a latency sample is 17 | * fast, but computing a statistic is slow. 
18 | */ 19 | class Latency { 20 | public: 21 | Latency() { reset(); } 22 | 23 | void reset() { memset(this, 0, sizeof(Latency)); } 24 | 25 | /// Add a latency sample 26 | void update(size_t us) { 27 | if (us < 128) 28 | bin0_[us]++; 29 | else if (us < 384) 30 | bin1_[(us - 128) / 2]++; 31 | else if (us < 896) 32 | bin2_[(us - 384) / 4]++; 33 | else if (us < 1920) 34 | bin3_[(us - 896) / 8]++; 35 | else if (us < 3968) 36 | bin4_[(us - 1920) / 16]++; 37 | else 38 | bin5_++; 39 | } 40 | 41 | /// Combine two distributions 42 | Latency& operator+=(const Latency& o) { 43 | size_t i; 44 | for (i = 0; i < 128; i++) bin0_[i] += o.bin0_[i]; 45 | for (i = 0; i < 128; i++) bin1_[i] += o.bin1_[i]; 46 | for (i = 0; i < 128; i++) bin2_[i] += o.bin2_[i]; 47 | for (i = 0; i < 128; i++) bin3_[i] += o.bin3_[i]; 48 | for (i = 0; i < 128; i++) bin4_[i] += o.bin4_[i]; 49 | bin5_ += o.bin5_; 50 | return *this; 51 | } 52 | 53 | /// Return the total number of samples 54 | size_t count() const { 55 | size_t count = 0; 56 | size_t i; 57 | for (i = 0; i < 128; i++) count += bin0_[i]; 58 | for (i = 0; i < 128; i++) count += bin1_[i]; 59 | for (i = 0; i < 128; i++) count += bin2_[i]; 60 | for (i = 0; i < 128; i++) count += bin3_[i]; 61 | for (i = 0; i < 128; i++) count += bin4_[i]; 62 | count += bin5_; 63 | return count; 64 | } 65 | 66 | /// Return the (approximate) sum of all samples 67 | size_t sum() const { 68 | size_t sum = 0; 69 | size_t i; 70 | for (i = 0; i < 128; i++) sum += bin0_[i] * (0 + i * 1); 71 | for (i = 0; i < 128; i++) sum += bin1_[i] * (128 + i * 2); 72 | for (i = 0; i < 128; i++) sum += bin2_[i] * (384 + i * 4); 73 | for (i = 0; i < 128; i++) sum += bin3_[i] * (896 + i * 8); 74 | for (i = 0; i < 128; i++) sum += bin4_[i] * (1920 + i * 16); 75 | sum += bin5_ * 3968; 76 | return sum; 77 | } 78 | 79 | /// Return the (approximate) average sample 80 | double avg() const { 81 | return static_cast(sum()) / 82 | static_cast(std::max(size_t(1), count())); 83 | } 84 | 85 | /// Return 
the (approximate) minimum sample 86 | size_t min() const { 87 | size_t i; 88 | for (i = 0; i < 128; i++) 89 | if (bin0_[i] != 0) return 0 + i * 1; 90 | for (i = 0; i < 128; i++) 91 | if (bin1_[i] != 0) return 128 + i * 2; 92 | for (i = 0; i < 128; i++) 93 | if (bin2_[i] != 0) return 384 + i * 4; 94 | for (i = 0; i < 128; i++) 95 | if (bin3_[i] != 0) return 896 + i * 8; 96 | for (i = 0; i < 128; i++) 97 | if (bin4_[i] != 0) return 1920 + i * 16; 98 | // if (bin5_ != 0) return 3968; 99 | return 3968; 100 | } 101 | 102 | /// Return the (approximate) max sample 103 | size_t max() const { 104 | int64_t i; 105 | if (bin5_ != 0) return 3968; 106 | for (i = 127; i >= 0; i--) 107 | if (bin4_[i] != 0) return 1920 + static_cast(i) * 16; 108 | for (i = 127; i >= 0; i--) 109 | if (bin3_[i] != 0) return 896 + static_cast(i) * 8; 110 | for (i = 127; i >= 0; i--) 111 | if (bin2_[i] != 0) return 384 + static_cast(i) * 4; 112 | for (i = 127; i >= 0; i--) 113 | if (bin1_[i] != 0) return 128 + static_cast(i) * 2; 114 | for (i = 127; i >= 0; i--) 115 | if (bin0_[i] != 0) return 0 + static_cast(i) * 1; 116 | return 0; 117 | } 118 | 119 | /// Return the (approximate) p-th percentile sample 120 | size_t perc(double p) const { 121 | size_t i; 122 | int64_t thres = static_cast(p * static_cast(count())); 123 | for (i = 0; i < 128; i++) 124 | if ((thres -= static_cast(bin0_[i])) < 0) return 0 + i * 1; 125 | for (i = 0; i < 128; i++) 126 | if ((thres -= static_cast(bin1_[i])) < 0) return 128 + i * 2; 127 | for (i = 0; i < 128; i++) 128 | if ((thres -= static_cast(bin2_[i])) < 0) return 384 + i * 4; 129 | for (i = 0; i < 128; i++) 130 | if ((thres -= static_cast(bin3_[i])) < 0) return 896 + i * 8; 131 | for (i = 0; i < 128; i++) 132 | if ((thres -= static_cast(bin4_[i])) < 0) return 1920 + i * 16; 133 | return 3968; 134 | } 135 | 136 | /// Print the distribution to a file 137 | void print(FILE* fp) const { 138 | size_t i; 139 | for (i = 0; i < 128; i++) 140 | if (bin0_[i] != 0) 141 | 
fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 0 + i * 1, bin0_[i]); 142 | for (i = 0; i < 128; i++) 143 | if (bin1_[i] != 0) 144 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 128 + i * 2, bin1_[i]); 145 | for (i = 0; i < 128; i++) 146 | if (bin2_[i] != 0) 147 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 384 + i * 4, bin2_[i]); 148 | for (i = 0; i < 128; i++) 149 | if (bin3_[i] != 0) 150 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 896 + i * 8, bin3_[i]); 151 | for (i = 0; i < 128; i++) 152 | if (bin4_[i] != 0) 153 | fprintf(fp, "%4" PRIu64 " %6" PRIu64 "\n", 1920 + i * 16, bin4_[i]); 154 | if (bin5_ != 0) fprintf(fp, "%4d %6" PRIu64 "\n", 3968, bin5_); 155 | } 156 | 157 | private: 158 | // [0, 128) us 159 | size_t bin0_[128]; 160 | // [128, 384) us 161 | size_t bin1_[128]; 162 | // [384, 896) us 163 | size_t bin2_[128]; 164 | // [896, 1920) us 165 | size_t bin3_[128]; 166 | // [1920, 3968) us 167 | size_t bin4_[128]; 168 | // [3968, inf) us 169 | size_t bin5_; 170 | }; 171 | -------------------------------------------------------------------------------- /rdma/rdma-write-bw/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "../libhrd_cpp/hrd.h" 14 | 15 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 16 | DEFINE_uint64(machine_id, 0, "Index among client machines (for clients)"); 17 | DEFINE_uint64(min_write_size, 0, "Client's min RDMA write size"); 18 | DEFINE_uint64(max_write_size, 0, "Client's max RDMA write size"); 19 | DEFINE_uint64(window_size, 0, "Number of writes outstanding at client"); 20 | 21 | static constexpr size_t kPmemFileSize = GB(4); 22 | 23 | // If true, server zeroes out its buffer and reports write throughput 24 | static constexpr bool kZeroServerBuf = true; 25 | 26 | // If true, we use a devdax-mapped buffer. 
If false, we use DRAM hugepages. 27 | static constexpr bool kUsePmem = true; 28 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 29 | 30 | // If true, we use read-after-write to force persistence 31 | static constexpr bool kReadAfterWrite = true; 32 | 33 | static constexpr bool kVerbose = false; 34 | 35 | // Map the devdax buffer at the server 36 | uint8_t* get_pmem_buf_server() { 37 | int fd = open(kPmemFile, O_RDWR); 38 | rt_assert(fd >= 0, "devdax open failed"); 39 | 40 | void* buf = 41 | mmap(nullptr, kPmemFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 42 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 43 | rt_assert(reinterpret_cast(buf) % 256 == 0); 44 | 45 | return reinterpret_cast(buf); 46 | } 47 | 48 | void server_func() { 49 | uint8_t* pmem_buf = nullptr; 50 | if (kUsePmem) { 51 | pmem_buf = get_pmem_buf_server(); 52 | 53 | // Fill in the persistent buffer, also sanity-check local write throughput 54 | if (kZeroServerBuf) { 55 | printf("main: Zero-ing pmem buffer\n"); 56 | struct timespec start; 57 | clock_gettime(CLOCK_REALTIME, &start); 58 | pmem_memset_persist(pmem_buf, 0, kPmemFileSize); 59 | printf("main: Zero-ed %f MB of pmem at %.1f GB/s\n", 60 | kPmemFileSize * 1.0 / MB(1), 61 | kPmemFileSize / (1000000000.0 * sec_since(start))); 62 | } 63 | } 64 | 65 | struct hrd_conn_config_t conn_config; 66 | conn_config.num_qps = 1; 67 | conn_config.use_uc = false; 68 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 69 | conn_config.buf_size = kPmemFileSize; 70 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 71 | 72 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 73 | &conn_config, nullptr /* dgram config */); 74 | 75 | // Publish server QP 76 | auto srv_qp_name = std::string("server"); 77 | hrd_publish_conn_qp(cb, 0, srv_qp_name.c_str()); 78 | 79 | printf("main: Server published. 
Waiting for client\n"); 80 | 81 | auto conn_name = std::string("client"); 82 | hrd_qp_attr_t* conn_qp = nullptr; 83 | while (conn_qp == nullptr) { 84 | conn_qp = hrd_get_published_qp(conn_name.c_str()); 85 | if (conn_qp == nullptr) { 86 | usleep(200000); 87 | continue; 88 | } 89 | 90 | printf("main: Server found client! Connecting..\n"); 91 | hrd_connect_qp(cb, 0, conn_qp); 92 | } 93 | 94 | hrd_publish_ready("server"); 95 | printf("main: Server ready. Going to sleep.\n"); 96 | 97 | while (true) sleep(1); 98 | } 99 | 100 | void client_func() { 101 | hrd_conn_config_t conn_config; 102 | 103 | conn_config.num_qps = 1; 104 | conn_config.use_uc = false; 105 | conn_config.prealloc_buf = nullptr; 106 | conn_config.buf_size = FLAGS_max_write_size; 107 | conn_config.buf_shm_key = 3185; 108 | 109 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 110 | &conn_config, nullptr /* dgram config */); 111 | memset(const_cast(cb->conn_buf), 31, FLAGS_max_write_size); 112 | 113 | hrd_publish_conn_qp(cb, 0, "client"); 114 | printf("main: Client published. Waiting for server.\n"); 115 | 116 | hrd_qp_attr_t* srv_qp = nullptr; 117 | while (srv_qp == nullptr) { 118 | srv_qp = hrd_get_published_qp("server"); 119 | if (srv_qp == nullptr) usleep(2000); 120 | } 121 | 122 | printf("main: Found server. 
Connecting..\n"); 123 | hrd_connect_qp(cb, 0, srv_qp); 124 | printf("main: Client connected!\n"); 125 | 126 | hrd_wait_till_ready("server"); 127 | 128 | struct timespec start; 129 | size_t total_bytes_written = 0; 130 | size_t pending_ops = 0; 131 | size_t remote_offset = 0; 132 | size_t cur_write_size = FLAGS_min_write_size; 133 | 134 | clock_gettime(CLOCK_REALTIME, &start); 135 | 136 | while (true) { 137 | if (pending_ops < FLAGS_window_size) { 138 | struct ibv_send_wr write_wr, read_wr, *bad_send_wr; 139 | struct ibv_sge write_sge, read_sge; 140 | 141 | // RDMA-write kClientWriteSize bytes 142 | write_sge.addr = reinterpret_cast(&cb->conn_buf[0]); 143 | write_sge.length = cur_write_size; 144 | write_sge.lkey = cb->conn_buf_mr->lkey; 145 | 146 | write_wr.opcode = IBV_WR_RDMA_WRITE; 147 | write_wr.num_sge = 1; 148 | write_wr.sg_list = &write_sge; 149 | write_wr.send_flags = kReadAfterWrite ? 0 : IBV_SEND_SIGNALED; 150 | 151 | if (remote_offset + cur_write_size > kPmemFileSize) remote_offset = 0; 152 | write_wr.wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 153 | write_wr.wr.rdma.rkey = srv_qp->rkey; 154 | write_wr.next = kReadAfterWrite ? &read_wr : nullptr; 155 | 156 | remote_offset += cur_write_size; 157 | 158 | if (kReadAfterWrite) { 159 | // RDMA-read 8 bytes from the end of the written buffer 160 | read_sge.addr = reinterpret_cast(&cb->conn_buf[0]); 161 | read_sge.length = sizeof(size_t); 162 | read_sge.lkey = cb->conn_buf_mr->lkey; 163 | 164 | read_wr.opcode = IBV_WR_RDMA_READ; 165 | read_wr.num_sge = 1; 166 | read_wr.sg_list = &read_sge; 167 | read_wr.send_flags = IBV_SEND_SIGNALED; 168 | read_wr.wr.rdma.remote_addr = 169 | write_wr.wr.rdma.remote_addr + cur_write_size - sizeof(size_t); 170 | read_wr.wr.rdma.rkey = srv_qp->rkey; 171 | read_wr.next = nullptr; 172 | } 173 | 174 | int ret = ibv_post_send(cb->conn_qp[0], &write_wr, &bad_send_wr); 175 | rt_assert(ret == 0); 176 | pending_ops++; 177 | 178 | if (kVerbose) printf("Client posted. 
Pending = %zu\n", pending_ops); 179 | } 180 | 181 | if (pending_ops == FLAGS_window_size) { 182 | struct ibv_wc wc; 183 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); 184 | pending_ops--; 185 | 186 | if (kVerbose) printf("Client polled. Pending = %zu\n", pending_ops); 187 | total_bytes_written += cur_write_size; 188 | } 189 | 190 | if (total_bytes_written >= GB(4)) { 191 | double secs = sec_since(start); 192 | 193 | printf("Client: size %zu, %.2f Gbps.\n", cur_write_size, 194 | total_bytes_written * 8 / (1000000000 * secs)); 195 | 196 | cur_write_size *= 2; 197 | printf("doubling to %zu\n", cur_write_size); 198 | if (cur_write_size > FLAGS_max_write_size) { 199 | cur_write_size = FLAGS_min_write_size; 200 | printf("back to %zu\n", cur_write_size); 201 | } 202 | 203 | total_bytes_written = 0; 204 | clock_gettime(CLOCK_REALTIME, &start); 205 | } 206 | } 207 | } 208 | 209 | int main(int argc, char* argv[]) { 210 | gflags::ParseCommandLineFlags(&argc, &argv, true); 211 | if (FLAGS_is_client == 1) { 212 | auto client_thread = std::thread(client_func); 213 | client_thread.join(); 214 | } else { 215 | auto t = std::thread(server_func); 216 | t.join(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /rdma/libhrd_cpp/hrd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "../../common.h" 24 | 25 | static constexpr size_t kRoCE = false; ///< Use RoCE 26 | 27 | // Maximum inline data so that WQEs fit in two cache lines (max_sge = 1): 28 | // * mlx4, RC: 88 29 | // * mlx4, UD: 60 30 | static constexpr size_t kHrdMaxInline = 88; 31 | static constexpr size_t kHrdSQDepth = 128; ///< Depth of all 
SEND queues 32 | static constexpr size_t kHrdRQDepth = 2048; ///< Depth of all RECV queues 33 | 34 | static constexpr uint32_t kHrdInvalidNUMANode = 9; 35 | static constexpr uint32_t kHrdDefaultPSN = 3185; 36 | static constexpr uint32_t kHrdDefaultQKey = 0x11111111; 37 | static constexpr size_t kHrdMaxLID = 256; 38 | static constexpr size_t kHrdMaxUDQPs = 256; ///< Maximum number of UD QPs 39 | 40 | static constexpr size_t kHrdQPNameSize = 200; 41 | 42 | // This needs to be a macro because we don't have Mellanox OFED for Debian 43 | #define kHrdMlx5Atomics false 44 | #define kHrdReservedNamePrefix "__HRD_RESERVED_NAME_PREFIX" 45 | 46 | /// Registry info about a QP 47 | struct hrd_qp_attr_t { 48 | char name[kHrdQPNameSize]; 49 | uint16_t lid; 50 | uint32_t qpn; 51 | union ibv_gid gid; ///< GID, used for only RoCE 52 | 53 | // Info about the RDMA buffer associated with this QP 54 | uintptr_t buf_addr; 55 | uint32_t buf_size; 56 | uint32_t rkey; 57 | }; 58 | 59 | struct hrd_conn_config_t { 60 | // Required params 61 | size_t num_qps = 0; // num_qps > 0 is used as a validity check 62 | bool use_uc; 63 | volatile uint8_t* prealloc_buf; 64 | size_t buf_size; 65 | int buf_shm_key; 66 | 67 | // Optional params with their default values 68 | size_t sq_depth = kHrdSQDepth; 69 | size_t max_rd_atomic = 16; 70 | 71 | std::string to_string() { 72 | std::ostringstream ret; 73 | ret << "[num_qps " << std::to_string(num_qps) << ", use_uc " 74 | << std::to_string(use_uc) << ", buf size " << std::to_string(buf_size) 75 | << ", shm key " << std::to_string(buf_shm_key) << ", sq_depth " 76 | << std::to_string(sq_depth) << ", max_rd_atomic " 77 | << std::to_string(max_rd_atomic) << "]"; 78 | return ret.str(); 79 | } 80 | }; 81 | 82 | struct hrd_dgram_config_t { 83 | size_t num_qps; 84 | volatile uint8_t* prealloc_buf; 85 | size_t buf_size; 86 | int buf_shm_key; 87 | }; 88 | 89 | struct hrd_ctrl_blk_t { 90 | size_t local_hid; // Local ID on the machine this process runs on 91 | 92 | // 
Info about the device/port to use for this control block 93 | size_t port_index; // User-supplied. 0-based across all devices 94 | size_t numa_node; // NUMA node id 95 | 96 | /// InfiniBand info resolved from \p phy_port, must be filled by constructor. 97 | struct { 98 | int device_id; // Device index in list of verbs devices 99 | struct ibv_context* ib_ctx; // The verbs device context 100 | uint8_t dev_port_id; // 1-based port ID in device. 0 is invalid. 101 | uint16_t port_lid; // LID of phy_port. 0 is invalid. 102 | 103 | union ibv_gid gid; // GID, used only for RoCE 104 | } resolve; 105 | 106 | struct ibv_pd* pd; // A protection domain for this control block 107 | 108 | // Connected QPs 109 | hrd_conn_config_t conn_config; 110 | struct ibv_qp** conn_qp; 111 | struct ibv_cq** conn_cq; 112 | volatile uint8_t* conn_buf; // A buffer for RDMA over RC/UC QPs 113 | struct ibv_mr* conn_buf_mr; 114 | 115 | // Datagram QPs 116 | size_t num_dgram_qps; 117 | struct ibv_qp* dgram_qp[kHrdMaxUDQPs]; 118 | struct ibv_cq *dgram_send_cq[kHrdMaxUDQPs], *dgram_recv_cq[kHrdMaxUDQPs]; 119 | volatile uint8_t* dgram_buf; // A buffer for RECVs on dgram QPs 120 | size_t dgram_buf_size; 121 | int dgram_buf_shm_key; 122 | struct ibv_mr* dgram_buf_mr; 123 | 124 | uint8_t pad[64]; 125 | }; 126 | 127 | // Major initialzation functions 128 | hrd_ctrl_blk_t* hrd_ctrl_blk_init(size_t local_hid, size_t port_index, 129 | size_t numa_node, 130 | hrd_conn_config_t* conn_config, 131 | hrd_dgram_config_t* dgram_config); 132 | 133 | int hrd_ctrl_blk_destroy(hrd_ctrl_blk_t* cb); 134 | 135 | // Debug 136 | void hrd_ibv_devinfo(void); 137 | 138 | void hrd_resolve_port_index(hrd_ctrl_blk_t* cb, size_t port_index); 139 | void hrd_create_conn_qps(hrd_ctrl_blk_t* cb); 140 | void hrd_create_dgram_qps(hrd_ctrl_blk_t* cb); 141 | 142 | void hrd_connect_qp(hrd_ctrl_blk_t* cb, size_t conn_qp_idx, 143 | hrd_qp_attr_t* remote_qp_attr); 144 | 145 | // Post 1 RECV for this queue pair for this buffer. Low performance. 
146 | void hrd_post_dgram_recv(struct ibv_qp* qp, void* buf_addr, size_t len, 147 | uint32_t lkey); 148 | 149 | // Fill @wc with @num_comps comps from this @cq. Exit on error. 150 | static inline void hrd_poll_cq(struct ibv_cq* cq, int num_comps, 151 | struct ibv_wc* wc) { 152 | int comps = 0; 153 | while (comps < static_cast(num_comps)) { 154 | int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]); 155 | if (new_comps != 0) { 156 | // Ideally, we should check from comps -> new_comps - 1 157 | if (wc[comps].status != 0) { 158 | fprintf(stderr, "Bad wc status %d\n", wc[comps].status); 159 | exit(0); 160 | } 161 | 162 | comps += new_comps; 163 | } 164 | } 165 | } 166 | 167 | // Fill @wc with @num_comps comps from this @cq. Return -1 on error, else 0. 168 | static inline int hrd_poll_cq_ret(struct ibv_cq* cq, int num_comps, 169 | struct ibv_wc* wc) { 170 | int comps = 0; 171 | 172 | while (comps < num_comps) { 173 | int new_comps = ibv_poll_cq(cq, num_comps - comps, &wc[comps]); 174 | if (new_comps != 0) { 175 | // Ideally, we should check from comps -> new_comps - 1 176 | if (wc[comps].status != 0) { 177 | fprintf(stderr, "Bad wc status %d\n", wc[comps].status); 178 | return -1; // Return an error so the caller can clean up 179 | } 180 | 181 | comps += new_comps; 182 | } 183 | } 184 | 185 | return 0; // Success 186 | } 187 | 188 | // Registry functions 189 | void hrd_publish(const char* key, void* value, size_t len); 190 | int hrd_get_published(const char* key, void** value); 191 | 192 | // Publish the nth connected queue pair from this cb with this name 193 | void hrd_publish_conn_qp(hrd_ctrl_blk_t* cb, size_t n, const char* qp_name); 194 | 195 | // Publish the nth datagram queue pair from this cb with this name 196 | void hrd_publish_dgram_qp(hrd_ctrl_blk_t* cb, size_t n, const char* qp_name); 197 | 198 | struct hrd_qp_attr_t* hrd_get_published_qp(const char* qp_name); 199 | 200 | void hrd_publish_ready(const char* qp_name); 201 | void 
hrd_wait_till_ready(const char* qp_name); 202 | 203 | void hrd_close_memcached(); 204 | 205 | // Utility functions 206 | static inline uint32_t hrd_fastrand(uint64_t* seed) { 207 | *seed = *seed * 1103515245 + 12345; 208 | return static_cast((*seed) >> 32); 209 | } 210 | 211 | static inline size_t hrd_get_cycles() { 212 | uint64_t rax; 213 | uint64_t rdx; 214 | asm volatile("rdtsc" : "=a"(rax), "=d"(rdx)); 215 | return static_cast((rdx << 32) | rax); 216 | } 217 | 218 | static inline int hrd_is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); } 219 | 220 | uint8_t* hrd_malloc_socket(int shm_key, size_t size, size_t socket_id); 221 | int hrd_free(int shm_key, void* shm_buf); 222 | void hrd_red_printf(const char* format, ...); 223 | void hrd_get_formatted_time(char* timebuf); 224 | void hrd_nano_sleep(size_t ns); 225 | char* hrd_getenv(const char* name); 226 | void hrd_bind_to_core(std::thread& thread, size_t n); 227 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file common.h 3 | * @brief Misc convenience functions and macros 4 | */ 5 | 6 | #pragma once 7 | #define likely(x) __builtin_expect(!!(x), 1) 8 | #define unlikely(x) __builtin_expect(!!(x), 0) 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #define _unused(x) ((void)(x)) // Make production build happy 18 | 19 | #define KB(x) (static_cast(x) << 10) 20 | #define MB(x) (static_cast(x) << 20) 21 | #define GB(x) (static_cast(x) << 30) 22 | #define TB(x) (static_cast(x) << 40) 23 | 24 | static void memory_barrier() { asm volatile("" ::: "memory"); } 25 | static void lfence() { asm volatile("lfence" ::: "memory"); } 26 | static void sfence() { asm volatile("sfence" ::: "memory"); } 27 | static void mfence() { asm volatile("mfence" ::: "memory"); } 28 | 29 | #define pmem_clflushopt(addr) \ 30 | asm volatile(".byte 0x66; 
clflush %0" : "+m"(*(volatile char *)(addr))); 31 | 32 | #define pmem_clwb(addr) \ 33 | asm volatile(".byte 0x66; xsaveopt %0" : "+m"(*(volatile char *)(addr))); 34 | 35 | template 36 | static constexpr bool is_power_of_two(T x) { 37 | return x && ((x & T(x - 1)) == 0); 38 | } 39 | 40 | template 41 | static constexpr T roundup(T x) { 42 | static_assert(is_power_of_two(PowerOfTwoNumber), 43 | "PowerOfTwoNumber must be a power of 2"); 44 | return ((x) + T(PowerOfTwoNumber - 1)) & (~T(PowerOfTwoNumber - 1)); 45 | } 46 | 47 | class SlowRand { 48 | std::random_device rand_dev; // Non-pseudorandom seed for twister 49 | std::mt19937_64 mt; 50 | std::uniform_int_distribution dist; 51 | 52 | public: 53 | SlowRand() : mt(rand_dev()), dist(0, UINT64_MAX) {} 54 | 55 | inline uint64_t next_u64() { return dist(mt); } 56 | }; 57 | 58 | class FastRand { 59 | public: 60 | uint64_t seed; 61 | 62 | /// Create a FastRand using a seed from SlowRand 63 | FastRand() { 64 | SlowRand slow_rand; 65 | seed = slow_rand.next_u64(); 66 | } 67 | 68 | inline uint32_t next_u32() { 69 | seed = seed * 1103515245 + 12345; 70 | return static_cast(seed >> 32); 71 | } 72 | }; 73 | 74 | /// Check a condition at runtime. If the condition is false, throw exception. 75 | static inline void rt_assert(bool condition, std::string throw_str, char *s) { 76 | if (unlikely(!condition)) { 77 | throw std::runtime_error(throw_str + std::string(s)); 78 | } 79 | } 80 | 81 | /// Check a condition at runtime. If the condition is false, throw exception. 82 | static inline void rt_assert(bool condition, std::string throw_str) { 83 | if (unlikely(!condition)) throw std::runtime_error(throw_str); 84 | } 85 | 86 | /// Check a condition at runtime. If the condition is false, throw exception. 87 | /// This is faster than rt_assert(cond, str) as it avoids string construction. 
88 | static inline void rt_assert(bool condition) { 89 | if (unlikely(!condition)) throw std::runtime_error("Error"); 90 | } 91 | 92 | /// Return the TSC 93 | static inline size_t rdtsc() { 94 | uint64_t rax; 95 | uint64_t rdx; 96 | asm volatile("rdtsc" : "=a"(rax), "=d"(rdx)); 97 | return static_cast((rdx << 32) | rax); 98 | } 99 | 100 | static uint64_t rdtscp() { 101 | uint64_t rax; 102 | uint64_t rdx; 103 | uint32_t aux; 104 | asm volatile("rdtscp" : "=a"(rax), "=d"(rdx), "=c"(aux) : :); 105 | return (rdx << 32) | rax; 106 | } 107 | 108 | static void nano_sleep(size_t ns, double freq_ghz) { 109 | size_t start = rdtsc(); 110 | size_t end = start; 111 | size_t upp = static_cast(freq_ghz * ns); 112 | while (end - start < upp) end = rdtsc(); 113 | } 114 | 115 | static double measure_rdtsc_freq() { 116 | struct timespec start, end; 117 | clock_gettime(CLOCK_REALTIME, &start); 118 | uint64_t rdtsc_start = rdtsc(); 119 | 120 | // Do not change this loop! The hardcoded value below depends on this loop 121 | // and prevents it from being optimized out. 
122 | uint64_t sum = 5; 123 | for (uint64_t i = 0; i < 1000000; i++) { 124 | sum += i + (sum + i) * (i % sum); 125 | } 126 | rt_assert(sum == 13580802877818827968ull, "Error in RDTSC freq measurement"); 127 | 128 | clock_gettime(CLOCK_REALTIME, &end); 129 | uint64_t clock_ns = 130 | static_cast(end.tv_sec - start.tv_sec) * 1000000000 + 131 | static_cast(end.tv_nsec - start.tv_nsec); 132 | uint64_t rdtsc_cycles = rdtsc() - rdtsc_start; 133 | 134 | double _freq_ghz = rdtsc_cycles * 1.0 / clock_ns; 135 | rt_assert(_freq_ghz >= 0.5 && _freq_ghz <= 5.0, "Invalid RDTSC frequency"); 136 | 137 | return _freq_ghz; 138 | } 139 | 140 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to seconds 141 | static double to_sec(size_t cycles, double freq_ghz) { 142 | return (cycles / (freq_ghz * 1000000000)); 143 | } 144 | 145 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to msec 146 | static double to_msec(size_t cycles, double freq_ghz) { 147 | return (cycles / (freq_ghz * 1000000)); 148 | } 149 | 150 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to usec 151 | static double to_usec(size_t cycles, double freq_ghz) { 152 | return (cycles / (freq_ghz * 1000)); 153 | } 154 | 155 | static size_t ms_to_cycles(double ms, double freq_ghz) { 156 | return static_cast(ms * 1000 * 1000 * freq_ghz); 157 | } 158 | 159 | static size_t us_to_cycles(double us, double freq_ghz) { 160 | return static_cast(us * 1000 * freq_ghz); 161 | } 162 | 163 | static size_t ns_to_cycles(double ns, double freq_ghz) { 164 | return static_cast(ns * freq_ghz); 165 | } 166 | 167 | // Edit 168 | /// Convert cycles measured by rdtsc with frequence \p freq_ghz to nsec 169 | static double to_nsec(size_t cycles, double freq_ghz) { 170 | return (cycles / freq_ghz); 171 | } 172 | 173 | /// Return seconds elapsed since timestamp \p t0 174 | static double sec_since(const struct timespec &t0) { 175 | struct timespec t1; 176 | clock_gettime(CLOCK_REALTIME, &t1); 177 | 
return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1000000000.0; 178 | } 179 | 180 | /// Return nanoseconds elapsed since timestamp \p t0 181 | static double ns_since(const struct timespec &t0) { 182 | struct timespec t1; 183 | clock_gettime(CLOCK_REALTIME, &t1); 184 | return (t1.tv_sec - t0.tv_sec) * 1000000000.0 + (t1.tv_nsec - t0.tv_nsec); 185 | } 186 | 187 | /// Return the number of logical cores per NUMA node 188 | static size_t num_lcores_per_numa_node() { 189 | return static_cast(numa_num_configured_cpus() / 190 | numa_num_configured_nodes()); 191 | } 192 | 193 | /// Return a list of logical cores in \p numa_node 194 | static std::vector get_lcores_for_numa_node(size_t numa_node) { 195 | rt_assert(numa_node <= static_cast(numa_max_node())); 196 | 197 | std::vector ret; 198 | size_t num_lcores = static_cast(numa_num_configured_cpus()); 199 | 200 | for (size_t i = 0; i < num_lcores; i++) { 201 | if (numa_node == static_cast(numa_node_of_cpu(i))) { 202 | ret.push_back(i); 203 | } 204 | } 205 | 206 | return ret; 207 | } 208 | 209 | /// Bind \p thread to core with index \p numa_local_index on \p numa_node 210 | static void bind_to_core(std::thread &thread, size_t numa_node, 211 | size_t numa_local_index) { 212 | cpu_set_t cpuset; 213 | CPU_ZERO(&cpuset); 214 | 215 | auto lcore_vec = get_lcores_for_numa_node(numa_node); 216 | size_t global_index = lcore_vec.at(numa_local_index); 217 | 218 | CPU_SET(global_index, &cpuset); 219 | int rc = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), 220 | &cpuset); 221 | rt_assert(rc == 0, "Error setting thread affinity"); 222 | } 223 | 224 | /// Compute the standard deviation of a vector 225 | static double stddev(std::vector v) { 226 | if (unlikely(v.empty())) return 0; 227 | double sum = std::accumulate(v.begin(), v.end(), 0.0); 228 | double mean = sum / v.size(); 229 | double sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), 0.0); 230 | double var = sq_sum / v.size() - (mean * mean); 231 | 
if (unlikely(var < 0)) return 0.0; // This can happen when var ~ 0 232 | 233 | return std::sqrt(var); 234 | } 235 | -------------------------------------------------------------------------------- /rdma/rdma-write-flush-lat/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "../libhrd_cpp/hrd.h" 13 | #include "latency.h" 14 | 15 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 16 | 17 | static constexpr size_t kBufSize = KB(128); // Registered buffer size 18 | static constexpr size_t kMinWriteSize = 64; 19 | static constexpr size_t kMaxWriteSize = 1024; 20 | 21 | // If true, we use a devdax-mapped buffer. If false, we use DRAM hugepages. 22 | static constexpr bool kUsePmem = true; 23 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 24 | 25 | // Number of writes to flush. The (WRITE+READ) combos for all writes are 26 | // issued in one postlist. Only the last READ in the postlist is signaled, so 27 | // kNumWrites cannot be too large. Else we'll run into signaling issues. 
28 | static constexpr size_t kNumWritesToFlush = 1; 29 | 30 | // If true, we issue only one signaled write and no reads 31 | static constexpr bool kJustAWrite = true; 32 | 33 | uint8_t* get_pmem_buf() { 34 | int fd = open(kPmemFile, O_RDWR); 35 | rt_assert(fd >= 0, "devdax open failed"); 36 | 37 | size_t pmem_size = round_up(kBufSize); // Smaller sizes may fail 38 | void* buf = 39 | mmap(nullptr, pmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 40 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 41 | rt_assert(reinterpret_cast(buf) % 256 == 0); 42 | memset(buf, 0, pmem_size); 43 | 44 | return reinterpret_cast(buf); 45 | } 46 | 47 | void run_server() { 48 | uint8_t* pmem_buf = nullptr; 49 | if (kUsePmem) pmem_buf = get_pmem_buf(); 50 | 51 | struct hrd_conn_config_t conn_config; 52 | conn_config.num_qps = 1; 53 | conn_config.use_uc = false; 54 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 55 | conn_config.buf_size = kBufSize; 56 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 57 | 58 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 59 | &conn_config, nullptr /* dgram config */); 60 | memset(const_cast(cb->conn_buf), 0, kBufSize); 61 | 62 | hrd_publish_conn_qp(cb, 0, "server"); 63 | printf("main: Server published. Waiting for client.\n"); 64 | 65 | hrd_qp_attr_t* clt_qp = nullptr; 66 | while (clt_qp == nullptr) { 67 | clt_qp = hrd_get_published_qp("client"); 68 | if (clt_qp == nullptr) usleep(200000); 69 | } 70 | 71 | printf("main: Server %s found client! Connecting..\n", "server"); 72 | hrd_connect_qp(cb, 0, clt_qp); 73 | hrd_publish_ready("server"); 74 | printf("main: Server ready. 
Going to sleep.\n"); 75 | 76 | while (true) sleep(1); 77 | } 78 | 79 | /// Get a random offset in the registered buffer with at least \p msg_size room 80 | size_t get_256_aligned_random_offset(pcg64_fast& pcg, size_t msg_size) { 81 | size_t iters = 0; 82 | while (true) { 83 | size_t rand_offset = (pcg() % kBufSize); 84 | if (likely(kBufSize - rand_offset > msg_size)) return rand_offset; 85 | iters++; 86 | if (unlikely(iters > 10)) printf("Random offset took over 10 iters\n"); 87 | } 88 | } 89 | 90 | void run_client() { 91 | Latency latency; 92 | hrd_conn_config_t conn_config; 93 | conn_config.num_qps = 1; 94 | conn_config.use_uc = false; 95 | conn_config.prealloc_buf = nullptr; 96 | conn_config.buf_size = kBufSize; 97 | conn_config.buf_shm_key = 3185; 98 | 99 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 100 | &conn_config, nullptr /* dgram config */); 101 | memset(const_cast(cb->conn_buf), 31, kBufSize); 102 | 103 | hrd_publish_conn_qp(cb, 0, "client"); 104 | printf("main: Client published. Waiting for server.\n"); 105 | 106 | hrd_qp_attr_t* srv_qp = nullptr; 107 | while (srv_qp == nullptr) { 108 | srv_qp = hrd_get_published_qp("server"); 109 | if (srv_qp == nullptr) usleep(2000); 110 | } 111 | 112 | printf("main: Client found server. Connecting..\n"); 113 | hrd_connect_qp(cb, 0, srv_qp); 114 | printf("main: Client connected!\n"); 115 | 116 | hrd_wait_till_ready("server"); 117 | 118 | // The +1s are for simpler postlist chain pointer math 119 | static constexpr size_t kArrSz = kNumWritesToFlush + 1; 120 | struct ibv_send_wr write_wr[kArrSz], read_wr[kArrSz]; 121 | struct ibv_send_wr* bad_send_wr; 122 | struct ibv_sge write_sge[kArrSz], read_sge[kArrSz]; 123 | struct ibv_wc wc; 124 | 125 | size_t write_size = kMinWriteSize; // Increases by powers of two 126 | size_t num_iters = 0; 127 | 128 | // Remote memory is divided into write_size chunks. The RDMA writes use these 129 | // chunks in order. 
130 | size_t write_chunk_idx = 0; 131 | 132 | // pcg64_fast pcg(pcg_extras::seed_seq_from{}); 133 | 134 | printf("#write_size median_us 5th_us 99th_us 999th_us\n"); // Stats header 135 | while (true) { 136 | if (num_iters == KB(256)) { 137 | printf("%zu %.1f %.1f %.1f %.1f\n", write_size, latency.perc(.50) / 10.0, 138 | latency.perc(.05) / 10.0, latency.perc(.99) / 10.0, 139 | latency.perc(.999) / 10.0); 140 | latency.reset(); 141 | 142 | write_size *= 2; 143 | if (write_size > kMaxWriteSize) write_size = kMinWriteSize; 144 | 145 | num_iters = 0; 146 | write_chunk_idx = 0; 147 | } 148 | 149 | struct timespec start; 150 | clock_gettime(CLOCK_REALTIME, &start); 151 | 152 | // Enter the loop below with room for at least (kNumWritesToFlush + 1) 153 | // chunks. We don't use the last chunk because we read from there. 154 | if (write_chunk_idx + 1 >= 155 | (kBufSize / write_size) - kNumWritesToFlush - 1) { 156 | write_chunk_idx = 0; 157 | } 158 | 159 | // WRITE 160 | for (size_t i = 0; i < kNumWritesToFlush; i++) { 161 | const size_t remote_offset = write_chunk_idx * write_size; 162 | write_chunk_idx++; 163 | 164 | write_sge[i].addr = 165 | reinterpret_cast(&cb->conn_buf[i * write_size]); 166 | write_sge[i].length = write_size; 167 | write_sge[i].lkey = cb->conn_buf_mr->lkey; 168 | 169 | write_wr[i].opcode = IBV_WR_RDMA_WRITE; 170 | write_wr[i].num_sge = 1; 171 | write_wr[i].sg_list = &write_sge[i]; 172 | write_wr[i].send_flags = 0 /* unsignaled */; 173 | if (write_size <= kHrdMaxInline) { 174 | write_wr[i].send_flags |= IBV_SEND_INLINE; 175 | } 176 | 177 | write_wr[i].wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 178 | write_wr[i].wr.rdma.rkey = srv_qp->rkey; 179 | 180 | // READ. We can read from any address. 
181 | read_sge[i].addr = 182 | reinterpret_cast(&cb->conn_buf[kBufSize - sizeof(size_t)]); 183 | read_sge[i].length = sizeof(size_t); // Indepenent of write size 184 | read_sge[i].lkey = cb->conn_buf_mr->lkey; 185 | 186 | read_wr[i].opcode = IBV_WR_RDMA_READ; 187 | read_wr[i].num_sge = 1; 188 | read_wr[i].sg_list = &read_sge[i]; 189 | read_wr[i].send_flags = 0; // Unsignaled. The last read is signaled. 190 | read_wr[i].wr.rdma.remote_addr = 191 | srv_qp->buf_addr + kBufSize - sizeof(size_t); 192 | read_wr[i].wr.rdma.rkey = srv_qp->rkey; 193 | 194 | // Make a chain 195 | write_wr[i].next = &read_wr[i]; 196 | read_wr[i].next = &write_wr[i + 1]; 197 | } 198 | 199 | if (!kJustAWrite) { 200 | read_wr[kNumWritesToFlush - 1].send_flags = IBV_SEND_SIGNALED; 201 | read_wr[kNumWritesToFlush - 1].next = nullptr; 202 | } else { 203 | write_wr[0].send_flags |= IBV_SEND_SIGNALED; 204 | write_wr[0].next = nullptr; 205 | } 206 | 207 | int ret = ibv_post_send(cb->conn_qp[0], &write_wr[0], &bad_send_wr); 208 | rt_assert(ret == 0); 209 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); // Block till the RDMA read completes 210 | num_iters++; 211 | 212 | double us = ns_since(start) / 1000.0; 213 | latency.update(us * 10); 214 | } 215 | } 216 | 217 | int main(int argc, char* argv[]) { 218 | gflags::ParseCommandLineFlags(&argc, &argv, true); 219 | FLAGS_is_client == 1 ? 
run_client() : run_server(); 220 | return 0; 221 | } 222 | -------------------------------------------------------------------------------- /microbench/bench.cc: -------------------------------------------------------------------------------- 1 | #include "bench.h" 2 | 3 | #include "rand_read_latency.h" 4 | #include "rand_read_tput.h" 5 | #include "rand_write_latency.h" 6 | #include "rand_write_tput.h" 7 | #include "seq_read_tput.h" 8 | #include "seq_write_latency.h" 9 | #include "seq_write_tput.h" 10 | 11 | // Return true if kPmemFile is in devdax mode 12 | static bool is_pmem_file_devdax() { 13 | if (std::string(kPmemFile).find("dax") != std::string::npos) return true; 14 | return false; 15 | } 16 | 17 | // Write to the whole buffer to "map it in", whatever that means 18 | void map_in_buffer_whole(uint8_t *pbuf) { 19 | printf("Writing to the whole file for map-in...\n"); 20 | const size_t chunk_sz = GB(16); 21 | rt_assert(kPmemFileSize % chunk_sz == 0, "Invalid chunk size for map-in"); 22 | 23 | for (size_t i = 0; i < kPmemFileSize; i += chunk_sz) { 24 | struct timespec start; 25 | clock_gettime(CLOCK_REALTIME, &start); 26 | pmem_memset_persist(&pbuf[i], 3185, chunk_sz); // nodrain performs similar 27 | printf("Fraction complete = %.2f. Took %.3f sec for %zu GB.\n", 28 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), chunk_sz / GB(1)); 29 | } 30 | 31 | printf("Done writing.\n"); 32 | } 33 | 34 | // Write to a byte in each page of the buffer, to map the pages in 35 | void map_in_buffer_by_page(uint8_t *pbuf) { 36 | printf("Mapping-in file pages.\n"); 37 | struct timespec start; 38 | clock_gettime(CLOCK_REALTIME, &start); 39 | 40 | for (size_t i = 0; i < kPmemFileSize; i += KB(4)) { 41 | pmem_memset_nodrain(&pbuf[i], 3185, 1); 42 | if (i % GB(32) == 0 && i > 0) { 43 | printf("Fraction complete = %.2f. 
Took %.3f sec for %u GB.\n", 44 | (i + 1) * 1.0 / kPmemFileSize, sec_since(start), 32); 45 | clock_gettime(CLOCK_REALTIME, &start); 46 | } 47 | } 48 | 49 | printf("Done mapping-in.\n"); 50 | } 51 | 52 | // Map pmem file in devdax mode 53 | uint8_t *map_pmem_file_devdax() { 54 | int fd = open(kPmemFile, O_RDWR); 55 | rt_assert(fd >= 0, "devdax open failed"); 56 | rt_assert(kPmemFileSize % MB(2) == 0, "File size must be multiple of 2 MB"); 57 | 58 | void *buf = 59 | mmap(nullptr, kPmemFileSize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 60 | rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 61 | rt_assert(reinterpret_cast(buf) % 256 == 0); 62 | 63 | return reinterpret_cast(buf); 64 | } 65 | 66 | // Map pmem file in fsdax mode 67 | uint8_t *map_pmem_file_fsdax() { 68 | uint8_t *pbuf; 69 | size_t mapped_len; 70 | int is_pmem; 71 | 72 | pbuf = reinterpret_cast(pmem_map_file( 73 | kPmemFile, 0 /* length */, 0 /* flags */, 0666, &mapped_len, &is_pmem)); 74 | 75 | rt_assert(pbuf != nullptr, 76 | "pmem_map_file() failed. " + std::string(strerror(errno))); 77 | rt_assert(mapped_len >= kPmemFileSize, 78 | "pmem file too small " + std::to_string(mapped_len)); 79 | rt_assert(reinterpret_cast(pbuf) % 4096 == 0, 80 | "Mapped buffer isn't page-aligned"); 81 | rt_assert(is_pmem == 1, "File is not pmem"); 82 | printf("Mapped file of length %.2f GB\n", mapped_len * 1.0 / GB(1)); 83 | 84 | return pbuf; 85 | } 86 | 87 | int main(int argc, char **argv) { 88 | gflags::ParseCommandLineFlags(&argc, &argv, true); 89 | uint8_t *pbuf; 90 | 91 | freq_ghz = measure_rdtsc_freq(); 92 | printf("RDTSC frequency = %.2f GHz\n", freq_ghz); 93 | 94 | pbuf = is_pmem_file_devdax() ? 
map_pmem_file_devdax() : map_pmem_file_fsdax(); 95 | 96 | // Print some random file samples to check it's full of random contents 97 | printf("File contents sample: "); 98 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 99 | for (size_t i = 0; i < 10; i++) { 100 | printf("%zu ", *reinterpret_cast(&pbuf[pcg() % kPmemFileSize])); 101 | } 102 | printf("\n"); 103 | 104 | // map_in_buffer_by_page(pbuf); 105 | // map_in_buffer_whole(pbuf); 106 | 107 | std::string bench_func; // Last one wins 108 | bench_func = "bench_seq_read_latency"; 109 | bench_func = "bench_rand_write_latency"; 110 | bench_func = "bench_rand_read_tput"; 111 | bench_func = "bench_seq_write_tput"; 112 | bench_func = "bench_seq_write_latency"; 113 | bench_func = "bench_rand_read_latency"; 114 | bench_func = "bench_seq_read_tput"; 115 | bench_func = "bench_rand_write_tput"; 116 | bench_func = "bench_seq_write_tput"; 117 | 118 | // Sequential write throughput 119 | if (bench_func == "bench_seq_write_tput") { 120 | printf("Sequential write throughput. 
%zu threads\n", FLAGS_num_threads); 121 | std::ostringstream dat_header; 122 | std::ostringstream dat_data; 123 | dat_header << "Threads "; 124 | dat_data << std::to_string(FLAGS_num_threads) << " "; 125 | 126 | for (size_t copy_sz = MB(2); copy_sz <= GB(1); copy_sz *= 2) { 127 | dat_header << std::to_string(copy_sz) << " "; 128 | std::vector avg_tput_GBps(FLAGS_num_threads); 129 | 130 | std::vector threads(FLAGS_num_threads); 131 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 132 | threads[i] = std::thread(bench_seq_write_tput, pbuf, i, copy_sz, 133 | &avg_tput_GBps[i]); 134 | bind_to_core(threads[i], kNumaNode, i); 135 | } 136 | for (auto &t : threads) t.join(); 137 | 138 | double total_tput = 0.0; 139 | for (size_t i = 0; i < FLAGS_num_threads; i++) 140 | total_tput += avg_tput_GBps[i]; 141 | dat_data << std::setprecision(2) << total_tput << " "; 142 | } 143 | 144 | printf("%s\n", dat_header.str().c_str()); 145 | printf("%s\n", dat_data.str().c_str()); 146 | } 147 | 148 | // Sequential write latency 149 | if (bench_func == "bench_seq_write_latency") { 150 | printf("Sequential write latency. One thread only!\n"); 151 | bench_seq_write_latency(pbuf); 152 | } 153 | 154 | // Random write latency 155 | if (bench_func == "bench_rand_write_latency") { 156 | printf("Random write latency. One thread only!\n"); 157 | bench_rand_write_latency(pbuf); 158 | } 159 | 160 | // Random read latency 161 | if (bench_func == "bench_rand_read_latency") { 162 | printf("Random read latency. 
One thread only!\n"); 163 | bench_rand_read_latency(pbuf); 164 | } 165 | 166 | // Random write tput 167 | if (bench_func == "bench_rand_write_tput") { 168 | std::vector thread_count = {1}; 169 | std::vector copy_sz_vec = {256}; 170 | 171 | for (size_t copy_sz : copy_sz_vec) { 172 | for (size_t num_threads : thread_count) { 173 | printf("Rand write tput with %zu threads, copy_sz %zu\n", num_threads, 174 | copy_sz); 175 | std::vector threads(num_threads); 176 | 177 | for (size_t i = 0; i < num_threads; i++) { 178 | threads[i] = 179 | std::thread(bench_rand_write_tput, pbuf, i, copy_sz, num_threads); 180 | } 181 | 182 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 183 | } 184 | } 185 | } 186 | 187 | // Random read throughput 188 | if (bench_func == "bench_rand_read_tput") { 189 | std::vector thread_count = {1, 2, 4, 8, 16, 24, 48}; 190 | std::vector copy_sz_vec = {64, 256, 512, 1024}; 191 | 192 | for (size_t copy_sz : copy_sz_vec) { 193 | for (size_t num_threads : thread_count) { 194 | printf("Rand read tput with %zu threads, copy_sz %zu\n", num_threads, 195 | copy_sz); 196 | std::vector threads(num_threads); 197 | 198 | for (size_t i = 0; i < num_threads; i++) { 199 | threads[i] = 200 | std::thread(bench_rand_read_tput, pbuf, i, copy_sz, num_threads); 201 | } 202 | 203 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 204 | } 205 | } 206 | } 207 | 208 | // Sequential read throughput 209 | if (bench_func == "bench_seq_read_tput") { 210 | std::vector thread_count = {1, 2, 4, 8, 16, 24, 48}; 211 | 212 | for (size_t num_threads : thread_count) { 213 | printf("Seq read tput with %zu threads\n", num_threads); 214 | std::vector threads(num_threads); 215 | 216 | for (size_t i = 0; i < num_threads; i++) { 217 | threads[i] = std::thread(bench_seq_read_tput, pbuf, i, num_threads); 218 | bind_to_core(threads[i], kNumaNode, i); 219 | } 220 | 221 | for (size_t i = 0; i < num_threads; i++) threads[i].join(); 222 | } 223 | } 224 | 225 | 
is_pmem_file_devdax() ? munmap(pbuf, kPmemFileSize) 226 | : pmem_unmap(pbuf, kPmemFileSize); 227 | exit(0); 228 | } 229 | -------------------------------------------------------------------------------- /rdma/rw-tput-receiver/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "../libhrd_cpp/hrd.h" 11 | 12 | static constexpr size_t kServerBufSize = GB(8); 13 | static constexpr size_t kAppMaxPostlist = 64; 14 | static constexpr size_t kAppUnsigBatch = 64; 15 | static constexpr size_t kBaseSHMKey = 3185; 16 | 17 | // If true, we use a devdax-mapped buffer. If false, we use DRAM hugepages. 18 | static constexpr bool kUsePmem = true; 19 | static constexpr const char* kPmemFile = "/dev/dax0.0"; 20 | 21 | // If true, server zeroes out its buffer and reports write throughput 22 | static constexpr bool kZeroServerBuf = false; 23 | 24 | DEFINE_uint64(num_client_processes, 1, "Number of client processes"); 25 | DEFINE_uint64(num_threads_per_client, 1, "Threads per client process"); 26 | DEFINE_uint64(is_client, 0, "Is this process a client?"); 27 | DEFINE_uint64(use_uc, 0, "Use unreliable connected transport?"); 28 | DEFINE_uint64(do_read, 0, "Do RDMA reads?"); 29 | DEFINE_uint64(machine_id, 0, "Zero-based ID of this client machine"); 30 | DEFINE_uint64(size, 0, "RDMA size"); 31 | DEFINE_uint64(postlist, 0, "Postlist size"); 32 | 33 | // Parameters for a client thread 34 | struct clt_thread_params_t { 35 | size_t global_thread_id; 36 | double* tput; 37 | }; 38 | 39 | // Map the devdax buffer at the server 40 | uint8_t* get_pmem_buf_server() { 41 | int fd = open(kPmemFile, O_RDWR); 42 | rt_assert(fd >= 0, "devdax open failed"); 43 | 44 | size_t pmem_size = roundup(kServerBufSize); // Smaller sizes may fail 45 | void* buf = 46 | mmap(nullptr, pmem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 47 | 
rt_assert(buf != MAP_FAILED, "mmap failed for devdax"); 48 | rt_assert(reinterpret_cast(buf) % 256 == 0); 49 | 50 | return reinterpret_cast(buf); 51 | } 52 | 53 | void run_server() { 54 | size_t num_client_connections = 55 | FLAGS_num_client_processes * FLAGS_num_threads_per_client; 56 | 57 | uint8_t* pmem_buf = nullptr; 58 | if (kUsePmem) { 59 | pmem_buf = get_pmem_buf_server(); 60 | 61 | // Fill in the persistent buffer, also sanity-check local write throughput 62 | if (kZeroServerBuf) { 63 | printf("main: Zero-ing pmem buffer\n"); 64 | struct timespec start; 65 | clock_gettime(CLOCK_REALTIME, &start); 66 | pmem_memset_persist(pmem_buf, 0, kServerBufSize); 67 | printf("main: Zero-ed %f MB of pmem at %.1f GB/s\n", 68 | kServerBufSize * 1.0 / MB(1), 69 | kServerBufSize / (1000000000.0 * sec_since(start))); 70 | } 71 | } 72 | 73 | struct hrd_conn_config_t conn_config; 74 | conn_config.num_qps = num_client_connections; 75 | conn_config.use_uc = (FLAGS_use_uc == 1); 76 | conn_config.prealloc_buf = kUsePmem ? pmem_buf : nullptr; 77 | conn_config.buf_size = kServerBufSize; 78 | conn_config.buf_shm_key = kUsePmem ? -1 : 3185; 79 | 80 | auto* cb = hrd_ctrl_blk_init(0 /* id */, 0 /* port */, 0 /* numa */, 81 | &conn_config, nullptr /* dgram config */); 82 | 83 | // Publish server QPs. Server i is for global connection ID i 84 | for (size_t i = 0; i < num_client_connections; i++) { 85 | auto srv_qp_name = std::string("server-") + std::to_string(i); 86 | hrd_publish_conn_qp(cb, i, srv_qp_name.c_str()); 87 | } 88 | 89 | for (size_t i = 0; i < num_client_connections; i++) { 90 | auto conn_name = std::string("conn-") + std::to_string(i); 91 | hrd_qp_attr_t* conn_qp = nullptr; 92 | while (conn_qp == nullptr) { 93 | conn_qp = hrd_get_published_qp(conn_name.c_str()); 94 | if (conn_qp == nullptr) { 95 | usleep(200000); 96 | continue; 97 | } 98 | 99 | printf("main: Server found client connection %zu! 
Connecting..\n", i); 100 | hrd_connect_qp(cb, i, conn_qp); 101 | } 102 | } 103 | 104 | hrd_publish_ready("server"); 105 | printf("main: Server ready. Going to sleep.\n"); 106 | 107 | while (true) sleep(1); 108 | } 109 | 110 | void run_client(clt_thread_params_t* params) { 111 | FastRand fast_rand; 112 | size_t clt_lid = params->global_thread_id % FLAGS_num_threads_per_client; 113 | 114 | hrd_conn_config_t conn_config; 115 | conn_config.num_qps = 1; 116 | conn_config.use_uc = (FLAGS_use_uc == 1); 117 | conn_config.prealloc_buf = nullptr; 118 | conn_config.buf_size = FLAGS_size; 119 | conn_config.buf_shm_key = kBaseSHMKey + clt_lid; 120 | 121 | auto* cb = hrd_ctrl_blk_init(params->global_thread_id, 0 /* port */, 122 | 0 /* numa */, &conn_config, nullptr); 123 | 124 | memset(const_cast(cb->conn_buf), 125 | static_cast(params->global_thread_id) + 1, 126 | conn_config.buf_size); 127 | 128 | size_t global_conn_id = params->global_thread_id; 129 | auto conn_name = std::string("conn-") + std::to_string(global_conn_id); 130 | hrd_publish_conn_qp(cb, 0, conn_name.c_str()); 131 | printf("main: Connection %s published. Waiting for server.\n", 132 | conn_name.c_str()); 133 | 134 | auto srv_qp_name = std::string("server-") + std::to_string(global_conn_id); 135 | hrd_qp_attr_t* srv_qp = nullptr; 136 | while (srv_qp == nullptr) { 137 | srv_qp = hrd_get_published_qp(srv_qp_name.c_str()); 138 | if (srv_qp == nullptr) usleep(2000); 139 | } 140 | 141 | rt_assert(srv_qp->buf_addr % FLAGS_size == 0, 142 | "Server buffer address not aligned to RDMA size"); 143 | 144 | printf("main: Found server for connection %s. 
Connecting..\n", 145 | conn_name.c_str()); 146 | hrd_connect_qp(cb, 0, srv_qp); 147 | printf("main: Client connected!\n"); 148 | 149 | hrd_wait_till_ready("server"); 150 | 151 | struct ibv_send_wr wr[kAppMaxPostlist], *bad_send_wr; 152 | struct ibv_sge sgl[kAppMaxPostlist]; 153 | struct ibv_wc wc; 154 | size_t rolling_iter = 0; // For performance measurement 155 | size_t nb_tx = 0; // For selective signaling 156 | int ret; 157 | 158 | struct timespec start, end; 159 | clock_gettime(CLOCK_REALTIME, &start); 160 | 161 | while (true) { 162 | if (rolling_iter >= KB(512)) { 163 | clock_gettime(CLOCK_REALTIME, &end); 164 | double seconds = (end.tv_sec - start.tv_sec) + 165 | (end.tv_nsec - start.tv_nsec) / 1000000000.0; 166 | double tput_mrps = rolling_iter / (seconds * 1000000); 167 | printf("main: Client %zu: %.2f M/s\n", params->global_thread_id, 168 | tput_mrps); 169 | rolling_iter = 0; 170 | 171 | // Per-machine stats 172 | params->tput[clt_lid] = tput_mrps; 173 | if (clt_lid == 0) { 174 | double tot = 0; 175 | for (size_t i = 0; i < FLAGS_num_threads_per_client; i++) 176 | tot += params->tput[i]; 177 | hrd_red_printf("main: Machine: %.2f M/s\n", tot); 178 | } 179 | 180 | clock_gettime(CLOCK_REALTIME, &start); 181 | } 182 | 183 | // Post a batch 184 | for (size_t w_i = 0; w_i < FLAGS_postlist; w_i++) { 185 | wr[w_i].opcode = 186 | FLAGS_do_read == 0 ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ; 187 | wr[w_i].num_sge = 1; 188 | wr[w_i].next = (w_i == FLAGS_postlist - 1) ? nullptr : &wr[w_i + 1]; 189 | wr[w_i].sg_list = &sgl[w_i]; 190 | 191 | wr[w_i].send_flags = nb_tx % kAppUnsigBatch == 0 ? IBV_SEND_SIGNALED : 0; 192 | if (nb_tx % kAppUnsigBatch == 0 && nb_tx > 0) { 193 | hrd_poll_cq(cb->conn_cq[0], 1, &wc); 194 | } 195 | 196 | wr[w_i].send_flags |= FLAGS_do_read == 0 ? 
IBV_SEND_INLINE : 0; 197 | 198 | sgl[w_i].addr = reinterpret_cast(&cb->conn_buf); 199 | sgl[w_i].length = FLAGS_size; 200 | sgl[w_i].lkey = cb->conn_buf_mr->lkey; 201 | 202 | size_t remote_offset = 203 | (fast_rand.next_u32() % (kServerBufSize / FLAGS_size)) * FLAGS_size; 204 | 205 | wr[w_i].wr.rdma.remote_addr = srv_qp->buf_addr + remote_offset; 206 | wr[w_i].wr.rdma.rkey = srv_qp->rkey; 207 | 208 | nb_tx++; 209 | } 210 | 211 | ret = ibv_post_send(cb->conn_qp[0], &wr[0], &bad_send_wr); 212 | rt_assert(ret == 0); 213 | 214 | rolling_iter += FLAGS_postlist; 215 | } 216 | } 217 | 218 | int main(int argc, char* argv[]) { 219 | gflags::ParseCommandLineFlags(&argc, &argv, true); 220 | 221 | if (FLAGS_is_client == 1) { 222 | if (FLAGS_do_read == 0) { 223 | rt_assert(FLAGS_size <= kHrdMaxInline, "Inline size too small"); 224 | } 225 | rt_assert(FLAGS_postlist <= kAppMaxPostlist, "Postlist too large"); 226 | rt_assert(kAppUnsigBatch >= FLAGS_postlist, "Postlist check failed"); 227 | rt_assert(kHrdSQDepth >= 2 * kAppUnsigBatch, "Queue capacity check failed"); 228 | } 229 | 230 | // Launch a single server thread or multiple client threads 231 | 232 | if (FLAGS_is_client == 1) { 233 | std::vector thread_arr(FLAGS_num_threads_per_client); 234 | auto* tput = new double[FLAGS_num_threads_per_client]; 235 | printf("main: Using %zu threads\n", FLAGS_num_threads_per_client); 236 | auto* param_arr = new clt_thread_params_t[FLAGS_num_threads_per_client]; 237 | for (size_t i = 0; i < FLAGS_num_threads_per_client; i++) { 238 | param_arr[i].global_thread_id = 239 | (FLAGS_machine_id * FLAGS_num_threads_per_client) + i; 240 | param_arr[i].tput = tput; 241 | 242 | thread_arr[i] = std::thread(run_client, ¶m_arr[i]); 243 | } 244 | 245 | for (auto& thread : thread_arr) thread.join(); 246 | } else { 247 | auto server_thread = std::thread(run_server); 248 | server_thread.join(); 249 | } 250 | } 251 | -------------------------------------------------------------------------------- 
/ioat/bench.cc: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file bench.cc 4 | * 5 | * @brief Benchmark for IOAT DMA based on DPDK instead of SPDK. The benchmark 6 | * task is to paste small, cached source buffers sequentially into the large 7 | * destination buffer. 8 | * 9 | * Flexibility: use IOAT or memcpy for the copy. Use volatile or persistent 10 | * buffer for the large destination buffer. 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "../common.h" 31 | #include "huge_alloc.h" 32 | #include "virt2phy.h" 33 | 34 | static constexpr size_t kIoatDevID = 0; 35 | static constexpr size_t kIoatDoFence = 0; 36 | static constexpr size_t kIoatRingSize = 512; 37 | 38 | static constexpr size_t kDstBufferSize = GB(32); 39 | static constexpr bool kCheckCopyResults = true; 40 | 41 | static constexpr const char *kPmemFile = "/dev/dax0.0"; 42 | 43 | DEFINE_uint64(num_prints, 3, "Number of measurements printed before exit"); 44 | DEFINE_uint64(size, KB(128), "Size of each copy"); 45 | DEFINE_uint64(window_size, 8, "Number of outstanding transfers"); 46 | DEFINE_uint64(numa_node, 0, "NUMA node for experiment"); 47 | DEFINE_uint64(use_ioat, 1, "Use IOAT DMA engines, else memcpy"); 48 | DEFINE_uint64(use_pmem, 1, "Use persistent memory for destination buffer"); 49 | 50 | // Initialize and start device 0 51 | void setup_ioat_device() { 52 | struct rte_rawdev_info info; 53 | info.dev_private = NULL; 54 | 55 | rt_assert(rte_rawdev_info_get(kIoatDevID, &info) == 0); 56 | rt_assert(std::string(info.driver_name).find("ioat") != std::string::npos); 57 | 58 | struct rte_ioat_rawdev_config p; 59 | memset(&info, 0, sizeof(info)); 60 | info.dev_private = &p; 61 | 62 | rte_rawdev_info_get(kIoatDevID, &info); 63 | 
rt_assert(p.ring_size == 0, "Initial ring size is non-zero"); 64 | 65 | p.ring_size = kIoatRingSize; 66 | rt_assert(rte_rawdev_configure(kIoatDevID, &info) == 0, 67 | "rte_rawdev_configure failed"); 68 | 69 | rte_rawdev_info_get(kIoatDevID, &info); 70 | rt_assert(p.ring_size == kIoatRingSize, "Wrong ring size"); 71 | 72 | rt_assert(rte_rawdev_start(kIoatDevID) == 0, "Rawdev start failed"); 73 | 74 | printf("Started device %zu\n", kIoatDevID); 75 | } 76 | 77 | void poll_one() { 78 | while (true) { 79 | uintptr_t _src, _dst; 80 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 81 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 82 | 83 | if (ret > 0) break; 84 | } 85 | } 86 | 87 | int main(int argc, char **argv) { 88 | if (getuid() != 0) { 89 | // Mapping devdax files needs root perms for now 90 | printf("You need to be root to run this benchmark\n"); 91 | exit(-1); 92 | } 93 | 94 | gflags::ParseCommandLineFlags(&argc, &argv, true); 95 | 96 | auto hugepage_caching_v2p = new HugepageCachingVirt2Phy(); 97 | double freq_ghz = measure_rdtsc_freq(); 98 | 99 | rt_assert(FLAGS_size <= KB(128), 100 | "Copy size must be small to reduce the likelihood of " 101 | "straddling 2 hugepages"); 102 | 103 | rt_assert(kDstBufferSize / FLAGS_size > 2 * FLAGS_window_size, 104 | "Copy size too large, pipelined copies might overlap"); 105 | 106 | // Init DPDK 107 | const char *rte_argv[] = {"-c", "1", "-n", "4", "--log-level", 108 | "5", "-m", "128", NULL}; 109 | 110 | int rte_argc = sizeof(rte_argv) / sizeof(rte_argv[0]) - 1; 111 | int ret = rte_eal_init(rte_argc, const_cast(rte_argv)); 112 | rt_assert(ret >= 0, "rte_eal_init failed"); 113 | 114 | if (FLAGS_use_ioat == 1) { 115 | size_t count = rte_rawdev_count(); 116 | printf("Fount %zu rawdev devices\n", count); 117 | rt_assert(count >= 1, "No rawdev devices available"); 118 | 119 | setup_ioat_device(); 120 | } 121 | 122 | // Create source and destination buffers 123 | auto huge_alloc = new 
hugealloc::HugeAlloc(MB(512), FLAGS_numa_node); 124 | std::vector src_bufs(FLAGS_window_size); 125 | for (size_t i = 0; i < FLAGS_window_size; i++) { 126 | src_bufs[i] = huge_alloc->alloc(FLAGS_size); 127 | rt_assert(src_bufs[i].buf != nullptr); 128 | 129 | memset(src_bufs[i].buf, i + 1, FLAGS_size); // Page-in 130 | } 131 | 132 | printf("Allocating %zu GB destination buffer...", kDstBufferSize / GB(1)); 133 | uint8_t *dst_buf = nullptr; 134 | 135 | if (FLAGS_use_pmem == 1) { 136 | // Map pmem buffer 137 | size_t mapped_len; 138 | int is_pmem; 139 | 140 | dst_buf = reinterpret_cast( 141 | pmem_map_file(kPmemFile, 0, 0, 0666, &mapped_len, &is_pmem)); 142 | 143 | rt_assert(dst_buf != nullptr); 144 | rt_assert(mapped_len >= kDstBufferSize); 145 | rt_assert(is_pmem == 1); 146 | 147 | } else { 148 | hugealloc::Buffer _dst_buf = huge_alloc->alloc_raw(kDstBufferSize); 149 | rt_assert(_dst_buf.buf != nullptr); 150 | rt_assert(reinterpret_cast(_dst_buf.buf) % MB(2) == 0); 151 | dst_buf = _dst_buf.buf; 152 | } 153 | 154 | for (size_t i = 0; i < kDstBufferSize; i += MB(2)) dst_buf[i] = i; // Page-in 155 | printf("done!\n"); 156 | 157 | // Start test 158 | printf("Flags: size %zu, window size %zu, use_ioat %zu, use_pmem %zu\n", 159 | FLAGS_size, FLAGS_window_size, FLAGS_use_ioat, FLAGS_use_pmem); 160 | 161 | size_t num_printed = 0; // Number of times we printed stats 162 | size_t num_completed_copies = 0; 163 | 164 | size_t src_bufs_i = 0; // Index among the source buffers for the next job 165 | size_t dst_buf_offset = 0; // Offset in the destination buffer 166 | 167 | size_t ioat_outstanding_jobs = 0; 168 | size_t timer_start = rdtsc(); 169 | FastRand fast_rand; 170 | 171 | while (true) { 172 | if (FLAGS_use_ioat == 1) { 173 | if (dst_buf_offset / MB(2) != (dst_buf_offset + FLAGS_size) / MB(2)) { 174 | // The copy operating will straddle two hugepages 175 | dst_buf_offset += FLAGS_size; 176 | continue; // Go back 177 | } 178 | 179 | uint8_t *dst_buf_ptr = 
&dst_buf[dst_buf_offset]; 180 | uint64_t dst_phys_addr = hugepage_caching_v2p->translate(dst_buf_ptr); 181 | 182 | uint8_t *src_buf_ptr = src_bufs[src_bufs_i].buf; 183 | uint64_t src_phys_addr = hugepage_caching_v2p->translate(src_buf_ptr); 184 | 185 | // Pass zeroes as callback args, we don't need them for now 186 | int ret = rte_ioat_enqueue_copy( 187 | kIoatDevID, src_phys_addr, dst_phys_addr, FLAGS_size, 188 | reinterpret_cast(src_buf_ptr), 189 | reinterpret_cast(dst_buf_ptr), kIoatDoFence); 190 | 191 | rt_assert(ret == 1, "Error with rte_ioat_enqueue_copy"); 192 | rte_ioat_do_copies(kIoatDevID); 193 | 194 | ioat_outstanding_jobs++; 195 | rt_assert(ioat_outstanding_jobs <= kIoatRingSize); 196 | 197 | if (ioat_outstanding_jobs == FLAGS_window_size) { 198 | // Poll for a completed copy 199 | while (true) { 200 | uintptr_t _src = 0, _dst = 0; 201 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 202 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 203 | 204 | if (ret == 1 && kCheckCopyResults) { 205 | // Check at a random offset 206 | size_t offset = fast_rand.next_u32() % FLAGS_size; 207 | uint8_t src_val = reinterpret_cast(_src)[offset]; 208 | uint8_t dst_val = reinterpret_cast(_dst)[offset]; 209 | if (unlikely(src_val != dst_val)) { 210 | fprintf(stderr, "Mismatch\n"); 211 | } 212 | } 213 | 214 | num_completed_copies += static_cast(ret); 215 | ioat_outstanding_jobs -= static_cast(ret); 216 | if (ret > 0) break; 217 | } 218 | } 219 | } else { // Use memcpy 220 | if (FLAGS_use_pmem == 0) { 221 | rte_memcpy(&dst_buf[dst_buf_offset], src_bufs[src_bufs_i].buf, 222 | FLAGS_size); 223 | } else { 224 | pmem_memcpy_persist(&dst_buf[dst_buf_offset], src_bufs[src_bufs_i].buf, 225 | FLAGS_size); 226 | } 227 | num_completed_copies++; 228 | } 229 | 230 | // If we're here, we did/enqueued a copy. Bump src and dst buffers. 
231 | src_bufs_i++; 232 | if (src_bufs_i == FLAGS_window_size) src_bufs_i = 0; 233 | 234 | dst_buf_offset += FLAGS_size; 235 | if (dst_buf_offset + FLAGS_size >= kDstBufferSize) { 236 | dst_buf_offset = 0; 237 | 238 | double ns_total = to_nsec(rdtsc() - timer_start, freq_ghz); 239 | printf("%.2f GB/s\n", num_completed_copies * FLAGS_size / ns_total); 240 | 241 | num_completed_copies = 0; 242 | num_printed++; 243 | timer_start = rdtsc(); 244 | } 245 | 246 | if (num_printed == FLAGS_num_prints) break; 247 | } 248 | 249 | // With IOAT, wait for outstanding copies before deleting hugepages 250 | printf("Waiting for outstanding copies to finish\n"); 251 | while (FLAGS_use_ioat == 1 && ioat_outstanding_jobs > 0) { 252 | uintptr_t _src, _dst; 253 | int ret = rte_ioat_completed_copies(kIoatDevID, 1u, &_src, &_dst); 254 | rt_assert(ret >= 0, "rte_ioat_completed_copies error"); 255 | ioat_outstanding_jobs -= static_cast(ret); 256 | } 257 | 258 | delete huge_alloc; 259 | } 260 | -------------------------------------------------------------------------------- /hopscotch_pmem/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../common.h" 7 | #include "phopscotch.h" 8 | 9 | #define table phopscotch 10 | 11 | DEFINE_string(pmem_file, "/dev/dax12.0", "Persistent memory file name"); 12 | DEFINE_uint64(table_key_capacity, MB(1), "Number of keys in table per thread"); 13 | DEFINE_uint64(batch_size, table::kMaxBatchSize, "Batch size"); 14 | DEFINE_string(benchmark, "get", "Benchmark to run"); 15 | DEFINE_uint64(num_threads, 1, "Number of threads"); 16 | DEFINE_uint64(sweep_optimizations, 0, "Sweep optimizations"); 17 | 18 | // 19 | // Overhead to occupancy map: 20 | // 0.05 -> 0.56 21 | static constexpr double kDefaultOverhead = 0.05; 22 | static constexpr double kNumaNode = 0; 23 | 24 | class Key { 25 | public: 26 | size_t key_frag[2]; 27 | bool operator==(const Key 
// One-shot rendezvous used so worker threads start the timed phase together.
// https://stackoverflow.com/questions/24465533/implementing-boostbarrier-in-c11
class Barrier {
 public:
  /// Construct a barrier that releases once \p count threads call wait()
  explicit Barrier(std::size_t count) : count{count} {}

  /// Block until all participating threads have arrived
  void wait() {
    std::unique_lock<std::mutex> lk{mutex};
    count--;
    if (count == 0) {
      cv.notify_all();  // Last arrival wakes everyone else
      return;
    }
    cv.wait(lk, [this] { return count == 0; });
  }

 private:
  std::mutex mutex;
  std::condition_variable cv;
  std::size_t count;  // Threads yet to arrive; barrier is not reusable
};
Barrier *barrier;
Each partition hosts a contiguous 70 | /// range of keys {1, ..., max_key} 71 | static inline size_t gen_key(size_t offset_in_partition, size_t thread_id) { 72 | assert(thread_id <= 31); 73 | return ((offset_in_partition << 5) | thread_id); 74 | } 75 | 76 | typedef table::HashMap HashMap; 77 | 78 | size_t populate(HashMap *hashmap, size_t thread_id) { 79 | bool is_set_arr[table::kMaxBatchSize]; 80 | Key key_arr[table::kMaxBatchSize]; 81 | Value val_arr[table::kMaxBatchSize]; 82 | Key *key_ptr_arr[table::kMaxBatchSize]; 83 | Value *val_ptr_arr[table::kMaxBatchSize]; 84 | bool success_arr[table::kMaxBatchSize]; 85 | 86 | size_t num_success = 0; 87 | 88 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 89 | key_ptr_arr[i] = &key_arr[i]; 90 | val_ptr_arr[i] = &val_arr[i]; 91 | } 92 | 93 | const size_t num_keys_to_insert = 94 | roundup(FLAGS_table_key_capacity); 95 | size_t progress_console_lim = num_keys_to_insert / 10; 96 | 97 | for (size_t i = 1; i <= num_keys_to_insert; i += table::kMaxBatchSize) { 98 | for (size_t j = 0; j < table::kMaxBatchSize; j++) { 99 | is_set_arr[j] = true; 100 | size_t offset_in_partition = (i + j); 101 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 102 | val_arr[j].val_frag[0] = key_arr[j].key_frag[0]; 103 | } 104 | 105 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 106 | val_ptr_arr, success_arr, table::kMaxBatchSize); 107 | 108 | if (i >= progress_console_lim) { 109 | printf("thread %zu: %.2f percent done\n", thread_id, 110 | i * 1.0 / num_keys_to_insert); 111 | progress_console_lim += num_keys_to_insert / 10; 112 | } 113 | 114 | for (size_t j = 0; j < table::kMaxBatchSize; j++) { 115 | num_success += success_arr[j]; 116 | if (!success_arr[j]) { 117 | printf("thread %zu: populate() failed at key %zu of %zu keys\n", 118 | thread_id, i + j, num_keys_to_insert); 119 | return num_success; 120 | } 121 | } 122 | } 123 | 124 | return FLAGS_table_key_capacity; // All keys were added 125 | } 126 | 127 | 
enum class Workload { kGets, kSets, k5050 }; 128 | double batch_exp(HashMap *hashmap, size_t max_key, size_t batch_size, 129 | Workload workload, size_t thread_id) { 130 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 131 | constexpr size_t kNumIters = MB(1); 132 | 133 | struct timespec start; 134 | bool is_set_arr[table::kMaxBatchSize]; 135 | Key key_arr[table::kMaxBatchSize]; 136 | Value val_arr[table::kMaxBatchSize]; 137 | Key *key_ptr_arr[table::kMaxBatchSize]; 138 | Value *val_ptr_arr[table::kMaxBatchSize]; 139 | bool success_arr[table::kMaxBatchSize]; 140 | clock_gettime(CLOCK_REALTIME, &start); 141 | 142 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 143 | key_ptr_arr[i] = &key_arr[i]; 144 | val_ptr_arr[i] = &val_arr[i]; 145 | } 146 | 147 | size_t num_success = 0; 148 | for (size_t i = 1; i <= kNumIters; i += batch_size) { 149 | for (size_t j = 0; j < batch_size; j++) { 150 | switch (workload) { 151 | case Workload::kGets: is_set_arr[j] = false; break; 152 | case Workload::kSets: is_set_arr[j] = true; break; 153 | case Workload::k5050: is_set_arr[j] = pcg() % 2 == 0; break; 154 | } 155 | 156 | size_t offset_in_partition = 1 + fastrange64(pcg(), max_key - 1); 157 | 158 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 159 | val_arr[j].val_frag[0] = is_set_arr[j] ? 
key_arr[j].key_frag[0] : 0; 160 | } 161 | 162 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 163 | val_ptr_arr, success_arr, batch_size); 164 | 165 | for (size_t j = 0; j < batch_size; j++) { 166 | num_success += success_arr[j]; 167 | if (!is_set_arr[j] && val_arr[j].val_frag[0] != key_arr[j].key_frag[0]) { 168 | printf("invalid value %zu for key %zu\n", val_arr[j].val_frag[0], 169 | key_arr[j].key_frag[0]); 170 | } 171 | } 172 | } 173 | 174 | double seconds = sec_since(start); 175 | double tput = kNumIters / (seconds * 1000000); 176 | return tput; 177 | } 178 | 179 | void thread_func(size_t thread_id) { 180 | size_t bytes_per_map = HashMap::get_required_bytes(FLAGS_table_key_capacity); 181 | bytes_per_map = roundup<256>(bytes_per_map); 182 | 183 | auto *hashmap = new HashMap(FLAGS_pmem_file, thread_id * bytes_per_map, 184 | FLAGS_table_key_capacity); 185 | 186 | printf("thread %zu: Populating hashmap. Expected time = %.1f seconds\n", 187 | thread_id, FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 188 | 189 | size_t max_key = populate(hashmap, thread_id); 190 | printf("thread %zu: final occupancy = %.2f\n", thread_id, 191 | max_key * 1.0 / FLAGS_table_key_capacity); 192 | 193 | std::vector tput_vec; 194 | Workload workload; 195 | if (FLAGS_benchmark == "set") workload = Workload::kSets; 196 | if (FLAGS_benchmark == "get") workload = Workload::kGets; 197 | if (FLAGS_benchmark == "5050") workload = Workload::k5050; 198 | 199 | printf("thread %zu, done populating. 
waiting for others.\n", thread_id); 200 | barrier->wait(); 201 | printf("thread %zu, starting work.\n", thread_id); 202 | 203 | for (size_t i = 0; i < 10; i++) { 204 | double tput = 205 | batch_exp(hashmap, max_key, FLAGS_batch_size, workload, thread_id); 206 | printf("thread %zu, iter %zu: tput = %.2f\n", thread_id, i, tput); 207 | tput_vec.push_back(tput); 208 | } 209 | 210 | double avg_tput = 211 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 212 | double _stddev = stddev(tput_vec); 213 | 214 | printf("thread %zu of %zu final M/s: %.2f avg, %.2f stddev\n", thread_id, 215 | FLAGS_num_threads, avg_tput, _stddev); 216 | 217 | delete hashmap; 218 | } 219 | 220 | void sweep_do_one(HashMap *hashmap, size_t max_key, size_t batch_size, 221 | Workload workload) { 222 | std::vector tput_vec; 223 | 224 | for (size_t i = 0; i < 10; i++) { 225 | double tput; 226 | tput = batch_exp(hashmap, max_key, batch_size, workload, 0 /* thread_id */); 227 | tput_vec.push_back(tput); 228 | } 229 | 230 | double avg_tput = 231 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 232 | double _stddev = stddev(tput_vec); 233 | 234 | printf(" Tput (M/s) = %.2f avg, %.2f stddev\n", avg_tput, _stddev); 235 | } 236 | 237 | // Measure the effectiveness of optimizations with one thread 238 | void sweep_optimizations() { 239 | auto *hashmap = new HashMap(FLAGS_pmem_file, 0, FLAGS_table_key_capacity); 240 | 241 | printf("Populating hashmap. Expected time = %.1f seconds\n", 242 | FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 243 | 244 | size_t max_key = populate(hashmap, 0 /* thread_id */); 245 | printf("Final occupancy = %.2f\n", max_key * 1.0 / FLAGS_table_key_capacity); 246 | 247 | std::vector batch_size_vec = {1, 4, 8, 16}; 248 | 249 | // GET batch sizes 250 | for (auto &batch_size : batch_size_vec) { 251 | printf("get. 
Batch size %zu\n", batch_size); 252 | sweep_do_one(hashmap, max_key, batch_size, Workload::kGets); 253 | } 254 | 255 | // SET batch sizes 256 | for (auto &batch_size : batch_size_vec) { 257 | printf("set. Batch size %zu\n", batch_size); 258 | sweep_do_one(hashmap, max_key, batch_size, Workload::kSets); 259 | } 260 | 261 | // 50/50 batch sizes 262 | for (auto &batch_size : batch_size_vec) { 263 | printf("50/50. Batch size %zu\n", batch_size); 264 | sweep_do_one(hashmap, max_key, batch_size, Workload::k5050); 265 | } 266 | 267 | // Optimizations for GETs 268 | hashmap->opts.prefetch = false; 269 | printf("get. Batch size 16, no prefetch.\n"); 270 | sweep_do_one(hashmap, max_key, 16, Workload::kGets); 271 | hashmap->opts.reset(); 272 | 273 | // Optimizations for SETs, and 50/50 274 | hashmap->opts.redo_batch = false; 275 | printf("set. Batch size 16, only redo batch disabled\n"); 276 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 277 | printf("50/50. Batch size 16, only redo batch disabled\n"); 278 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 279 | hashmap->opts.reset(); 280 | 281 | hashmap->opts.prefetch = false; 282 | printf("set. Batch size 16, only prefetch disabled.\n"); 283 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 284 | printf("50/50. Batch size 16, only prefetch disabled\n"); 285 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 286 | hashmap->opts.reset(); 287 | 288 | hashmap->opts.async_drain = false; 289 | printf("set. Batch size 16, only async slot drain disabled.\n"); 290 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 291 | printf("50/50. 
Batch size 16, only async slot drain disabled.\n"); 292 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 293 | hashmap->opts.reset(); 294 | 295 | delete hashmap; 296 | } 297 | 298 | int main(int argc, char **argv) { 299 | gflags::ParseCommandLineFlags(&argc, &argv, true); 300 | 301 | if (FLAGS_sweep_optimizations == 1) { 302 | std::thread t = std::thread(sweep_optimizations); 303 | bind_to_core(t, kNumaNode, 0); 304 | t.join(); 305 | exit(0); 306 | } 307 | 308 | barrier = new Barrier(FLAGS_num_threads); 309 | std::vector threads(FLAGS_num_threads); 310 | 311 | printf("Launching %zu threads\n", FLAGS_num_threads); 312 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 313 | threads[i] = std::thread(thread_func, i); 314 | bind_to_core(threads[i], kNumaNode, i); 315 | } 316 | 317 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 318 | threads[i].join(); 319 | } 320 | 321 | delete barrier; 322 | } 323 | -------------------------------------------------------------------------------- /mica_pmem/bench.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../common.h" 7 | #include "pmica.h" 8 | 9 | #define table pmica 10 | 11 | DEFINE_string(pmem_file, "/dev/dax12.0", "Persistent memory file name"); 12 | DEFINE_uint64(table_key_capacity, MB(1), "Number of keys in table per thread"); 13 | DEFINE_uint64(batch_size, table::kMaxBatchSize, "Batch size"); 14 | DEFINE_string(benchmark, "get", "Benchmark to run"); 15 | DEFINE_uint64(num_threads, 1, "Number of threads"); 16 | DEFINE_uint64(sweep_optimizations, 0, "Sweep optimizations"); 17 | 18 | // 19 | // Overhead to occupancy map: 20 | // 0.05 -> 0.56 21 | static constexpr double kDefaultOverhead = 0.05; 22 | static constexpr double kNumaNode = 0; 23 | 24 | class Key { 25 | public: 26 | size_t key_frag[2]; 27 | bool operator==(const Key &rhs) const { 28 | return memcmp(this, &rhs, sizeof(Key)) == 0; 29 | } 30 | 
/// Map the 64-bit random number \p rand to a uniform value in [0, n) using a
/// widening multiply and shift instead of a modulo (Lemire's fast reduction).
static inline uint64_t fastrange64(uint64_t rand, uint64_t n) {
  return static_cast<uint64_t>(
      static_cast<__uint128_t>(rand) * static_cast<__uint128_t>(n) >> 64);
}
/// Generate a key for a thread's partition. Each partition hosts a contiguous
/// range of keys {1, ..., max_key}; the low five bits carry the thread ID so
/// partitions never collide.
static inline size_t gen_key(size_t offset_in_partition, size_t thread_id) {
  assert(thread_id <= 31);  // Only five low bits are reserved for the ID
  size_t key = offset_in_partition << 5;
  key |= thread_id;
  return key;
}
enum class Workload { kGets, kSets, k5050 }; 128 | double batch_exp(HashMap *hashmap, size_t max_key, size_t batch_size, 129 | Workload workload, size_t thread_id) { 130 | pcg64_fast pcg(pcg_extras::seed_seq_from{}); 131 | constexpr size_t kNumIters = MB(1); 132 | 133 | struct timespec start; 134 | bool is_set_arr[table::kMaxBatchSize]; 135 | Key key_arr[table::kMaxBatchSize]; 136 | Value val_arr[table::kMaxBatchSize]; 137 | Key *key_ptr_arr[table::kMaxBatchSize]; 138 | Value *val_ptr_arr[table::kMaxBatchSize]; 139 | bool success_arr[table::kMaxBatchSize]; 140 | clock_gettime(CLOCK_REALTIME, &start); 141 | 142 | for (size_t i = 0; i < table::kMaxBatchSize; i++) { 143 | key_ptr_arr[i] = &key_arr[i]; 144 | val_ptr_arr[i] = &val_arr[i]; 145 | } 146 | 147 | size_t num_success = 0; 148 | for (size_t i = 1; i <= kNumIters; i += batch_size) { 149 | for (size_t j = 0; j < batch_size; j++) { 150 | switch (workload) { 151 | case Workload::kGets: is_set_arr[j] = false; break; 152 | case Workload::kSets: is_set_arr[j] = true; break; 153 | case Workload::k5050: is_set_arr[j] = pcg() % 2 == 0; break; 154 | } 155 | 156 | size_t offset_in_partition = 1 + fastrange64(pcg(), max_key - 1); 157 | 158 | key_arr[j].key_frag[0] = gen_key(offset_in_partition, thread_id); 159 | val_arr[j].val_frag[0] = is_set_arr[j] ? 
key_arr[j].key_frag[0] : 0; 160 | } 161 | 162 | hashmap->batch_op_drain(is_set_arr, const_cast(key_ptr_arr), 163 | val_ptr_arr, success_arr, batch_size); 164 | 165 | for (size_t j = 0; j < batch_size; j++) { 166 | num_success += success_arr[j]; 167 | if (!is_set_arr[j] && val_arr[j].val_frag[0] != key_arr[j].key_frag[0]) { 168 | printf("invalid value %zu for key %zu\n", val_arr[j].val_frag[0], 169 | key_arr[j].key_frag[0]); 170 | } 171 | } 172 | } 173 | 174 | double seconds = sec_since(start); 175 | double tput = kNumIters / (seconds * 1000000); 176 | return tput; 177 | } 178 | 179 | void thread_func(size_t thread_id) { 180 | size_t bytes_per_map = 181 | HashMap::get_required_bytes(FLAGS_table_key_capacity, kDefaultOverhead); 182 | bytes_per_map = roundup<256>(bytes_per_map); 183 | 184 | auto *hashmap = new HashMap(FLAGS_pmem_file, thread_id * bytes_per_map, 185 | FLAGS_table_key_capacity, kDefaultOverhead); 186 | 187 | printf("thread %zu: Populating hashmap. Expected time = %.1f seconds\n", 188 | thread_id, FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 189 | 190 | size_t max_key = populate(hashmap, thread_id); 191 | printf("thread %zu: final occupancy = %.2f\n", thread_id, 192 | max_key * 1.0 / hashmap->get_key_capacity()); 193 | 194 | std::vector tput_vec; 195 | Workload workload; 196 | if (FLAGS_benchmark == "set") workload = Workload::kSets; 197 | if (FLAGS_benchmark == "get") workload = Workload::kGets; 198 | if (FLAGS_benchmark == "5050") workload = Workload::k5050; 199 | 200 | printf("thread %zu, done populating. 
waiting for others.\n", thread_id); 201 | barrier->wait(); 202 | printf("thread %zu, starting work.\n", thread_id); 203 | 204 | for (size_t i = 0; i < 10; i++) { 205 | double tput = 206 | batch_exp(hashmap, max_key, FLAGS_batch_size, workload, thread_id); 207 | printf("thread %zu, iter %zu: tput = %.2f\n", thread_id, i, tput); 208 | tput_vec.push_back(tput); 209 | } 210 | 211 | double avg_tput = 212 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 213 | double _stddev = stddev(tput_vec); 214 | 215 | printf("thread %zu of %zu final M/s: %.2f avg, %.2f stddev\n", thread_id, 216 | FLAGS_num_threads, avg_tput, _stddev); 217 | 218 | delete hashmap; 219 | } 220 | 221 | // Measure the effectiveness of optimizations with one thread, given a config 222 | void sweep_do_one(HashMap *hashmap, size_t max_key, size_t batch_size, 223 | Workload workload) { 224 | std::vector tput_vec; 225 | 226 | for (size_t msr = 0; msr < 3; msr++) { 227 | double tput; 228 | tput = batch_exp(hashmap, max_key, batch_size, workload, 0 /* thread_id */); 229 | tput_vec.push_back(tput); 230 | } 231 | 232 | double avg_tput = 233 | std::accumulate(tput_vec.begin(), tput_vec.end(), 0.0) / tput_vec.size(); 234 | double _stddev = stddev(tput_vec); 235 | 236 | printf(" Tput (M/s) = %.2f avg, %.2f stddev\n", avg_tput, _stddev); 237 | } 238 | 239 | // Measure the effectiveness of optimizations with one thread 240 | void sweep_optimizations() { 241 | auto *hashmap = new HashMap(FLAGS_pmem_file, 0, FLAGS_table_key_capacity, 242 | kDefaultOverhead); 243 | 244 | printf("Populating hashmap. 
Expected time = %.1f seconds\n", 245 | FLAGS_table_key_capacity / (4.0 * 1000000)); // 4 M/s 246 | 247 | size_t max_key = populate(hashmap, 0 /* thread_id */); 248 | printf("Final occupancy = %.2f\n", 249 | max_key * 1.0 / hashmap->get_key_capacity()); 250 | 251 | std::vector batch_size_vec = {1, 4, 8, 16}; 252 | 253 | // SET batch sizes 254 | for (auto &batch_size : batch_size_vec) { 255 | printf("set. Batch size %zu\n", batch_size); 256 | sweep_do_one(hashmap, max_key, batch_size, Workload::kSets); 257 | } 258 | 259 | // GET batch sizes 260 | for (auto &batch_size : batch_size_vec) { 261 | printf("get. Batch size %zu\n", batch_size); 262 | sweep_do_one(hashmap, max_key, batch_size, Workload::kGets); 263 | } 264 | 265 | // 50/50 batch sizes 266 | for (auto &batch_size : batch_size_vec) { 267 | printf("50/50. Batch size %zu\n", batch_size); 268 | sweep_do_one(hashmap, max_key, batch_size, Workload::k5050); 269 | } 270 | 271 | // Optimizations for GETs 272 | hashmap->opts.prefetch = false; 273 | printf("get. Batch size 16, no prefetch.\n"); 274 | sweep_do_one(hashmap, max_key, 16, Workload::kGets); 275 | hashmap->opts.reset(); 276 | 277 | // Optimizations for SETs, and 50/50 278 | hashmap->opts.redo_batch = false; 279 | printf("set. Batch size 16, only redo batch disabled\n"); 280 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 281 | printf("50/50. Batch size 16, only redo batch disabled\n"); 282 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 283 | hashmap->opts.reset(); 284 | 285 | hashmap->opts.prefetch = false; 286 | printf("set. Batch size 16, only prefetch disabled.\n"); 287 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 288 | printf("50/50. Batch size 16, only prefetch disabled\n"); 289 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 290 | hashmap->opts.reset(); 291 | 292 | hashmap->opts.async_drain = false; 293 | printf("set. 
Batch size 16, only async slot drain disabled.\n"); 294 | sweep_do_one(hashmap, max_key, 16, Workload::kSets); 295 | printf("50/50. Batch size 16, only async slot drain disabled.\n"); 296 | sweep_do_one(hashmap, max_key, 16, Workload::k5050); 297 | hashmap->opts.reset(); 298 | 299 | delete hashmap; 300 | } 301 | 302 | int main(int argc, char **argv) { 303 | gflags::ParseCommandLineFlags(&argc, &argv, true); 304 | 305 | if (FLAGS_sweep_optimizations == 1) { 306 | std::thread t = std::thread(sweep_optimizations); 307 | bind_to_core(t, kNumaNode, 0); 308 | t.join(); 309 | exit(0); 310 | } 311 | 312 | barrier = new Barrier(FLAGS_num_threads); 313 | std::vector threads(FLAGS_num_threads); 314 | 315 | printf("Launching %zu threads\n", FLAGS_num_threads); 316 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 317 | threads[i] = std::thread(thread_func, i); 318 | bind_to_core(threads[i], kNumaNode, i); 319 | } 320 | 321 | for (size_t i = 0; i < FLAGS_num_threads; i++) { 322 | threads[i].join(); 323 | } 324 | 325 | delete barrier; 326 | } 327 | -------------------------------------------------------------------------------- /nvme_perf/c/hello.c: -------------------------------------------------------------------------------- 1 | /*- 2 | * BSD LICENSE 3 | * 4 | * Copyright (c) Intel Corporation. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above copyright 14 | * notice, this list of conditions and the following disclaimer in 15 | * the documentation and/or other materials provided with the 16 | * distribution. 
/* Linked-list node tracking one attached NVMe controller. */
struct ctrlr_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct ctrlr_entry *next;
	char name[1024];	/* "model (serial)" string built in attach_cb() */
};

/* Linked-list node tracking one active namespace and the qpair used for
 * submitting I/O to it.
 */
struct ns_entry {
	struct spdk_nvme_ctrlr *ctrlr;
	struct spdk_nvme_ns *ns;
	struct ns_entry *next;
	struct spdk_nvme_qpair *qpair;
};

/* Global singly-linked lists, populated during controller probe/attach and
 * torn down in cleanup().
 */
static struct ctrlr_entry *g_controllers = NULL;
static struct ns_entry *g_namespaces = NULL;
During initialization, the IDENTIFY data for the 64 | * controller is read using an NVMe admin command, and that data 65 | * can be retrieved using spdk_nvme_ctrlr_get_data() to get 66 | * detailed information on the controller. Refer to the NVMe 67 | * specification for more details on IDENTIFY for NVMe controllers. 68 | */ 69 | cdata = spdk_nvme_ctrlr_get_data(ctrlr); 70 | 71 | if (!spdk_nvme_ns_is_active(ns)) { 72 | printf("Controller %-20.20s (%-20.20s): Skipping inactive NS %u\n", 73 | cdata->mn, cdata->sn, 74 | spdk_nvme_ns_get_id(ns)); 75 | return; 76 | } 77 | 78 | entry = malloc(sizeof(struct ns_entry)); 79 | if (entry == NULL) { 80 | perror("ns_entry malloc"); 81 | exit(1); 82 | } 83 | 84 | entry->ctrlr = ctrlr; 85 | entry->ns = ns; 86 | entry->next = g_namespaces; 87 | g_namespaces = entry; 88 | 89 | printf(" Namespace ID: %d size: %juGB\n", spdk_nvme_ns_get_id(ns), 90 | spdk_nvme_ns_get_size(ns) / 1000000000); 91 | } 92 | 93 | struct hello_world_sequence { 94 | struct ns_entry *ns_entry; 95 | char *buf; 96 | unsigned using_cmb_io; 97 | int is_completed; 98 | }; 99 | 100 | static void 101 | read_complete(void *arg, const struct spdk_nvme_cpl *) 102 | { 103 | struct hello_world_sequence *sequence = arg; 104 | 105 | /* 106 | * The read I/O has completed. Print the contents of the 107 | * buffer, free the buffer, then mark the sequence as 108 | * completed. This will trigger the hello_world() function 109 | * to exit its polling loop. 110 | */ 111 | printf("%s", sequence->buf); 112 | spdk_free(sequence->buf); 113 | sequence->is_completed = 1; 114 | } 115 | 116 | static void 117 | write_complete(void *arg, const struct spdk_nvme_cpl *) 118 | { 119 | struct hello_world_sequence *sequence = arg; 120 | struct ns_entry *ns_entry = sequence->ns_entry; 121 | int rc; 122 | 123 | /* 124 | * The write I/O has completed. Free the buffer associated with 125 | * the write I/O and allocate a new zeroed buffer for reading 126 | * the data back from the NVMe namespace. 
/*
 * For each registered namespace: allocate an I/O qpair, write "Hello world!"
 * to LBA 0, poll until the chained write->read sequence completes, then free
 * the qpair. Completion callbacks (write_complete/read_complete) drive the
 * read-back and set sequence.is_completed.
 */
static void
hello_world(void)
{
	struct ns_entry *ns_entry;
	struct hello_world_sequence sequence;
	int rc;

	ns_entry = g_namespaces;
	while (ns_entry != NULL) {
		/*
		 * Allocate an I/O qpair for submitting read/write requests. The
		 * SPDK NVMe driver provides no synchronization for qpair access:
		 * only a single thread may submit and poll a given qpair.
		 */
		ns_entry->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ns_entry->ctrlr, NULL, 0);
		if (ns_entry->qpair == NULL) {
			printf("ERROR: spdk_nvme_ctrlr_alloc_io_qpair() failed\n");
			return;
		}

		/*
		 * Prefer the controller memory buffer (CMB) for the 4KB write
		 * buffer; fall back to pinned host memory if the controller has
		 * no CMB. Pinned memory is required for SPDK NVMe data buffers.
		 */
		sequence.using_cmb_io = 1;
		sequence.buf = spdk_nvme_ctrlr_alloc_cmb_io_buffer(ns_entry->ctrlr, 0x1000);
		if (sequence.buf == NULL) {
			sequence.using_cmb_io = 0;
			sequence.buf = spdk_zmalloc(0x1000, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
		}
		if (sequence.buf == NULL) {
			printf("ERROR: write buffer allocation failed\n");
			return;
		}
		if (sequence.using_cmb_io) {
			printf("INFO: using controller memory buffer for IO\n");
		} else {
			printf("INFO: using host memory buffer for IO\n");
		}
		sequence.is_completed = 0;
		sequence.ns_entry = ns_entry;

		/* Payload that will be written to LBA 0 and later read back */
		snprintf(sequence.buf, 0x1000, "%s", "Hello world!\n");

		/*
		 * Write the buffer to LBA 0. write_complete() is invoked with
		 * &sequence when the I/O finishes; it then submits the read-back.
		 */
		rc = spdk_nvme_ns_cmd_write(ns_entry->ns, ns_entry->qpair, sequence.buf,
					    0, /* LBA start */
					    1, /* number of LBAs */
					    write_complete, &sequence, 0);
		if (rc != 0) {
			fprintf(stderr, "starting write I/O failed\n");
			exit(1);
		}

		/*
		 * Poll for completions (0 = process all available). The driver
		 * only checks completions inside this call; it never blocks.
		 * read_complete() sets is_completed = 1 to break the loop.
		 */
		while (!sequence.is_completed) {
			spdk_nvme_qpair_process_completions(ns_entry->qpair, 0);
		}

		/* All I/O on this qpair is done, so it is safe to free it */
		spdk_nvme_ctrlr_free_io_qpair(ns_entry->qpair);
		ns_entry = ns_entry->next;
	}
}
292 | */ 293 | num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); 294 | printf("Using controller %s with %d namespaces.\n", entry->name, num_ns); 295 | for (nsid = 1; nsid <= num_ns; nsid++) { 296 | ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); 297 | if (ns == NULL) { 298 | continue; 299 | } 300 | register_ns(ctrlr, ns); 301 | } 302 | } 303 | 304 | static void 305 | cleanup(void) 306 | { 307 | struct ns_entry *ns_entry = g_namespaces; 308 | struct ctrlr_entry *ctrlr_entry = g_controllers; 309 | 310 | while (ns_entry) { 311 | struct ns_entry *next = ns_entry->next; 312 | free(ns_entry); 313 | ns_entry = next; 314 | } 315 | 316 | while (ctrlr_entry) { 317 | struct ctrlr_entry *next = ctrlr_entry->next; 318 | 319 | spdk_nvme_detach(ctrlr_entry->ctrlr); 320 | free(ctrlr_entry); 321 | ctrlr_entry = next; 322 | } 323 | } 324 | 325 | int main(int argc, char **argv) 326 | { 327 | int rc; 328 | struct spdk_env_opts opts; 329 | 330 | /* 331 | * SPDK relies on an abstraction around the local environment 332 | * named env that handles memory allocation and PCI device operations. 333 | * This library must be initialized first. 334 | * 335 | */ 336 | spdk_env_opts_init(&opts); 337 | opts.name = "hello_world"; 338 | opts.shm_id = 0; 339 | if (spdk_env_init(&opts) < 0) { 340 | fprintf(stderr, "Unable to initialize SPDK env\n"); 341 | return 1; 342 | } 343 | 344 | printf("Initializing NVMe Controllers\n"); 345 | 346 | /* 347 | * Start the SPDK NVMe enumeration process. probe_cb will be called 348 | * for each NVMe controller found, giving our application a choice on 349 | * whether to attach to each controller. attach_cb will then be 350 | * called for each controller after the SPDK NVMe driver has completed 351 | * initializing the controller we chose to attach. 
352 | */ 353 | rc = spdk_nvme_probe(NULL, NULL, probe_cb, attach_cb, NULL); 354 | if (rc != 0) { 355 | fprintf(stderr, "spdk_nvme_probe() failed\n"); 356 | cleanup(); 357 | return 1; 358 | } 359 | 360 | if (g_controllers == NULL) { 361 | fprintf(stderr, "no NVMe controllers found\n"); 362 | cleanup(); 363 | return 1; 364 | } 365 | 366 | printf("Initialization complete.\n"); 367 | hello_world(); 368 | cleanup(); 369 | return 0; 370 | } 371 | -------------------------------------------------------------------------------- /ioat/huge_alloc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file huge_alloc.h 3 | * @brief A header-only fast hugepage allocator with no dependencies 4 | * @author Anuj Kalia 5 | * @date 2018-09-25 6 | */ 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | namespace hugealloc { 29 | 30 | static constexpr size_t kHugepageSize = (2 * 1024 * 1024); ///< Hugepage size 31 | 32 | // 33 | // Utility classes for HugeAlloc 34 | // 35 | 36 | template 37 | static constexpr inline bool is_power_of_two(T x) { 38 | return x && ((x & T(x - 1)) == 0); 39 | } 40 | template 41 | static constexpr inline T round_up(T x) { 42 | static_assert(is_power_of_two(power_of_two_number), 43 | "PowerOfTwoNumber must be a power of 2"); 44 | return ((x) + T(power_of_two_number - 1)) & (~T(power_of_two_number - 1)); 45 | } 46 | 47 | static inline void rt_assert(bool condition, std::string throw_str, char *s) { 48 | if (!condition) { 49 | throw std::runtime_error(throw_str + std::string(s)); 50 | } 51 | } 52 | 53 | /// Check a condition at runtime. If the condition is false, throw exception. 
54 | static inline void rt_assert(bool condition, std::string throw_str) { 55 | if (!condition) throw std::runtime_error(throw_str); 56 | } 57 | 58 | /// Check a condition at runtime. If the condition is false, throw exception. 59 | /// This is faster than rt_assert(cond, str) as it avoids string construction. 60 | static inline void rt_assert(bool condition) { 61 | if (!condition) throw std::runtime_error("Error"); 62 | } 63 | 64 | /// High-quality but slow RNG 65 | class SlowRand { 66 | std::random_device rand_dev; // Non-pseudorandom seed for twister 67 | std::mt19937_64 mt; 68 | std::uniform_int_distribution dist; 69 | 70 | public: 71 | SlowRand() : mt(rand_dev()), dist(0, UINT64_MAX) {} 72 | 73 | inline uint64_t next_u64() { return dist(mt); } 74 | }; 75 | 76 | // 77 | // Definitions for HugeAlloc 78 | // 79 | 80 | /// Information about an SHM region 81 | struct shm_region_t { 82 | // Constructor args 83 | const int shm_key; /// The key used to create the SHM region 84 | const uint8_t *buf; /// The start address of the allocated SHM buffer 85 | const size_t size; /// The size in bytes of the allocated SHM buffer 86 | 87 | shm_region_t(int shm_key, uint8_t *buf, size_t size) 88 | : shm_key(shm_key), buf(buf), size(size) { 89 | assert(size % kHugepageSize == 0); 90 | } 91 | }; 92 | 93 | /// The hugepage allocator returns Buffers 94 | class Buffer { 95 | public: 96 | Buffer(uint8_t *buf, size_t class_size) : buf(buf), class_size(class_size) {} 97 | 98 | Buffer() {} 99 | 100 | ~Buffer() { 101 | // The hugepage allocator frees up memory for its Buffers 102 | } 103 | 104 | /// Return a string representation of this Buffer (excluding lkey) 105 | std::string to_string() const { 106 | char ret[100]; 107 | sprintf(ret, "[buf %p, class sz %zu]", buf, class_size); 108 | return std::string(ret); 109 | } 110 | 111 | /// The backing memory of this Buffer. The Buffer is invalid if this is null. 
112 | uint8_t *buf; 113 | size_t class_size; ///< The size of the hugealloc class used for this Buffer 114 | }; 115 | 116 | /// Return the index of the most significant bit of x. The index of the 2^0 117 | /// bit is 1. (x = 0 returns 0, x = 1 returns 1.) 118 | static inline size_t msb_index(int x) { 119 | assert(x < INT32_MAX / 2); 120 | int index; 121 | asm("bsrl %1, %0" : "=r"(index) : "r"(x << 1)); 122 | return static_cast(index); 123 | } 124 | 125 | /** 126 | * A hugepage allocator that uses per-class freelists. The minimum class size 127 | * is kMinClassSize, and class size increases by a factor of 2 until 128 | * kMaxClassSize. 129 | * 130 | * Large Buffers split into smaller Buffers when needed. Small Buffers never 131 | * merge into larger Buffers. 132 | * 133 | * When a new SHM region is added to the allocator, it is split into Buffers of 134 | * size kMaxClassSize and added to that class. These Buffers are later split to 135 | * fill up smaller classes. 136 | * 137 | * The allocator uses randomly generated positive SHM keys, and deallocates the 138 | * SHM regions it creates when deleted. 
139 | */ 140 | class HugeAlloc { 141 | public: 142 | static constexpr const char *alloc_fail_help_str = 143 | "This could be due to insufficient huge pages or SHM limits."; 144 | static const size_t kMinClassSize = 64; /// Min allocation size 145 | static const size_t kMinClassBitShift = 6; /// For division by kMinClassSize 146 | static_assert((kMinClassSize >> kMinClassBitShift) == 1, ""); 147 | 148 | static const size_t kMaxClassSize = 8 * 1024 * 1024; /// Max allocation size 149 | 150 | // We fill-in physical addresses only when splitting larger Buffers into 151 | // hugepage-sized buffers 152 | static_assert(kMaxClassSize >= 2 * kHugepageSize, ""); 153 | 154 | static const size_t kNumClasses = 18; /// 64 B (2^6), ..., 8 MB (2^23) 155 | static_assert(kMaxClassSize == kMinClassSize << (kNumClasses - 1), ""); 156 | 157 | /// Return the maximum size of a class 158 | static constexpr size_t class_max_size(size_t class_i) { 159 | return kMinClassSize * (1ull << class_i); 160 | } 161 | 162 | /** 163 | * @brief Construct the hugepage allocator 164 | * @throw runtime_error if construction fails 165 | */ 166 | HugeAlloc(size_t initial_size, size_t numa_node) : numa_node(numa_node) { 167 | if (initial_size < kMaxClassSize) initial_size = kMaxClassSize; 168 | prev_allocation_size = initial_size; 169 | reserve_hugepages(prev_allocation_size); 170 | } 171 | 172 | HugeAlloc(size_t numa_node) : numa_node(numa_node) { 173 | prev_allocation_size = kMaxClassSize; 174 | reserve_hugepages(prev_allocation_size); 175 | } 176 | 177 | ~HugeAlloc() { 178 | for (shm_region_t &shm_region : shm_list) { 179 | int ret = 180 | shmdt(static_cast(const_cast(shm_region.buf))); 181 | if (ret != 0) { 182 | fprintf(stderr, "HugeAlloc: Error freeing SHM buf for key %d.\n", 183 | shm_region.shm_key); 184 | exit(-1); 185 | } 186 | } 187 | } 188 | 189 | /** 190 | * @brief Reserve size bytes as huge pages by adding hugepage-backed Buffers 191 | * to freelists. 
192 | * 193 | * @return True if the allocation succeeds. False if the allocation fails 194 | * because no more hugepages are available. 195 | */ 196 | bool reserve_hugepages(size_t size) { 197 | Buffer buffer = alloc_raw(size); 198 | if (buffer.buf == nullptr) return false; 199 | 200 | // Add Buffers to the largest class 201 | size_t num_buffers = size / kMaxClassSize; 202 | assert(num_buffers >= 1); 203 | for (size_t i = 0; i < num_buffers; i++) { 204 | uint8_t *buf = buffer.buf + (i * kMaxClassSize); 205 | freelist[kNumClasses - 1].push_back(Buffer(buf, kMaxClassSize)); 206 | } 207 | return true; 208 | } 209 | 210 | /** 211 | * @brief Allocate memory using raw SHM operations, always bypassing the 212 | * allocator's freelists. Unlike alloc(), the size of the allocated memory 213 | * need not fit in the allocator's max class size. 214 | * 215 | * Allocated memory can be freed only when this allocator is destroyed, i.e., 216 | * free_buf() cannot be used. Use alloc() if freeing is needed. 217 | * 218 | * @param size The minimum size of the allocated memory 219 | * 220 | * @return The allocated hugepage-backed Buffer. buffer.buf is nullptr if we 221 | * ran out of memory. buffer.class_size is set to SIZE_MAX to indicate that 222 | * allocator classes were not used. 223 | * 224 | * @throw runtime_error if hugepage reservation failure is catastrophic 225 | */ 226 | Buffer alloc_raw(size_t size) { 227 | std::ostringstream xmsg; // The exception message 228 | size = round_up(size); 229 | int shm_key, shm_id; 230 | 231 | while (true) { 232 | // Choose a positive SHM key. Negative is fine but it looks scary in the 233 | // error message. 234 | shm_key = static_cast(slow_rand.next_u64()); 235 | shm_key = std::abs(shm_key); 236 | 237 | // Try to get an SHM region 238 | shm_id = shmget(shm_key, size, IPC_CREAT | IPC_EXCL | 0666 | SHM_HUGETLB); 239 | 240 | if (shm_id == -1) { 241 | switch (errno) { 242 | case EEXIST: 243 | continue; // shm_key already exists. Try again. 
244 | 245 | case EACCES: 246 | xmsg << "HugeAlloc: SHM allocation error. " 247 | << "Insufficient permissions."; 248 | throw std::runtime_error(xmsg.str()); 249 | 250 | case EINVAL: 251 | xmsg << "HugeAlloc: SHM allocation error: SHMMAX/SHMIN " 252 | << "mismatch. size = " << std::to_string(size) << " (" 253 | << std::to_string(size / (1024 * 1024)) << " MB)."; 254 | throw std::runtime_error(xmsg.str()); 255 | 256 | case ENOMEM: 257 | // Out of memory - this is OK 258 | return Buffer(nullptr, 0); 259 | 260 | default: 261 | xmsg << "HugeAlloc: Unexpected SHM malloc error " 262 | << strerror(errno); 263 | throw std::runtime_error(xmsg.str()); 264 | } 265 | } else { 266 | // shm_key worked. Break out of the while loop. 267 | break; 268 | } 269 | } 270 | 271 | uint8_t *shm_buf = static_cast(shmat(shm_id, nullptr, 0)); 272 | rt_assert(shm_buf != nullptr, 273 | "HugeAlloc: shmat() failed. Key = " + std::to_string(shm_key)); 274 | 275 | rt_assert(reinterpret_cast(shm_buf) % kHugepageSize == 0, 276 | "SHM buffer isn't aligned to hugepage size"); 277 | 278 | // Mark the SHM region for deletion when this process exits 279 | shmctl(shm_id, IPC_RMID, nullptr); 280 | 281 | // Bind the buffer to the NUMA node 282 | const unsigned long nodemask = 283 | (1ul << static_cast(numa_node)); 284 | long ret = mbind(shm_buf, size, MPOL_BIND, &nodemask, 32, 0); 285 | rt_assert(ret == 0, 286 | "HugeAlloc: mbind() failed. Key " + std::to_string(shm_key)); 287 | 288 | // Save the SHM region so we can free it later 289 | shm_list.push_back(shm_region_t(shm_key, shm_buf, size)); 290 | stats.shm_reserved += size; 291 | 292 | // buffer.class_size is invalid because we didn't allocate from a class 293 | return Buffer(shm_buf, SIZE_MAX); 294 | } 295 | 296 | /** 297 | * @brief Allocate a Buffer using the allocator's freelists, i.e., the max 298 | * size that can be allocated is the max freelist class size. 299 | * 300 | * @param size The minimum size of the allocated Buffer. 
size need not 301 | * equal a class size. 302 | * 303 | * @return The allocated buffer. The buffer is invalid if we ran out of 304 | * memory. The class_size of the allocated Buffer is equal to a 305 | * HugeAlloc class size. 306 | * 307 | * @throw runtime_error if size is too large for the allocator, or if 308 | * hugepage reservation failure is catastrophic 309 | */ 310 | Buffer alloc(size_t size) { 311 | assert(size <= kMaxClassSize); 312 | const size_t size_class = get_class(size); 313 | 314 | if (freelist[size_class].empty()) { 315 | // There is no free Buffer in this class. Find the first larger class with 316 | // free Buffers. 317 | size_t next_class = size_class + 1; 318 | for (; next_class < kNumClasses; next_class++) { 319 | if (!freelist[next_class].empty()) break; 320 | } 321 | 322 | if (next_class == kNumClasses) { 323 | // There's no larger size class with free pages, we need to allocate 324 | // more hugepages. This adds some Buffers to the largest class. 325 | prev_allocation_size *= 2; 326 | bool success = reserve_hugepages(prev_allocation_size); 327 | 328 | if (!success) { 329 | prev_allocation_size /= 2; // Restore the previous allocation 330 | return Buffer(nullptr, 0); 331 | } 332 | 333 | next_class = kNumClasses - 1; 334 | } 335 | 336 | while (next_class != size_class) { 337 | split(next_class); 338 | next_class--; 339 | } 340 | } 341 | 342 | assert(!freelist[size_class].empty()); 343 | 344 | Buffer buffer = freelist[size_class].back(); 345 | freelist[size_class].pop_back(); 346 | stats.user_alloc_tot += buffer.class_size; 347 | return buffer; 348 | } 349 | 350 | /// Free a Buffer 351 | inline void free_buf(Buffer buffer) { 352 | assert(buffer.buf != nullptr); 353 | 354 | size_t size_class = get_class(buffer.class_size); 355 | assert(class_max_size(size_class) == buffer.class_size); 356 | 357 | freelist[size_class].push_back(buffer); 358 | stats.user_alloc_tot -= buffer.class_size; 359 | } 360 | 361 | inline size_t get_numa_node() { return 
numa_node; } 362 | 363 | /// Return the total amount of memory reserved as hugepages 364 | inline size_t get_stat_shm_reserved() const { 365 | assert(stats.shm_reserved % kHugepageSize == 0); 366 | return stats.shm_reserved; 367 | } 368 | 369 | /// Return the total amoung of memory allocated to the user 370 | inline size_t get_stat_user_alloc_tot() const { 371 | assert(stats.user_alloc_tot % kMinClassSize == 0); 372 | return stats.user_alloc_tot; 373 | } 374 | 375 | private: 376 | /** 377 | * @brief Get the class index for a Buffer size 378 | * @param size The size of the buffer, which may or may not be a class size 379 | */ 380 | inline size_t get_class(size_t size) { 381 | assert(size >= 1 && size <= kMaxClassSize); 382 | // Use bit shift instead of division to make debug-mode code a faster 383 | return msb_index(static_cast((size - 1) >> kMinClassBitShift)); 384 | } 385 | 386 | /// Reference function for the optimized get_class function above 387 | inline size_t get_class_slow(size_t size) { 388 | assert(size >= 1 && size <= kMaxClassSize); 389 | 390 | size_t size_class = 0; // The size class for size 391 | size_t class_lim = kMinClassSize; // The max size for size_class 392 | while (size > class_lim) { 393 | size_class++; 394 | class_lim *= 2; 395 | } 396 | 397 | return size_class; 398 | } 399 | 400 | /// Split one Buffers from class size_class into two Buffers of the previous 401 | /// class. 
402 | inline void split(size_t size_class) { 403 | Buffer buffer = freelist[size_class].back(); 404 | freelist[size_class].pop_back(); 405 | 406 | Buffer buffer_0 = Buffer(buffer.buf, buffer.class_size / 2); 407 | Buffer buffer_1 = 408 | Buffer(buffer.buf + buffer.class_size / 2, buffer.class_size / 2); 409 | 410 | freelist[size_class - 1].push_back(buffer_0); 411 | freelist[size_class - 1].push_back(buffer_1); 412 | } 413 | 414 | std::vector shm_list; /// SHM regions by increasing alloc size 415 | std::vector freelist[kNumClasses]; /// Per-class freelist 416 | 417 | SlowRand slow_rand; /// RNG to generate SHM keys 418 | const size_t numa_node; /// NUMA node on which all memory is allocated 419 | size_t prev_allocation_size; /// Size of previous hugepage reservation 420 | 421 | // Stats 422 | struct { 423 | size_t shm_reserved = 0; /// Total hugepage memory reserved by allocator 424 | size_t user_alloc_tot = 0; /// Total memory allocated to user 425 | } stats; 426 | }; 427 | 428 | } // namespace hugealloc 429 | --------------------------------------------------------------------------------