├── README.md ├── java ├── jmaps └── openjdk8_b132-fp.diff ├── linux └── buildkernel_ec2xenial.sh ├── microbenchmarks ├── gettimeofdaybench.c └── microbench_ubuntu.sh ├── osx └── kernel_diagreport2text.ksh ├── perf_events ├── perf.md ├── perf1.md └── perfmaptidy.pl ├── s1bench ├── README.md ├── chart.png ├── out.seriesbench_kpti_c59xl_41412_100m_nopti_01.txt ├── out.seriesbench_kpti_c59xl_41412_100m_pti_nopcid_01.txt ├── out.seriesbench_kpti_c59xl_41412_100m_pti_pcid_01.txt ├── s1bench.c ├── series2cols.sh └── series_s1bench.sh ├── test └── date.txt └── xen ├── xen-4.6.0-vpmu-filter.diff ├── xen-features.pl └── xen-features.py /README.md: -------------------------------------------------------------------------------- 1 | Misc 2 | ==== 3 | 4 | Misc stuff. 5 | -------------------------------------------------------------------------------- /java/jmaps: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # jmaps - creates java /tmp/perf-PID.map symbol maps for all java processes. 4 | # 5 | # This is a helper script that finds all running "java" processes, then executes 6 | # perf-map-agent on them all, creating symbol map files in /tmp. These map files 7 | # are read by perf_events (aka "perf") when doing system profiles (specifically, 8 | # the "report" and "script" subcommands). 9 | # 10 | # USAGE: jmaps [-u] 11 | # -u # unfoldall: include inlined symbols 12 | # 13 | # My typical workflow is this: 14 | # 15 | # perf record -F 99 -a -g -- sleep 30; jmaps 16 | # perf script > out.stacks 17 | # ./stackcollapse-perf.pl out.stacks | ./flamegraph.pl --color=java --hash > out.stacks.svg 18 | # 19 | # The stackcollapse-perf.pl and flamegraph.pl programs come from: 20 | # https://github.com/brendangregg/FlameGraph 21 | # 22 | # REQUIREMENTS: 23 | # Tune two environment settings below. 24 | # 25 | # 13-Feb-2015 Brendan Gregg Created this. 26 | # 20-Feb-2017 " " Added -u for unfoldall. 
# Installation paths: tune these two for the local system.
JAVA_HOME=/usr/lib/jvm/java-8-oracle
AGENT_HOME=/usr/lib/jvm/perf-map-agent	# from https://github.com/jrudolph/perf-map-agent
debug=0		# non-zero: echo each attach command and time it
opts=""		# extra AttachOnce argument; reset so an inherited $opts is never used

# Precondition checks. Each failure is reported on stderr and exits non-zero:
# a bare "exit" would return the status of the preceding echo (0), making a
# failed run look successful to whatever invoked this script.
if [[ "$USER" != root ]]; then
	echo >&2 "ERROR: not root user? exiting..."
	exit 1
fi

if [[ ! -x "$JAVA_HOME" ]]; then
	echo >&2 "ERROR: JAVA_HOME not set correctly; edit $0 and fix"
	exit 1
fi

if [[ ! -x "$AGENT_HOME" ]]; then
	echo >&2 "ERROR: AGENT_HOME not set correctly; edit $0 and fix"
	exit 1
fi

# -u: ask the agent to include inlined symbols
if [[ "$1" == "-u" ]]; then
	opts=unfoldall
fi

# figure out where the agent files are: either under $AGENT_HOME/out or
# directly in $AGENT_HOME. The jar and the shared library are located
# independently of each other.
AGENT_OUT=""
AGENT_JAR=""
if [[ -e "$AGENT_HOME/out/attach-main.jar" ]]; then
	AGENT_JAR=$AGENT_HOME/out/attach-main.jar
elif [[ -e "$AGENT_HOME/attach-main.jar" ]]; then
	AGENT_JAR=$AGENT_HOME/attach-main.jar
fi
if [[ -e "$AGENT_HOME/out/libperfmap.so" ]]; then
	AGENT_OUT=$AGENT_HOME/out
elif [[ -e "$AGENT_HOME/libperfmap.so" ]]; then
	AGENT_OUT=$AGENT_HOME
fi
if [[ "$AGENT_OUT" == "" || "$AGENT_JAR" == "" ]]; then
	echo >&2 "ERROR: Missing perf-map-agent files in $AGENT_HOME. Check installation."
	exit 1
fi

# Fetch map for all "java" processes
echo "Fetching maps for all java processes..."
71 | for pid in $(pgrep -x java); do 72 | mapfile=/tmp/perf-$pid.map 73 | [[ -e $mapfile ]] && rm $mapfile 74 | 75 | cmd="cd $AGENT_OUT; $JAVA_HOME/bin/java -Xms32m -Xmx128m -cp $AGENT_JAR:$JAVA_HOME/lib/tools.jar net.virtualvoid.perf.AttachOnce $pid $opts" 76 | (( debug )) && echo $cmd 77 | 78 | user=$(ps ho user -p $pid) 79 | if [[ "$user" != root ]]; then 80 | # make $user the username if it is a UID: 81 | if [[ "$user" == [0-9]* ]]; then user=$(awk -F: '$3 == '$user' { print $1 }' /etc/passwd); fi 82 | cmd="sudo -u $user sh -c '$cmd'" 83 | fi 84 | 85 | echo "Mapping PID $pid (user $user):" 86 | if (( debug )); then 87 | time eval $cmd 88 | else 89 | eval $cmd 90 | fi 91 | if [[ -e "$mapfile" ]]; then 92 | chown root $mapfile 93 | chmod 666 $mapfile 94 | else 95 | echo "ERROR: $mapfile not created." 96 | fi 97 | 98 | echo "wc(1): $(wc $mapfile)" 99 | echo 100 | done 101 | -------------------------------------------------------------------------------- /java/openjdk8_b132-fp.diff: -------------------------------------------------------------------------------- 1 | --- openjdk8clean/hotspot/src/cpu/x86/vm/x86_64.ad 2014-03-04 02:52:11.000000000 +0000 2 | +++ openjdk8/hotspot/src/cpu/x86/vm/x86_64.ad 2014-11-08 01:10:49.686044933 +0000 3 | @@ -166,10 +166,9 @@ 4 | // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ ) 5 | // 6 | 7 | -// Class for all pointer registers (including RSP) 8 | +// Class for all pointer registers (including RSP, excluding RBP) 9 | reg_class any_reg(RAX, RAX_H, 10 | RDX, RDX_H, 11 | - RBP, RBP_H, 12 | RDI, RDI_H, 13 | RSI, RSI_H, 14 | RCX, RCX_H, 15 | @@ -184,10 +183,9 @@ 16 | R14, R14_H, 17 | R15, R15_H); 18 | 19 | -// Class for all pointer registers except RSP 20 | +// Class for all pointer registers except RSP and RBP 21 | reg_class ptr_reg(RAX, RAX_H, 22 | RDX, RDX_H, 23 | - RBP, RBP_H, 24 | RDI, RDI_H, 25 | RSI, RSI_H, 26 | RCX, RCX_H, 27 | @@ -199,9 +197,8 @@ 28 | R13, R13_H, 29 | R14, R14_H); 30 | 31 | -// Class 
for all pointer registers except RAX and RSP 32 | +// Class for all pointer registers except RAX, RSP and RBP 33 | reg_class ptr_no_rax_reg(RDX, RDX_H, 34 | - RBP, RBP_H, 35 | RDI, RDI_H, 36 | RSI, RSI_H, 37 | RCX, RCX_H, 38 | @@ -226,9 +223,8 @@ 39 | R13, R13_H, 40 | R14, R14_H); 41 | 42 | -// Class for all pointer registers except RAX, RBX and RSP 43 | +// Class for all pointer registers except RAX, RBX, RSP and RBP 44 | reg_class ptr_no_rax_rbx_reg(RDX, RDX_H, 45 | - RBP, RBP_H, 46 | RDI, RDI_H, 47 | RSI, RSI_H, 48 | RCX, RCX_H, 49 | @@ -260,10 +256,9 @@ 50 | // Singleton class for TLS pointer 51 | reg_class ptr_r15_reg(R15, R15_H); 52 | 53 | -// Class for all long registers (except RSP) 54 | +// Class for all long registers (except RSP and RBP) 55 | reg_class long_reg(RAX, RAX_H, 56 | RDX, RDX_H, 57 | - RBP, RBP_H, 58 | RDI, RDI_H, 59 | RSI, RSI_H, 60 | RCX, RCX_H, 61 | @@ -275,9 +270,8 @@ 62 | R13, R13_H, 63 | R14, R14_H); 64 | 65 | -// Class for all long registers except RAX, RDX (and RSP) 66 | -reg_class long_no_rax_rdx_reg(RBP, RBP_H, 67 | - RDI, RDI_H, 68 | +// Class for all long registers except RAX, RDX (and RSP, RBP) 69 | +reg_class long_no_rax_rdx_reg(RDI, RDI_H, 70 | RSI, RSI_H, 71 | RCX, RCX_H, 72 | RBX, RBX_H, 73 | @@ -288,9 +282,8 @@ 74 | R13, R13_H, 75 | R14, R14_H); 76 | 77 | -// Class for all long registers except RCX (and RSP) 78 | -reg_class long_no_rcx_reg(RBP, RBP_H, 79 | - RDI, RDI_H, 80 | +// Class for all long registers except RCX (and RSP, RBP) 81 | +reg_class long_no_rcx_reg(RDI, RDI_H, 82 | RSI, RSI_H, 83 | RAX, RAX_H, 84 | RDX, RDX_H, 85 | @@ -302,9 +295,8 @@ 86 | R13, R13_H, 87 | R14, R14_H); 88 | 89 | -// Class for all long registers except RAX (and RSP) 90 | -reg_class long_no_rax_reg(RBP, RBP_H, 91 | - RDX, RDX_H, 92 | +// Class for all long registers except RAX (and RSP, RBP) 93 | +reg_class long_no_rax_reg(RDX, RDX_H, 94 | RDI, RDI_H, 95 | RSI, RSI_H, 96 | RCX, RCX_H, 97 | @@ -325,10 +317,9 @@ 98 | // Singleton class for RDX 
long register 99 | reg_class long_rdx_reg(RDX, RDX_H); 100 | 101 | -// Class for all int registers (except RSP) 102 | +// Class for all int registers (except RSP and RBP) 103 | reg_class int_reg(RAX, 104 | RDX, 105 | - RBP, 106 | RDI, 107 | RSI, 108 | RCX, 109 | @@ -340,10 +331,9 @@ 110 | R13, 111 | R14); 112 | 113 | -// Class for all int registers except RCX (and RSP) 114 | +// Class for all int registers except RCX (and RSP, RBP) 115 | reg_class int_no_rcx_reg(RAX, 116 | RDX, 117 | - RBP, 118 | RDI, 119 | RSI, 120 | RBX, 121 | @@ -355,8 +345,7 @@ 122 | R14); 123 | 124 | // Class for all int registers except RAX, RDX (and RSP) 125 | -reg_class int_no_rax_rdx_reg(RBP, 126 | - RDI, 127 | +reg_class int_no_rax_rdx_reg(RDI, 128 | RSI, 129 | RCX, 130 | RBX, 131 | @@ -718,6 +707,7 @@ 132 | st->print("# stack bang"); 133 | st->print("\n\t"); 134 | st->print("pushq rbp\t# Save rbp"); 135 | + // BDG consider: st->print("movq rbp, rsp\t# "); 136 | if (framesize) { 137 | st->print("\n\t"); 138 | st->print("subq rsp, #%d\t# Create frame",framesize); 139 | --- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-04 02:52:11.000000000 +0000 140 | +++ openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-11-07 23:57:11.589593723 +0000 141 | @@ -5236,6 +5236,7 @@ 142 | // We always push rbp, so that on return to interpreter rbp, will be 143 | // restored correctly and we can correct the stack. 144 | push(rbp); 145 | + mov(rbp, rsp); 146 | // Remove word for ebp 147 | framesize -= wordSize; 148 | 149 | --- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-03-04 02:52:10.000000000 +0000 150 | +++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-11-07 23:57:21.933257882 +0000 151 | @@ -358,6 +358,7 @@ 152 | generate_stack_overflow_check(frame_size_in_bytes); 153 | 154 | push(rbp); 155 | + mov(rbp, rsp); 156 | #ifdef TIERED 157 | // c2 leaves fpu stack dirty. 
Clean it on entry 158 | if (UseSSE < 2 ) { 159 | -------------------------------------------------------------------------------- /linux/buildkernel_ec2xenial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # buildkernel_ec2xenial.sh linux kernel build on ubuntu-xenial/EC2. 4 | # 5 | # USAGE: buildkernel_ec2xenial.sh [-D] {tarball.xz | tarball.xz_URL} [patches_dir] 6 | # 7 | # -D # debuginfo 8 | # 9 | # eg, 10 | # ./buildkernel_ec2xenial.sh ../kernels/linux-3.13.tar.gz 11 | # ./buildkernel_ec2xenial.sh -D ../kernels/linux-3.13.tar.gz 12 | # ./buildkernel_ec2xenial.sh ../kernels/linux-3.13-patched.tar.gz 13 | # ./buildkernel_ec2xenial.sh ../kernels/linux-4.2.tar.gz mypatches01 14 | # ./buildkernel_ec2xenial.sh https://www.kernel.org/pub/linux/kernel/v4.x/testing/linux-4.2-rc5.tar.xz 15 | # 16 | # Can get kernel tar.xz URL from https://www.kernel.org 17 | # Can get patches from https://patchwork.kernel.org/project/LKML/list/ 18 | # 19 | # Kernel is built under /mnt/src/linux... . Kernel build output is written to 20 | # build.{stdout,stderr}. Install output to install.{stdout,stderr}. 21 | # 22 | # See fixapt_xenial for some of the xenial specifics. 23 | # 24 | # 24-Aug-2016 Brendan Gregg Created this. 25 | 26 | ### usage 27 | function usage { 28 | echo "USAGE: $0 [-D] {tarball.xz | tarball.xz_URL} [patches_dir]" 29 | exit 30 | } 31 | 32 | ### process options 33 | opt_debuginfo=0 34 | while getopts D opt 35 | do 36 | case $opt in 37 | D) opt_debuginfo=1 ;; 38 | h|?) usage ;; 39 | esac 40 | done 41 | shift $(( $OPTIND - 1 )) 42 | (( $# == 0 )) && usage 43 | kernel=$1 44 | src=/mnt/src 45 | 46 | ### functions 47 | function run { 48 | "$@" 49 | e=$? 
50 | if (( $e != 0 )); then 51 | echo >&2 "CMD : $@" 52 | echo >&2 "ERROR : exit status: $e, quitting" 53 | exit 54 | fi 55 | } 56 | 57 | function die { 58 | echo >&2 "$@" 59 | exit 1 60 | } 61 | 62 | did_update=0 63 | function addpkgs { 64 | all=1 65 | for pkg in "$@"; do 66 | if ! dpkg -s $pkg > /dev/null 2>&1; then all=0; fi 67 | done 68 | if (( all )); then 69 | echo "Packages already installed." 70 | else 71 | if (( ! did_update )); then 72 | sudo apt-get update 73 | did_update=1 74 | fi 75 | for pkg in "$@"; do 76 | sudo apt-get install -y $pkg 77 | done 78 | fi 79 | } 80 | 81 | function fixapt_xenial { 82 | aptsrc=/etc/apt/sources.list 83 | sudo perl -p -i -e 's/^/#/' $aptsrc 84 | sudo sh -c 'echo " 85 | # Ubuntu xenial: 86 | deb [arch=amd64,i386] http://us-west-1.ec2.archive.ubuntu.com/ubuntu/ xenial main restricted universe multiverse 87 | deb-src http://us-west-1.ec2.archive.ubuntu.com/ubuntu/ xenial main restricted universe multiverse 88 | deb [arch=amd64,i386] http://us-west-1.ec2.archive.ubuntu.com/ubuntu/ xenial-updates main restricted universe multiverse 89 | deb-src http://us-west-1.ec2.archive.ubuntu.com/ubuntu/ xenial-updates main restricted universe multiverse 90 | deb [arch=amd64,i386] http://security.ubuntu.com/ubuntu xenial-security main restricted universe multiverse 91 | deb-src http://security.ubuntu.com/ubuntu xenial-security main restricted universe multiverse 92 | " >> '$aptsrc 93 | } 94 | 95 | ### timestamp 96 | echo $0 Begin. 97 | start=$(date) 98 | echo $start 99 | 100 | ### fetch kernel source 101 | if [[ "$kernel" == http* ]]; then 102 | echo Fetching source: $url... 103 | run wget $1 104 | file=${1##*/} 105 | else 106 | file=$kernel 107 | fi 108 | if [[ ! -e $file ]]; then 109 | echo >&2 "ERROR: kernel source file ($file) missing?" 
110 | ls $file 111 | exit 2 112 | fi 113 | filepath=$PWD/$file 114 | 115 | ### check for patches 116 | if [[ "$2" != "" ]]; then 117 | patchdir=$2 118 | [[ "$patchdir" != /* ]] && patchdir=$PWD/$patchdir 119 | if [ ! -d $patchdir ]; then 120 | echo >&2 "Patch directory $patchdir doesn't exist" 121 | exit 2 122 | fi 123 | fi 124 | 125 | ### fix apt if needed 126 | echo Setting apt sources... 127 | fixapt_xenial 128 | 129 | ### add packages (both necessary and convenient) 130 | echo Adding packages... 131 | addpkgs gcc make ncurses-dev libssl-dev bc 132 | echo Adding packages for perf... 133 | addpkgs flex bison libelf-dev libdw-dev libaudit-dev 134 | echo Adding packages for perf TUI... 135 | addpkgs libnewt-dev libslang2-dev 136 | echo Adding packages for convenience... 137 | addpkgs sharutils sysstat bc 138 | 139 | ### remove some packages to un-complicate kernel builds 140 | echo Removing ZFS and SPL kernel components 141 | sudo dpkg -r zfs-dkms ubuntu-zfs spl-dkms 142 | 143 | ### expand source 144 | echo Prepping build environment: $src... 145 | sudo mkdir -p $src 146 | run sudo chown $USER $src 147 | cd $src 148 | run tar xf $filepath 149 | 150 | echo Build prep... 
# The tarball was extracted into $src (the current directory), so the source
# tree is named after the tarball's file name alone. Drop any leading
# directories first: $file may be a relative path such as
# ../kernels/linux-3.13.tar.gz, and keeping "../kernels/" would send the cd
# below outside $src.
dir=${file##*/}
dir=${dir%.tar.*}
echo directory: $src/$dir
# Stop here if the tree is missing rather than patching and building in $src.
run cd "$dir"

### apply patches
if [[ "$patchdir" != "" ]]; then
	echo APPLYING PATCHES from $patchdir
	for patch in "$patchdir"/*; do
		# an empty directory leaves the glob unexpanded; skip non-files
		[[ -f $patch ]] || continue
		echo Applying patch: $patch
		run patch -p1 < "$patch"
	done
fi

### kernel config
run make olddefconfig
#
# for manual setup, run "make menuconfig" and EC2 customizations are:
# Processor type and features -> Linux guest support -> Enable paravirtualization code (PARAVIRT)
# Paravirtualization layer for spinlocks (PARAVIRT_SPINLOCKS)
# Xen guest support, Support for running as a PVH guest (XEN_PVH)
# Paravirtual steal time accounting (PARAVIRT_TIME_ACCOUNTING)
# Device Drivers -> Block devices -> Xen virtual block device support (XEN_BLKDEV_FRONTEND)
# Device Drivers -> Network device support devices -> Xen virtual network frontend driver (XEN_NETDEV_FRONTEND)
# Device Drivers -> Generic driver options -> Maintain a devtmpfs filesystem ..., and Automount (DEVTMPFS, DEVTMPFS_MOUNT)
# Kernel Hacking -> Compile-time checks and compiler options -> Configure kernel debug info (DEBUG_INFO)
# General setup -> Configure standard kernel features (expert users) -> (BPF_SYSCALL)
# On Linux 3.2, just copy /boot/config...
#
echo Running scripts/config ...
180 | run ./scripts/config -e CONFIG_PARAVIRT \ 181 | -e CONFIG_PARAVIRT_SPINLOCKS \ 182 | -e CONFIG_PARAVIRT_TIME_ACCOUNTING \ 183 | -e CONFIG_PARAVIRT_CLOCK \ 184 | -e CONFIG_HYPERVISOR_GUEST \ 185 | -e CONFIG_XEN \ 186 | -e CONFIG_XEN_PVHVM \ 187 | -e CONFIG_XEN_PVH \ 188 | -e CONFIG_XEN_BLKDEV_FRONTEND \ 189 | -e CONFIG_XEN_NETDEV_FRONTEND \ 190 | -e CONFIG_DEVTMPFS \ 191 | -e CONFIG_DEVTMPFS_MOUNT \ 192 | -e CONFIG_BPF_EVENTS \ 193 | -e CONFIG_BPF_SYSCALL \ 194 | -e CONFIG_HIST_TRIGGERS \ 195 | -d CONFIG_SOUND 196 | # 197 | # The following is necessary for update-grub-legacy-ec2 to recognize 198 | # this as a valid kernel and add it to menu.lst; the call path is 199 | # arch/x86/boot/install.sh -> /sbin/installkernel -> 200 | # run-parts /etc/kernel/postinst.d -> .../x-grub-legacy-ec2 -> 201 | # /usr/sbin/update-grub-legacy-ec2 (the latter two are added by 202 | # the grub-legacy-ec2 package). 203 | # 204 | run ./scripts/config --set-str CONFIG_LOCALVERSION "-virtual" 205 | 206 | if (( opt_debuginfo )); then 207 | run ./scripts/config -e CONFIG_DEBUG_INFO 208 | else 209 | run ./scripts/config -d CONFIG_DEBUG_INFO 210 | fi 211 | 212 | ### kernel build 213 | echo Kernel build... 214 | cpus=$(grep -c '^processor.:' /proc/cpuinfo) 215 | (time make -j $cpus) > build.stdout 2> build.stderr 216 | cat build.stderr 217 | 218 | ### extra builds 219 | echo Extra builds... 220 | cd tools/perf 221 | make >> build.stdout 2>> build.stderr 222 | cd ../.. 223 | 224 | ### kernel install 225 | echo Install... 226 | sudo make modules_install > install.stdout 2> install.stderr 227 | sudo make install >> install.stdout 2>> install.stderr # calls update-grub 228 | cat install.stderr 229 | release=$(make kernelrelease) 230 | 231 | ### boot config 232 | echo Grub1... # rewrite the following when grub2 is in use 233 | # check boot files, and add a fallback entry to grub 234 | menu=/boot/grub/menu.lst 235 | tmp=/tmp/menu.lst.$$ 236 | if [ ! 
-e /boot/vmlinuz-$release ]; then 237 | echo >&2 "ERROR: Can't find boot files for $release." \ 238 | "Build or install failed? Exiting without updating grub." 239 | exit 3 240 | fi 241 | awk '{ out = 1 } 242 | /^default/ { print $0; print "fallback\t2"; out = 0 } 243 | /^fallback/ { out = 0 } 244 | out == 1 { print } 245 | ' $menu > $tmp 246 | if ! egrep -v '^(#|$)' $tmp >/dev/null; then 247 | echo >&2 "ERROR: generated grub file failure. Exiting without" \ 248 | "updating grub." 249 | exit 5 250 | fi 251 | sudo bash -c "cat $tmp > $menu" 252 | 253 | ### really fix grub (update-grub doesn't update properly on xenial) 254 | sudo update-grub-legacy-ec2 255 | sudo update-grub 256 | sudo sync 257 | 258 | ### done 259 | echo Done. Built and installed $release. 260 | echo $start Started. 261 | date 262 | echo $0 Done. 263 | echo Rebooting... 264 | sudo reboot 265 | -------------------------------------------------------------------------------- /microbenchmarks/gettimeofdaybench.c: -------------------------------------------------------------------------------- 1 | /* 2 | * gettimeofdaybench 3 | * 4 | * USAGE: time gettimeofdaybench 5 | * 6 | * Compile with -O1. 7 | * 8 | * 30-Aug-2014 Brendan Gregg Created this. 9 | */ 10 | #include 11 | 12 | int 13 | main(int argc, char *argv[]) 14 | { 15 | int i, ret; 16 | struct timeval tv; 17 | 18 | for (i = 0; i < 100 * 1000 * 1000; i++) { 19 | ret = gettimeofday(&tv, 0); 20 | } 21 | 22 | return (0); 23 | } 24 | -------------------------------------------------------------------------------- /microbenchmarks/microbench_ubuntu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # microbench_ubuntu.sh Initialize and run instance micro-benchmarks. 4 | # 5 | # USAGE: ./microbench_ubuntu.sh # creates an out.microbench.$$ report 6 | # 7 | # Tests have been configured to usually run for between 10 and 30 seconds. 8 | # 9 | # 31-Mar-2014 Brendan Gregg Created this. 
# 23-Oct-2017	"	"	Added several more micro-benchmarks.

DATADIR=/mnt/microbench
LOGFILE=$PWD/out.microbench.$$

### run: name command [arguments ...]
# Prints a banner for the named benchmark, runs the command under
# "sudo time" with stderr folded into stdout, then prints the command's exit
# status. Everything is shown on the terminal and appended to $LOGFILE.
function run {
	( echo ----------------------------------------
	echo BENCHMARK: $1
	echo ---------------------------------------- ) | tee -a $LOGFILE
	shift
	( echo RUN: "$@"
	echo
	sudo time "$@" 2>&1
	# save the status now: the echo below would overwrite $? with 0
	status=$?
	echo
	echo EXIT STATUS: $status ) | tee -a $LOGFILE
}

### die: message ...
# Prints the message on stderr and exits with status 1.
function die {
	echo >&2 "$@"
	exit 1
}

### addpkgs: package ...
# Installs the listed packages with apt-get unless dpkg reports every one of
# them as already installed.
function addpkgs {
	all=1
	for pkg in "$@"; do
		# only the exit status matters; hide dpkg's "not installed" noise
		if ! dpkg -s $pkg > /dev/null 2>&1; then all=0; fi
	done
	if (( all )); then
		echo "All packages already installed."
	else
		sudo apt-get update
		for pkg in "$@"; do
			sudo apt-get install -y $pkg
		done
	fi
}

### start with an empty log
# This must come before the first "tee -a": truncating later would discard
# the "no /mnt device" note written while detecting the data device.
> $LOGFILE

### determine instance parameters
memory=$(awk '$1 == "MemTotal:" { printf "%d\n", $2 / 1024 }' /proc/meminfo)
# match any whitespace after "processor" so the count does not depend on the
# exact separator character used in /proc/cpuinfo
ncpu=$(grep -c '^processor[[:space:]]' /proc/cpuinfo)
mntdev=$(awk '$2 == "/mnt" { print $1; exit }' /etc/mtab)
if [[ "$mntdev" == "" ]]; then
	echo "no /mnt device; defaulting to /." | tee -a $LOGFILE
	mntdev=$(awk '$2 == "/" { print $1; exit }' /etc/mtab)
fi

### print and log hardware
echo Logfile: $LOGFILE
( echo Main Memory: $memory Mbytes
echo CPUs: $ncpu
echo CPU:
# first CPU only: stop at the first blank line
awk '{ print } NF == 0 { exit }' /proc/cpuinfo
echo NUMASTAT:
numastat
echo DF:
df -h
echo FSTAB:
cat /etc/fstab
echo MTAB:
cat /etc/mtab
echo /mnt DEV: $mntdev
) | tee -a $LOGFILE
sleep 0.5
echo

### log extra details
( echo DATE: $(date)
echo UNAME: $(uname -a)
echo ENV:
env ) | tee -a $LOGFILE

### log Netflix details if available
customenv=/etc/profile.d/netflix_environment.sh
[ -e $customenv ] && cat $customenv | tee -a $LOGFILE

### add software
echo Adding packages...
addpkgs numactl lmbench sysbench fio hdparm iperf sharutils openssl

sudo mkdir -p $DATADIR
[[ "$USER" == "" ]] && die "ERROR: Username not found (\$USER?)"
sudo chown $USER $DATADIR
echo cd $DATADIR
# the "sudo rm" cleanup below must never run in some other directory
cd $DATADIR || die "ERROR: cannot cd to $DATADIR"
[ -e fio.data ] && sudo rm fio.data
[ -e randread.1.0 ] && sudo rm randread.*

### benchmark info
(
echo clocksource: $(cat /sys/devices/system/clocksource/clocksource0/current_clocksource)
echo governor: $(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
echo sysbench: "$(sysbench --version)"
echo perl: "$(perl --version)"
echo openssl: "$(openssl version)"
) | tee -a $LOGFILE

### run benchmarks
echo Running benchmarks...
# some are repeated to check for variance

### repeat: count command [arguments ...]
# Runs the command "count" times in a row, passing the arguments unchanged.
function repeat {
	local count=$1 i
	shift
	for (( i = 0; i < count; i++ )); do
		"$@"
	done
}

# argument groups shared by several benchmarks
lmbench=/usr/lib/lmbench/bin/x86_64-linux-gnu
sb_head=(sysbench --max-requests=10000000 --max-time=10)
sb_tail=(--test=cpu --cpu-max-prime=10000 run)
dd_null=(dd if=/dev/zero of=/dev/null bs=1 count=100000000)
bind0=(numactl --membind=0 --physcpubind=0)
disk=(--filename=$mntdev --direct=1 --ioengine=libaio)

# clock speed:
repeat 10 run C1 $lmbench/mhz

# CPU single core performance:
repeat 5 run C2 "${sb_head[@]}" --num-threads=1 "${sb_tail[@]}"

# CPU single core performance, CPU bound:
run C3 numactl --physcpubind=0 "${sb_head[@]}" --num-threads=1 "${sb_tail[@]}"

# CPU total capacity:
run C4 "${sb_head[@]}" --num-threads=$ncpu "${sb_tail[@]}"

# CPU performance, different workload (more to sanity check earlier results):
run C5 openssl speed rsa4096 -multi $ncpu 2>&1 | egrep -v '^(Forked|\+)'

# system call performance:
repeat 3 run S1 "${dd_null[@]}"

# system call performance, CPU and memory node bound:
run S2 "${bind0[@]}" "${dd_null[@]}"

# TSC performance:
repeat 3 run S3 perl -e 'use Time::HiRes; for (;$i++ < 100_000_000;) { Time::HiRes::gettimeofday(); }'

# memory access latency across ranges, exposing CPU cache and memory subsystem hierarchy:
repeat 3 run M1 $lmbench/lat_mem_rd 256m 128

# memory access latency, CPU and memory node bound:
run M2 "${bind0[@]}" $lmbench/lat_mem_rd 256m 128

# memory access latency for a larger range (1/8th; larger is too slow with lat_mem_rd):
run M3 $lmbench/lat_mem_rd $(( memory * 1 / 8 ))m 128

# memory bandwidth:
repeat 3 run M4 $lmbench/bw_mem 2500m cp

# memory bandwidth, CPU and memory node bound:
run M5 "${bind0[@]}" $lmbench/bw_mem 2500m cp

# file system writes, ending with an fsync to flush:
run F1 fio --name=seqwrite --rw=write --filename=fio.data --bs=128k --size=4g --end_fsync=1 --loops=4

# file system random reads, cached:
run F2 fio --name=randread --rw=randread --pre_read=1 --norandommap --bs=4k --size=256m --runtime=30 --loops=1000000

# file system multi-threaded random reads, cached:
run F3 fio --numjobs=$ncpu --name=randread --rw=randread --pre_read=1 --norandommap --bs=4k --size=$((256 / ncpu))m --runtime=30 --loops=1000000

# file system multi-threaded random reads, partial cache:
run F4 bash -c 'echo 3 > /proc/sys/vm/drop_caches; fio --numjobs='$ncpu' --name=partial --rw=randread --filename=fio.data --norandommap --random_distribution=pareto:0.9 --bs=4k --size=4g --runtime=60 --loops=1000'

# disk read, cached (first 512 byte sector only):
run D1 fio --name=iops --rw=read --bs=512 --size=512 --io_size=1g "${disk[@]}" --runtime=15

# disk random reads:
run D2 fio --name=iops --rw=randread --norandommap --bs=512 --size=4g "${disk[@]}" --runtime=15

# disk random reads, with a queue depth:
run D3 fio --name=iops --rw=randread --norandommap --bs=512 --size=4g "${disk[@]}" --iodepth=32 --runtime=15

# disk large sequential reads (1 Mbyte):
run D4 fio --name=iops --rw=read --bs=1m --size=4g "${disk[@]}" --runtime=15

# disk large sequential reads (1 Mbyte), with a queue depth:
run D5 fio --name=iops --rw=read --bs=1m --size=4g "${disk[@]}" --iodepth=4 --runtime=15

# network, loopback throughput:
run N1 bash -c 'iperf -s & sleep 1; iperf -c 127.0.0.1 -i 1 -t 15; pkill iperf'

# other network tests needs a remote host...

# closing summary; only the date line is also written to the log
echo Done.
echo DATE: $(date) | tee -a $LOGFILE
echo
echo "Now run network benchmarks manually and analyze (active benchmarking)"
echo NOTE: benchmark files are left in $DATADIR.
209 | -------------------------------------------------------------------------------- /osx/kernel_diagreport2text.ksh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | # 3 | # kernel_diagreport2text.ksh 4 | # 5 | # Prints the stack trace from an OS X kernel panic diagnostic report, along 6 | # with as much symbol translation as your mach_kernel version provides. 7 | # By default, this is some, but with the Kernel Debug Kit, it should be a lot 8 | # more. This is not an official Apple tool. 9 | # 10 | # USAGE: 11 | # ./kernel_diagreport2text.ksh [-f kernel_file] Kernel_report.panic [...] 12 | # 13 | # Note: The Kernel Debug Kit currently requires an Apple ID to download. It 14 | # would be great if this was not necessary. 15 | # 16 | # This script calls atos(1) for symbol translation, and also some sed/awk 17 | # to decorate remaining untranslated symbols with kernel extension names, 18 | # if the ranges match. 19 | # 20 | # This uses your current kernel, /mach_kernel, to translate symbols. If you run 21 | # this on kernel diag reports from a different kernel version, it will print 22 | # a "kernel version mismatch" warning, as the translation may be incorrect. Find 23 | # a matching mach_kernel file and use the -f option to point to it. 24 | # 25 | # Copyright 2014 Brendan Gregg. All rights reserved. 26 | # 27 | # CDDL HEADER START 28 | # 29 | # The contents of this file are subject to the terms of the 30 | # Common Development and Distribution License (the "License"). 31 | # You may not use this file except in compliance with the License. 32 | # 33 | # You can obtain a copy of the license at docs/cddl1.txt or 34 | # http://opensource.org/licenses/CDDL-1.0. 35 | # See the License for the specific language governing permissions 36 | # and limitations under the License. 37 | # 38 | # When distributing Covered Code, include this CDDL HEADER in each 39 | # file and include the License file at docs/cddl1.txt. 
40 | # If applicable, add the following below this CDDL HEADER, with the 41 | # fields enclosed by brackets "[]" replaced with your own identifying 42 | # information: Portions Copyright [yyyy] [name of copyright owner] 43 | # 44 | # CDDL HEADER END 45 | 46 | kernel=/mach_kernel 47 | 48 | function usage { 49 | print "USAGE: $0 [-f kernel_file] Kernel_diag_report.panic [...]" 50 | print " eg, $0 /Library/Logs/DiagnosticReports/Kernel_2014-05-26-124827_bgregg.panic" 51 | exit 52 | } 53 | (( $# == 0 )) && usage 54 | [[ $1 == "-h" || $1 == "--help" ]] && usage 55 | 56 | if [[ $1 == "-f" ]]; then 57 | kernel=$2 58 | if [[ ! -e $kernel ]]; then 59 | print -u2 "ERROR: Kernel $kernel not found. Quitting." 60 | exit 2 61 | fi 62 | shift 2 63 | fi 64 | 65 | if [[ ! -x /usr/bin/atos ]]; then 66 | print -u2 "ERROR: Couldn't find, and need, /usr/bin/atos. Is this part of Xcode? Quitting..." 67 | exit 2 68 | fi 69 | 70 | while (( $# != 0 )); do 71 | if [[ "$file" != "" ]]; then print; fi 72 | file=$1 73 | shift 74 | echo "File $file" 75 | 76 | if [[ ! -e $file ]]; then 77 | print "ERROR: File $file not found. Skipping." 78 | continue 79 | fi 80 | 81 | # Find slide address 82 | slide=$(awk '/^Kernel slide:.*0x/ { print $3 }' $file) 83 | if [[ "$slide" == "" ]]; then 84 | print -n "ERROR: Missing \"Kernel slide:\" line, so can't process $file. " 85 | print "This is needed for atos -s. Is this really a Kernel diag panic file?" 
86 | continue 87 | fi 88 | 89 | # Print panic line 90 | grep '^panic' $file 91 | 92 | # Check kernel version match (uname -v string) 93 | kernel_ver=$(strings -a $kernel | grep 'Darwin Kernel Version') 94 | panic_ver=$(grep 'Darwin Kernel Version' $file) 95 | warn="" 96 | if [[ "$kernel_ver" != "$panic_ver" ]]; then 97 | print "WARNING: kernel version mismatch (use -f):" 98 | printf "%14s: %s\n" "$kernel" "$kernel_ver" 99 | printf "%14s: %s\n" "panic file" "$panic_ver" 100 | warn=" (may be incorrect due to mismatch)" 101 | fi 102 | 103 | # Find kernel extension ranges 104 | i=0 105 | unset name start end 106 | awk 'ext == 1 && /0x.*->.*0x/ { 107 | gsub(/\[.*\]/, ""); gsub(/@/, " "); gsub(/->/, " ") 108 | print $0 109 | } 110 | /Kernel Extensions in backtrace/ { ext = 1 } 111 | /^$/ { ext = 0 } 112 | ' < $file | while read n s e; do 113 | # the awk gsub's convert this line: 114 | # com.apple.driver.AppleUSBHub(666.4)[CD9B71FF-2FDD-3BC4-9C39-5E066F66D158]@0xffffff7f84ed2000->0xffffff7f84ee9fff 115 | # into this: 116 | # com.apple.driver.AppleUSBHub(666.4) 0xffffff7f84ed2000 0xffffff7f84ee9fff 117 | # which can then be read as three fields 118 | name[i]=$n 119 | start[i]=$s 120 | end[i]=$e 121 | (( i++ )) 122 | done 123 | 124 | # Print and translate stack 125 | print "Stack$warn:" 126 | awk 'backtrace == 1 && /^[^ ]/ { print $3 } 127 | /Backtrace.*Return Address/ { backtrace = 1 } 128 | /^$/ { backtrace = 0 } 129 | ' < $file | atos -d -o $kernel -s $slide | while read line; do 130 | # do extensions 131 | if [[ $line =~ 0x* ]]; then 132 | i=0 133 | while (( i <= ${#name[@]} )); do 134 | [[ "${start[i]}" == "" ]] && break 135 | # assuming fixed width addresses, use string comparison: 136 | if [[ $line > ${start[$i]} && $line < ${end[$i]} ]]; then 137 | line="$line (in ${name[$i]})" 138 | break 139 | fi 140 | (( i++ )) 141 | done 142 | fi 143 | print " $line" 144 | done 145 | 146 | # Print other key details 147 | awk '/^BSD process name/ { gsub(/ corresponding to current 
thread/, ""); print $0 } 148 | ver == 1 { print "Mac OS version:", $0; ver = 0 } 149 | /^Mac OS version/ { ver = 1 } 150 | ' < $file 151 | done 152 | -------------------------------------------------------------------------------- /perf_events/perf.md: -------------------------------------------------------------------------------- 1 | # Linux perf 2 | 3 | _This is a summary of Linux perf created for a USENIX LISA 2016 workshop by Brendan Gregg and Sasha Goldshtein._ 4 | 5 | The "perf" command is the official profiler and tracer for Linux. Its source is included in the Linux tree (under tools/perf). 6 | 7 | To use perf, first check that it is installed by trying to run "perf": 8 | 9 | ``` 10 | # perf 11 | 12 | usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] 13 | 14 | The most commonly used perf commands are: 15 | annotate Read perf.data (created by perf record) and display annotated code 16 | archive Create archive with object files with build-ids found in perf.data 17 | bench General framework for benchmark suites 18 | [...] 19 | ``` 20 | 21 | It should print a usage message like that above (truncated). On Ubuntu systems, if perf is not installed it will suggest the packages to install. Something like: 22 | 23 | ``` 24 | # apt-get install linux-tools-common linux-tools-`uname -r` 25 | ``` 26 | 27 | ### Operation 28 | 29 | Perf has four basic modes of operation: 30 | 31 | - **counting**: counting events in kernel context and printing a report (low overhead). Eg, "perf stat". 32 | - **capture**: recording events and writing to a perf.data file. Eg, "perf record". 33 | - **reporting**: reading a perf.data file and dumping or summarizing it. Eg, "perf report". 34 | - **live recording**: recording and summarizing events live. Eg, "perf top". 35 | 36 | Whenever the perf.data file is in use, there is overhead to write this file, which is relative to the traced event rate. perf uses ring buffers and dynamic wakeups to lower this overhead. 
37 | 38 | ### One-Liners 39 | 40 | Common perf one-liners: 41 | 42 | ``` 43 | # Listing all currently known events: 44 | perf list 45 | 46 | # CPU counter statistics for the entire system, for 5 seconds: 47 | perf stat -a sleep 5 48 | 49 | # Count ext4 events for the entire system, for 10 seconds: 50 | perf stat -e 'ext4:*' -a sleep 10 51 | 52 | # Show system calls by process, refreshing every 2 seconds: 53 | perf top -e raw_syscalls:sys_enter -ns comm 54 | 55 | # Sample CPU stack traces for the specified PID, at 99 Hertz, for 10 seconds: 56 | perf record -F 99 -p PID -g -- sleep 10 57 | 58 | # Sample CPU stack traces for the entire system, at 99 Hertz, for 10 seconds: 59 | perf record -F 99 -ag -- sleep 10 60 | 61 | # Sample CPUs at 49 Hertz, and show top addresses and symbols, live (no perf.data): 62 | perf top -F 49 63 | 64 | # Show perf.data in an ncurses browser (TUI) if possible: 65 | perf report 66 | 67 | # Show perf.data file as a text report with a sample count column: 68 | perf report -n --stdio 69 | 70 | # List all raw events from perf.data: 71 | perf script 72 | ``` 73 | 74 | More one-liners in the Tracing section, and even more are listed on http://www.brendangregg.com/perf.html . 75 | 76 | ## CPU Flame Graphs 77 | 78 | Flame graphs are generated in three steps: 79 | 80 | 1. Capture stacks 81 | 2. Fold stacks 82 | 3. flamegraph.pl 83 | 84 | Using Linux perf, the following samples stack traces at 99 Hertz for 30 seconds, and then generates a flame graph of all sampled stacks (except those containing "cpu_idle": the idle threads): 85 | 86 | ``` 87 | # git clone --depth 1 https://github.com/brendangregg/FlameGraph 88 | # cd FlameGraph 89 | # perf record -F 99 -a -g -- sleep 30 90 | # perf script | ./stackcollapse-perf.pl | grep -v cpu_idle | ./flamegraph.pl > out.svg 91 | ``` 92 | 93 | The "out.svg" file can then be loaded in a web browser. 94 | 95 | ### Broken Stacks 96 | 97 | Broken/incomplete stack traces are a common problem with profilers. 
perf has multiple ways to walk (fetch) a stack. The easiest to get working is usually frame-pointer based walking. Enabling this for different languages: 98 | 99 | - C: gcc's -fno-omit-frame-pointer option 100 | - Java: -XX:+PreserveFramePointer 101 | 102 | ### Missing Symbols 103 | 104 | Missing symbols are a common problem when profiling JIT runtimes. perf supports supplemental symbol tables in /tmp/perf-PID.map. Enabling this map for different languages: 105 | 106 | - Java: https://github.com/jrudolph/perf-map-agent 107 | - Node.js: --perf\_basic\_prof\_only\_functions 108 | 109 | ### Customizations 110 | 111 | See flamegraph.pl --help. A common customization is to use an alternate palette scheme: eg, "--color java" for Java profiles. 112 | 113 | ## Tracing 114 | 115 | Example static tracing one-liners: 116 | 117 | ``` 118 | # Trace new processes, until Ctrl-C: 119 | perf record -e sched:sched_process_exec -a 120 | 121 | # Trace all context-switches with stack traces, for 1 second: 122 | perf record -e context-switches -ag -- sleep 1 123 | 124 | # Trace CPU migrations, for 10 seconds: 125 | perf record -e migrations -a -- sleep 10 126 | 127 | # Trace all connect()s with stack traces (outbound connections), until Ctrl-C: 128 | perf record -e syscalls:sys_enter_connect -ag 129 | 130 | # Trace all block device (disk I/O) requests with stack traces, until Ctrl-C: 131 | perf record -e block:block_rq_insert -ag 132 | 133 | # Trace all block device issues and completions (has timestamps), until Ctrl-C: 134 | perf record -e block:block_rq_issue -e block:block_rq_complete -a 135 | 136 | # Trace all block completions, of size at least 100 Kbytes, until Ctrl-C: 137 | perf record -e block:block_rq_complete --filter 'nr_sector > 200' 138 | 139 | # Trace all block completions, synchronous writes only, until Ctrl-C: 140 | perf record -e block:block_rq_complete --filter 'rwbs == "WS"' 141 | 142 | # Trace all block completions, all types of writes, until Ctrl-C: 143 | perf record 
-e block:block_rq_complete --filter 'rwbs ~ "*W*"' 144 | 145 | # Trace all ext4 calls, and write to a non-ext4 location, until Ctrl-C: 146 | perf record -e 'ext4:*' -o /tmp/perf.data -a 147 | ``` 148 | 149 | Example dynamic tracing one-liners: 150 | 151 | ``` 152 | # Add tracepoint for the kernel tcp_sendmsg() function entry ("--add" optional): 153 | perf probe --add tcp_sendmsg 154 | 155 | # Remove the tcp_sendmsg() tracepoint (or use "--del"): 156 | perf probe -d tcp_sendmsg 157 | 158 | # Add a tracepoint for the kernel tcp_sendmsg() function return: 159 | perf probe 'tcp_sendmsg%return' 160 | 161 | # Add tracepoint for tcp_sendmsg() with size and socket state (needs debuginfo): 162 | perf probe 'tcp_sendmsg size sk->__sk_common.skc_state' 163 | 164 | # Add a tracepoint for the user-level malloc() function from libc: 165 | perf probe -x /lib64/libc.so.6 malloc 166 | 167 | # List currently available dynamic probes: 168 | perf probe -l 169 | ``` 170 | 171 | # References 172 | 173 | - https://perf.wiki.kernel.org/index.php/Main_Page 174 | - http://www.brendangregg.com/perf.html 175 | - http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html 176 | - http://queue.acm.org/detail.cfm?id=2927301 177 | 178 | -------------------------------------------------------------------------------- /perf_events/perf1.md: -------------------------------------------------------------------------------- 1 | # Linux perf 2 | 3 | _This is a summary of Linux perf created for a USENIX LISA 2016 workshop by Brendan Gregg and Sasha Goldshtein._ 4 | 5 | The "perf" command is the official profiler and tracer for Linux. Its source is included in the Linux tree (under tools/perf). 
6 | 7 | To use perf, first check that it is installed by trying to run "perf": 8 | 9 | ``` 10 | # perf 11 | 12 | usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] 13 | 14 | The most commonly used perf commands are: 15 | annotate Read perf.data (created by perf record) and display annotated code 16 | archive Create archive with object files with build-ids found in perf.data 17 | bench General framework for benchmark suites 18 | [...] 19 | ``` 20 | 21 | It should print a usage message like that above (truncated). On Ubuntu systems, if perf is not installed it will suggest the packages to install. Something like: 22 | 23 | ``` 24 | # apt-get install linux-tools-common linux-tools-`uname -r` 25 | ``` 26 | 27 | ### Operation 28 | 29 | Perf has four basic modes of operation: 30 | 31 | - **counting**: counting events in kernel context and printing a report (low overhead). Eg, "perf stat". 32 | - **capture**: recording events and writing to a perf.data file. Eg, "perf record". 33 | - **reporting**: reading a perf.data file and dumping or summarizing it. Eg, "perf report". 34 | - **live recording**: recording and summarizing events live. Eg, "perf top". 35 | 36 | Whenever the perf.data file is in use, there is overhead to write this file, which is relative to the traced event rate. perf uses ring buffers and dynamic wakeups to lower this overhead. 
37 | 38 | ### One-Liners 39 | 40 | Common perf one-liners: 41 | 42 | ``` 43 | # Listing all currently known events: 44 | perf list 45 | 46 | # CPU counter statistics for the entire system, for 5 seconds: 47 | perf stat -a sleep 5 48 | 49 | # Count ext4 events for the entire system, for 10 seconds: 50 | perf stat -e 'ext4:*' -a sleep 10 51 | 52 | # Show system calls by process, refreshing every 2 seconds: 53 | perf top -e raw_syscalls:sys_enter -ns comm 54 | 55 | # Sample CPU stack traces for the specified PID, at 99 Hertz, for 10 seconds: 56 | perf record -F 99 -p PID -g -- sleep 10 57 | 58 | # Sample CPU stack traces for the entire system, at 99 Hertz, for 10 seconds: 59 | perf record -F 99 -ag -- sleep 10 60 | 61 | # Sample CPUs at 49 Hertz, and show top addresses and symbols, live (no perf.data): 62 | perf top -F 49 63 | 64 | # Show perf.data in an ncurses browser (TUI) if possible: 65 | perf report 66 | 67 | # Show perf.data file as a text report with a sample count column: 68 | perf report -n --stdio 69 | 70 | # List all raw events from perf.data: 71 | perf script 72 | ``` 73 | 74 | More one-liners in the Tracing section, and even more are listed on http://www.brendangregg.com/perf.html . 75 | 76 | ## CPU Flame Graphs 77 | 78 | Flame graphs are generated in three steps: 79 | 80 | 1. Capture stacks 81 | 2. Fold stacks 82 | 3. flamegraph.pl 83 | 84 | Using Linux perf, the following samples stack traces at 99 Hertz for 30 seconds, and then generates a flame graph of all sampled stacks (except those containing "cpu_idle": the idle threads): 85 | 86 | ``` 87 | # git clone --depth 1 https://github.com/brendangregg/FlameGraph 88 | # cd FlameGraph 89 | # perf record -F 99 -a -g -- sleep 30 90 | # perf script | ./stackcollapse-perf.pl | grep -v cpu_idle | ./flamegraph.pl > out.svg 91 | ``` 92 | 93 | The "out.svg" file can then be loaded in a web browser. 94 | 95 | ### Broken Stacks 96 | 97 | Broken/incomplete stack traces are a common problem with profilers. 
perf has multiple ways to walk (fetch) a stack. The easiest to get working is usually frame-pointer based walking. Enabling this for different languages: 98 | 99 | - C: gcc's -fno-omit-frame-pointer option 100 | - Java: -XX:+PreserveFramePointer 101 | 102 | ### Missing Symbols 103 | 104 | Missing symbols are a common problem when profiling JIT runtimes. perf supports supplemental symbol tables in /tmp/perf-PID.map. Enabling this map for different languages: 105 | 106 | - Java: https://github.com/jrudolph/perf-map-agent 107 | - Node.js: --perf\_basic\_prof\_only\_functions 108 | 109 | ### Customizations 110 | 111 | See flamegraph.pl --help. A common customization is to use an alternate palette scheme: eg, "--color java" for Java profiles. 112 | 113 | ## Tracing 114 | 115 | Example static tracing one-liners: 116 | 117 | ``` 118 | # Trace new processes, until Ctrl-C: 119 | perf record -e sched:sched_process_exec -a 120 | 121 | # Trace all context-switches with stack traces, for 1 second: 122 | perf record -e context-switches -ag -- sleep 1 123 | 124 | # Trace CPU migrations, for 10 seconds: 125 | perf record -e migrations -a -- sleep 10 126 | 127 | # Trace all connect()s with stack traces (outbound connections), until Ctrl-C: 128 | perf record -e syscalls:sys_enter_connect -ag 129 | 130 | # Trace all block device (disk I/O) requests with stack traces, until Ctrl-C: 131 | perf record -e block:block_rq_insert -ag 132 | 133 | # Trace all block device issues and completions (has timestamps), until Ctrl-C: 134 | perf record -e block:block_rq_issue -e block:block_rq_complete -a 135 | 136 | # Trace all block completions, of size at least 100 Kbytes, until Ctrl-C: 137 | perf record -e block:block_rq_complete --filter 'nr_sector > 200' 138 | 139 | # Trace all block completions, synchronous writes only, until Ctrl-C: 140 | perf record -e block:block_rq_complete --filter 'rwbs == "WS"' 141 | 142 | # Trace all block completions, all types of writes, until Ctrl-C: 143 | perf record 
-e block:block_rq_complete --filter 'rwbs ~ "*W*"' 144 | 145 | # Trace all ext4 calls, and write to a non-ext4 location, until Ctrl-C: 146 | perf record -e 'ext4:*' -o /tmp/perf.data -a 147 | ``` 148 | 149 | Example dynamic tracing one-liners: 150 | 151 | ``` 152 | # Add tracepoint for the kernel tcp_sendmsg() function entry ("--add" optional): 153 | perf probe --add tcp_sendmsg 154 | 155 | # Remove the tcp_sendmsg() tracepoint (or use "--del"): 156 | perf probe -d tcp_sendmsg 157 | 158 | # Add a tracepoint for the kernel tcp_sendmsg() function return: 159 | perf probe 'tcp_sendmsg%return' 160 | 161 | # Add tracepoint for tcp_sendmsg() with size and socket state (needs debuginfo): 162 | perf probe 'tcp_sendmsg size sk->__sk_common.skc_state' 163 | 164 | # Add a tracepoint for the user-level malloc() function from libc: 165 | perf probe -x /lib64/libc.so.6 malloc 166 | 167 | # List currently available dynamic probes: 168 | perf probe -l 169 | ``` 170 | 171 | # References 172 | 173 | - https://perf.wiki.kernel.org/index.php/Main_Page 174 | - http://www.brendangregg.com/perf.html 175 | - http://www.brendangregg.com/FlameGraphs/cpuflamegraphs.html 176 | - http://queue.acm.org/detail.cfm?id=2927301 177 | 178 | -------------------------------------------------------------------------------- /perf_events/perfmaptidy.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # perfmaptidy.pl - tidy up a Linux perf_events /tmp/perf-%d.map file. 4 | # 5 | # Linux perf_events can mis-translate symbols when the map file has grown 6 | # dynamically, and includes stale entries. perfmaptidy.pl reads such a symbol 7 | # table, then replays the mappings backwards, dropping overlaps. It then emits 8 | # tidy symbol table, containing only the most recent mappings. 
9 | # 10 | # USAGE: ./perfmaptidy.pl /tmp/perf-1572.livemap > /tmp/perf-1572.map 11 | # 12 | # For this to work, you would want your JIT agent to write to a different 13 | # file than usual (eg, ".livemap"), so that perfmaptidy.pl can turn it into 14 | # the ".map" file that perf expects. Or, you can use mv to rename the live .map 15 | # file to be a .livemap file, and then recreate the .map file using 16 | # perfmaptidy.pl (the new .livemap file should continue being written to). 17 | # 18 | # Copyright 2014 Brendan Gregg. All rights reserved. 19 | # 20 | # This program is free software; you can redistribute it and/or 21 | # modify it under the terms of the GNU General Public License 22 | # as published by the Free Software Foundation; either version 2 23 | # of the License, or (at your option) any later version. 24 | # 25 | # This program is distributed in the hope that it will be useful, 26 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 27 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 28 | # GNU General Public License for more details. 29 | # 30 | # You should have received a copy of the GNU General Public License 31 | # along with this program; if not, write to the Free Software Foundation, 32 | # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 33 | # 34 | # (http://www.gnu.org/copyleft/gpl.html) 35 | # 36 | # 02-Dec-2014 Brendan Gregg Created this. 
use strict;
# no warnings, to avoid non-portable warnings when reading large ints
# without BigInt (which is slow)

# Map table: entries are hashes with {start}, {end} and {symbol}, kept sorted
# by start address and never overlapping. {end} is start + size, i.e. the
# first address *after* the symbol (a half-open range).
my @table;

# store - insert a mapping into @table unless it overlaps one already there.
#
#   $addr    start address of the symbol
#   $size    length of the symbol in bytes
#   $symbol  symbol name
#
# The caller feeds mappings newest-first, so whatever is already in the table
# takes priority and an overlapping (older, stale) mapping is silently dropped.
# Ranges are treated as half-open [start, start + size): a mapping that begins
# exactly where another one ends is adjacent, not overlapping, and is kept.
sub store {
	my ($addr, $size, $symbol) = @_;
	my $end = $addr + $size;
	my $low = 0;
	my $high = $#table;

	# binary search: does an existing entry already cover $addr?
	while ($low <= $high) {
		my $mid = int(($low + $high) / 2);
		my $entry = $table[$mid];
		# the explicit == test also catches zero-sized entries, whose
		# half-open range is empty
		if ($addr == $entry->{start} ||
		    ($addr > $entry->{start} && $addr < $entry->{end})) {
			return;
		} elsif ($addr < $entry->{start}) {
			$high = $mid - 1;
		} else {
			$low = $mid + 1;
		}
	}

	# $low is now the insertion index, and the entry currently there (if
	# any) is the first one starting above $addr: drop the new mapping if
	# it would run into it
	if ($low <= $#table && $table[$low]->{start} < $end) {
		return;
	}

	# store, keeping the table sorted
	splice @table, $low, 0, {
		start  => $addr,
		end    => $end,
		symbol => $symbol,
	};
}

# load map file from STDIN (or the files named in @ARGV), replaying it
# backwards so that the most recent mappings are stored first
my @symbols = <>;
for (my $i = $#symbols; $i >= 0; $i--) {
	my ($addr, $size, $symbol) = split ' ', $symbols[$i], 3;
	# skip blank lines and lines cut short (eg, while still being written)
	next unless defined $symbol;
	chomp $symbol;
	store(hex($addr), hex($size), $symbol);
}

# emit map file on STDOUT, in ascending address order
for my $entry (@table) {
	printf "%x %x %s\n", $entry->{start},
	    $entry->{end} - $entry->{start}, $entry->{symbol};
}
6 | -------------------------------------------------------------------------------- /s1bench/chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brendangregg/Misc/44cbd85cd5e4183987edfcb0f05fd03bc4d56c07/s1bench/chart.png -------------------------------------------------------------------------------- /s1bench/s1bench.c: -------------------------------------------------------------------------------- 1 | /* 2 | * s1bench - syscall benchmark 1. Tests a syscall & think loop. 3 | * 4 | * This benchmark has three stages: 5 | * 6 | * 1. spin loop 7 | * This is a simple control, for checking CPU variance 8 | * between runs and systems. If there's too much variance 9 | * here, don't bother with the tests that follow. 10 | * 2. memory population 11 | * Strides by getpagesize(), creating a region for the 12 | * following test. 13 | * 3. syscall & think 14 | * Does a fast syscall (close(999), which fails) followed 15 | * by some time "thinking": reading over the memory region 16 | * for a specified number of reads, and by a specified 17 | * stride size. 18 | * 19 | * gcc -O0 -pthread -o s1bench s1bench.c 20 | * 21 | * USAGE: see -h for usage. 22 | * 23 | * 03-Jan-2017 Brendan Gregg Created this. 
/*
 * usage - print the command-line synopsis, worked examples, and a description
 * of the output format to stdout.
 *
 * The program takes exactly five positional arguments; the names used in the
 * synopsis line match the ones in the parameter list below it, and every
 * example passes five values.
 */
void usage()
{
	printf("USAGE: s1bench spintime(ms) allocsize(B) reads_per_syscall read_stridesize(B) runtime(ms)\n"
	    " spintime(ms) spin test time as a control\n"
	    " allocsize(B) memory size to allocate and populate (bytes)\n"
	    " reads_per_syscall number of memory reads per syscall\n"
	    " read_stridesize(B) size to step after each memory read (bytes)\n"
	    " runtime(ms) duration of workload run\n"
	    " eg,\n"
	    " s1bench 300 $(( 100 * 1024 * 1024 )) 2000 64 5000\n"
	    " # example run: 100 MB, 2000 reads per syscall, 64 byte stride, 5 sec run\n"
	    " s1bench 300 0 0 0 0 # spin test only (control only)\n"
	    " s1bench 0 0 0 0 500 # syscalls only, no think\n"
	    " s1bench 0 1024 100 64 500 # syscalls, plus some think\n\n"
	    "Output is space-delimited values, one line per category:\n"
	    " INPUT: (input parameters)\n"
	    " SPIN: spin_count spin_time(s) spin_usr_time(s) spin_sys_time(s) involuntary_csw\n"
	    " POP: pop_count pop_time(s) pop_usr_time(s) pop_sys_time(s) minor_faults\n"
	    " RUN: run_count run_time(s) run_usr_time(s) run_sys_time(s) involuntary_csw\n"
	    " RATES: spin_count/s pop_count/s run_count/s\n\n"
	    "The number of syscalls called is roughly equal to run_count (plus program init).\n");
}
/*
 * These functions aren't just for code cleanliness: they show up in profilers
 * when doing active benchmarking to debug the benchmark.
 */

/*
 * Spin-loop run flag. Cleared from the SIGUSR1 handler, so it must be a
 * volatile sig_atomic_t for the loop to be guaranteed to observe the change.
 */
volatile sig_atomic_t g_spin = 1;

/* SIGUSR1 handler: ask spinloop() to stop. */
void spinstop(int dummy) {
	(void)dummy;
	g_spin = 0;
}

/*
 * spinloop - thread body for the spin control test.
 *
 * arg points to an unsigned long long counter which is incremented as fast
 * as possible until SIGUSR1 clears g_spin. Always returns NULL.
 */
void *spinloop(void *arg)
{
	unsigned long long *count = (unsigned long long *)arg;

	signal(SIGUSR1, spinstop);
	for (;g_spin;) { (*count)++; }
	return NULL;
}

/*
 * spintest - run spinloop() in a worker thread for spin_wait_us microseconds.
 *
 * Does nothing when spin_wait_us is zero. *count receives the number of loop
 * iterations completed. Exits the process with status 1 if the thread cannot
 * be created or signalled.
 */
void spintest(unsigned long long spin_wait_us, unsigned long long *count)
{
	pthread_t thread;
	int err;

	if (!spin_wait_us)
		return;

	/*
	 * Install the handler before the thread exists: with a very short
	 * wait, SIGUSR1 could otherwise arrive before the worker has called
	 * signal() itself, and the default action would kill the process.
	 */
	signal(SIGUSR1, spinstop);

	/* pthread calls return the error number; they do not set errno */
	if ((err = pthread_create(&thread, NULL, spinloop, count)) != 0) {
		fprintf(stderr, "Thread create failed: error %d\n", err);
		exit(1);
	}
	usleep(spin_wait_us);
	if ((err = pthread_kill(thread, SIGUSR1)) != 0) {
		fprintf(stderr, "Couldn't terminate worker thread normally: error %d\n", err);
		exit(1);
	}
	pthread_join(thread, NULL);
}

/* Workload run flag; same rules as g_spin. */
volatile sig_atomic_t g_work = 1;

/* SIGUSR1 handler: ask workloop() to stop. */
void workstop(int dummy) {
	(void)dummy;
	g_work = 0;
}

/* Arguments handed to workloop() through pthread_create(). */
struct workload_args {
	char *mem;			/* start of the working set */
	unsigned long long memsize;	/* size of the working set, in bytes */
	unsigned long long readmax;	/* memory reads per syscall */
	int stride;			/* bytes to step after each read */
	unsigned long long *count;	/* out: syscall iterations completed */
};

/*
 * workloop - thread body for the syscall & think workload.
 *
 * arg points to a struct workload_args. Each iteration issues one cheap
 * syscall (close() of a descriptor that is expected to be invalid), bumps
 * *count, then performs readmax single-byte reads over the working set,
 * stepping by stride and wrapping back to the start on reaching the end.
 * Runs until SIGUSR1 clears g_work. Always returns NULL.
 *
 * stride is expected to be non-negative; it is not checked inside the read
 * loop, to keep the measured loop minimal.
 */
void *workloop(void *arg)
{
	struct workload_args *a = (struct workload_args *)arg;
	char *memp;
	unsigned long long j;
	int junk = 0;	/* sink for the reads, so they have a consumer */
	char *mem = a->mem;
	unsigned long long memsize = a->memsize;
	unsigned long long readmax = a->readmax;
	int stride = a->stride;
	unsigned long long *count = a->count;

	/* with no working set there is nothing that can legally be read */
	if (memsize == 0)
		readmax = 0;

	signal(SIGUSR1, workstop);
	memp = mem;
	for (;g_work;) {
		close(999);	// the syscall (it errors, but so what)
		(*count)++;
		// can do a "memp = mem;" here to reset on each loop
		for (j = 0; j < readmax; j++) {
			junk += memp[0];
			memp += stride;
			/* >= so that mem + memsize itself is never read */
			if (memp >= (mem + memsize))
				memp = mem;
		}
	}
	(void)junk;
	return NULL;
}
.readmax = readmax, .stride = stride, .count = count}; 142 | pthread_t thread; 143 | 144 | if (!run_wait_us) 145 | return; 146 | 147 | if (pthread_create(&thread, NULL, workloop, &args) != 0) { 148 | perror("Thread create failed"); 149 | exit(1); 150 | } 151 | usleep(run_wait_us); 152 | if (pthread_kill(thread, SIGUSR1)) { 153 | perror("Couldn't terminate worker thread normally"); 154 | exit(0); 155 | } 156 | pthread_join(thread, NULL); 157 | } 158 | 159 | int main(int argc, char *argv[]) 160 | { 161 | char *mem, *memp; 162 | int stride; 163 | unsigned long long memsize, readmax, pagesize, 164 | spin_wait_us, run_wait_us; 165 | unsigned long long spin_count, spin_us, spin_usr_us, spin_sys_us, 166 | spin_ivcs, pop_count, pop_us, pop_usr_us, pop_sys_us, pop_minflt, 167 | run_count, run_us, run_usr_us, run_sys_us, run_ivcs; 168 | static struct timeval ts[6]; 169 | struct rusage u[6]; 170 | 171 | // options 172 | if (argc < 6) { 173 | usage(); 174 | exit(0); 175 | } 176 | spin_wait_us = atoll(argv[1]) * 1000; 177 | memsize = atoll(argv[2]); 178 | readmax = atoll(argv[3]); 179 | stride = atoll(argv[4]); 180 | run_wait_us = atoll(argv[5]) * 1000; 181 | 182 | // init 183 | pagesize = getpagesize(); 184 | spin_count = 0; 185 | pop_count = 0; 186 | run_count = 0; 187 | if ((mem = malloc(memsize)) == NULL) { 188 | printf("ERROR allocating working set memory. 
Exiting.\n"); 189 | return 1; 190 | } 191 | 192 | /* 193 | * spin time, with timeout 194 | */ 195 | getrusage(RUSAGE_SELF, &u[0]); 196 | gettimeofday(&ts[0], NULL); 197 | spintest(spin_wait_us, &spin_count); 198 | gettimeofday(&ts[1], NULL); 199 | getrusage(RUSAGE_SELF, &u[1]); 200 | 201 | /* 202 | * populate working set 203 | */ 204 | getrusage(RUSAGE_SELF, &u[2]); 205 | gettimeofday(&ts[2], NULL); 206 | for (memp = mem; memp < (mem + memsize); memp += pagesize) { 207 | memp[0] = 'A'; 208 | pop_count++; 209 | } 210 | gettimeofday(&ts[3], NULL); 211 | getrusage(RUSAGE_SELF, &u[3]); 212 | 213 | /* 214 | * workload, with timeout 215 | */ 216 | getrusage(RUSAGE_SELF, &u[4]); 217 | gettimeofday(&ts[4], NULL); 218 | workload(mem, memsize, readmax, stride, &run_count, run_wait_us); 219 | gettimeofday(&ts[5], NULL); 220 | getrusage(RUSAGE_SELF, &u[5]); 221 | 222 | /* 223 | * calculate and print times 224 | */ 225 | spin_us = 1000000 * (ts[1].tv_sec - ts[0].tv_sec) + (ts[1].tv_usec - ts[0].tv_usec) / 1; 226 | spin_usr_us = 1000000 * (u[1].ru_utime.tv_sec - u[0].ru_utime.tv_sec) + (u[1].ru_utime.tv_usec - u[0].ru_utime.tv_usec) / 1; 227 | spin_sys_us = 1000000 * (u[1].ru_stime.tv_sec - u[0].ru_stime.tv_sec) + (u[1].ru_stime.tv_usec - u[0].ru_stime.tv_usec) / 1; 228 | spin_ivcs = u[1].ru_nivcsw - u[0].ru_nivcsw; 229 | pop_us = 1000000 * (ts[3].tv_sec - ts[2].tv_sec) + (ts[3].tv_usec - ts[2].tv_usec) / 1; 230 | pop_usr_us = 1000000 * (u[3].ru_utime.tv_sec - u[2].ru_utime.tv_sec) + (u[3].ru_utime.tv_usec - u[2].ru_utime.tv_usec) / 1; 231 | pop_sys_us = 1000000 * (u[3].ru_stime.tv_sec - u[2].ru_stime.tv_sec) + (u[3].ru_stime.tv_usec - u[2].ru_stime.tv_usec) / 1; 232 | pop_minflt = u[3].ru_minflt - u[2].ru_minflt; 233 | run_us = 1000000 * (ts[5].tv_sec - ts[4].tv_sec) + (ts[5].tv_usec - ts[4].tv_usec) / 1; 234 | run_usr_us = 1000000 * (u[5].ru_utime.tv_sec - u[4].ru_utime.tv_sec) + (u[5].ru_utime.tv_usec - u[4].ru_utime.tv_usec) / 1; 235 | run_sys_us = 1000000 * 
(u[5].ru_stime.tv_sec - u[4].ru_stime.tv_sec) + (u[5].ru_stime.tv_usec - u[4].ru_stime.tv_usec) / 1; 236 | run_ivcs = u[5].ru_nivcsw - u[4].ru_nivcsw; 237 | printf("INPUT: %llu %llu %llu %d %llu\n", spin_wait_us / 1000, memsize, readmax, stride, run_wait_us / 1000); 238 | printf("SPIN: %llu %.3f %.3f %.3f %llu\n", spin_count, (double)spin_us / 1000000, (double)spin_usr_us / 1000000, (double)spin_sys_us / 1000000, spin_ivcs); 239 | printf("POP: %llu %.3f %.3f %.3f %llu\n", pop_count, (double)pop_us / 1000000, (double)pop_usr_us / 1000000, (double)pop_sys_us / 1000000, pop_minflt); 240 | printf("RUN: %llu %.3f %.3f %.3f %llu\n", run_count, (double)run_us / 1000000, (double)run_usr_us / 1000000, (double)run_sys_us / 1000000, run_ivcs); 241 | printf("RATES: %llu %llu %.1f\n", spin_us ? spin_count * 1000000 / spin_us : 0, 242 | pop_us ? pop_count * 1000000 / pop_us : 0, 243 | run_us ? (double)run_count * 1000000 / run_us : 0); 244 | 245 | return (0); 246 | } 247 | -------------------------------------------------------------------------------- /s1bench/series2cols.sh: -------------------------------------------------------------------------------- 1 | awk 'BEGIN { OFS = "," } $1 == "OUT:" { print $3, $6 }' 2 | -------------------------------------------------------------------------------- /s1bench/series_s1bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # series_s1bench.sh Harness for running s1bench microbenchmark. 4 | # 5 | # This is a quick hack, written as a one-off for benchamrking Meltdown/Spectre 6 | # overheads. It should be rewritten in a different language. 7 | # 8 | # This runs s1bench many times, stepping up the working set reads to lower 9 | # the syscall rate. Each configuration is run $iters times, and the runs 10 | # are printed in the "OUT:" line, sorted in order from fastest to slowest. 
# Various measures are taken to lower variance: numactl is used to bind to
# a CPU and node, and the difference between the fastest and second fastest
# runs is compared, and more runs are executed until this satisfies $maxvarpct.
# Debug information is included in the output with different prefixes.
#
# DEPENDENCIES: Apart from the s1bench microbenchmark binary, the
# local directory tools listed under "debug tools" in the source should be
# present. They may use /proc, MSRs, and PMCs, and are from:
# - https://github.com/brendangregg/msr-cloud-tools
# - https://github.com/brendangregg/pmc-cloud-tools
#
# 04-Jan-2018 Brendan Gregg Created this.

debugfile=/tmp/out.benchdebug.$$
rawfile=/tmp/out.benchtmp.$$

### benchmark config
wssize=$(( 100 * 1024 * 1024 ))
spintime_ms=300
runtime_ms=3000
stride=64
iters=20

### variance tunables
maxvarpct=0.20		# max variation percent; "" to disable
iterstep=5		# extra iters until maxvarpct satisfied
maxiters=100		# the give up point

### run configuration
readmax=2000000
start=256
preruns="0 1 8 16 32 64 128"

### build range of syscall counts to test
# Starting from $start, each step adds the count and the midpoint to the next
# doubling (256 384 512 768 ...), until $readmax is reached.
count=$start
sizes="$preruns "
while (( count < readmax )); do
	if (( count == 1 )); then
		sizes="0 "
	fi
	sizes="$sizes $count"
	(( count2 = count * 2 ))
	(( count1 = (count + count2) / 2 ))
	(( count1 != count )) && sizes="$sizes $count1"
	count=$count2
done

### choose a CPU and memory node to benchmark on: the last one
cpu=$(awk '$1 == "processor" { cpu = $3 } END { print cpu }' /proc/cpuinfo)
node=$(echo /sys/devices/system/cpu/cpu$cpu/node*)
node=${node##*node}
if [[ "$cpu" != [0-9]* || "$node" != [0-9]* ]]; then
	echo >&2 "ERROR: choosing a CPU and memory node (got $cpu, $node). Exiting."
	# this is a failure: report it in the exit status
	exit 1
fi

### debug tools
debugsecs=5
debug[0]="mpstat -P $cpu 1 $debugsecs"
debug[1]="./showboost -C$cpu 1 $debugsecs"
debug[2]="./pmcarch -C$cpu 1 $debugsecs"
debug[3]="./tlbstat -C$cpu 1 $debugsecs"
# these will get early SIGINTs

### header
echo "OUT: working-set-size working-set-reads working-set-stride runtime(ms) fastest_run_rate/s [next_fastest_run_rate/s ...]"

### main
for wsreads in $sizes; do
	# clear debug file
	> $debugfile
	debugidx=0

	spinrates=""
	poprates=""
	runrates=""
	i=0
	imax=$iters
	while :; do
		(( i++ ))
		# if present, launch one debug tool for each iter; once every
		# tool has had its turn ($debugidx has walked off the end of
		# the list), run the remaining iterations without one
		dpid=0
		if (( debugidx < ${#debug[@]} )); then
			${debug[$debugidx]} >> $debugfile &
			dpid=$!
			(( debugidx++ ))
		fi

		# benchmark
		cmd="./s1bench $spintime_ms $wssize $wsreads $stride $runtime_ms"
		# let STDERR run free
		numactl --membind=$node --physcpubind=$cpu $cmd > $rawfile
		# collect the three rates from the RATES: line
		while read category num1 num2 num3; do
			[[ "$category" != "RATES:" ]] && continue
			spinrates="$spinrates $num1"
			poprates="$poprates $num2"
			runrates="$runrates $num3"
		done < $rawfile
		(( dpid )) && [ -d /proc/$dpid ] && kill -INT $dpid
		wait	# for debug tool if necessary

		# print raw output
		[ -e $rawfile ] && awk -v w=$wsreads -v i=$i '$1 !~ /INPUT/ { print "RAW:", w, i, $0 }' $rawfile

		# check if enough iterations have been done
		if (( i >= imax )); then
			# sort and calculate variance (uses sort(1) and awk(1)):
			# each *var list is the percentage by which a run is
			# slower than the fastest run, fastest first
			set -- $spinrates
			spinsorted=$(while [[ "$1" != "" ]]; do echo $1; shift; done | sort -rn)
			spinvar=$(echo $spinsorted | awk '{ first=$1; i=1; while (i++ < NF) { printf("%.2f ", 100 * (1 - $i / first)); } printf("\n"); }')
			set -- $poprates
			popsorted=$(while [[ "$1" != "" ]]; do echo $1; shift; done | sort -rn)
			popvar=$(echo $popsorted | awk '{ first=$1; i=1; while (i++ < NF) { printf("%.2f ", 100 * (1 - $i / first)); } printf("\n"); }')
			set -- $runrates
			runsorted=$(while [[ "$1" != "" ]]; do echo $1; shift; done | sort -rn)
			runvar=$(echo $runsorted | awk '{ first=$1; i=1; while (i++ < NF) { printf("%.2f ", 100 * (1 - $i / first)); } printf("\n"); }')

			# check if variance is satisfactory
			if [[ "$maxvarpct" != "" ]]; then
				set -- $runvar
				# borrowing awk for float comparisons
				var1=$1
				if awk -v var1=$var1 -v maxvarpct=$maxvarpct 'BEGIN { if (var1 < maxvarpct) { exit(0) } else { exit(1) } }'; then
					# within max variance
					break
				fi
				# not within max variance
				(( imax += iterstep ))
				if (( imax > maxiters )); then
					echo "MESSAGE: Too many attempts to lower variance, aborting."
					break
				fi
				echo "MESSAGE: Variance too high ($var1 > $maxvarpct). Continuing..."
			else
				break
			fi
		fi
	done

	# final output
	echo OUT: $wssize $wsreads $stride $runtime_ms $runsorted
	echo SPINRATES: $spinsorted
	echo SPINVAR%: $spinvar
	echo POPRATES: $popsorted
	echo POPVAR%: $popvar
	echo RUNRATES: $runsorted
	echo RUNVAR%: $runvar
	set -- $runvar
	echo RUNS: $(( i - 1 )) for $1

	[ -e $debugfile ] && awk -v w=$wsreads '{ print "DEBUG" NR ":", w, $0 }' $debugfile
done

### cleanup
[ -e $rawfile ] && rm $rawfile
[ -e $debugfile ] && rm $debugfile
xen-4.6.0-clean/docs/misc/xen-command-line.markdown xen-4.6.0-brendan/docs/misc/xen-command-line.markdown 2 | --- xen-4.6.0-clean/docs/misc/xen-command-line.markdown 2015-10-05 07:33:39.000000000 -0700 3 | +++ xen-4.6.0-brendan/docs/misc/xen-command-line.markdown 2015-11-20 15:29:05.663781176 -0800 4 | @@ -1444,7 +1444,7 @@ 5 | flushes on VM entry and exit, increasing performance. 6 | 7 | ### vpmu 8 | -> `= ( bts )` 9 | +> `= ( | bts | ipc | arch )` 10 | 11 | > Default: `off` 12 | 13 | @@ -1460,6 +1460,15 @@ 14 | If 'vpmu=bts' is specified the virtualisation of the Branch Trace Store (BTS) 15 | feature is switched on on Intel processors supporting this feature. 16 | 17 | +vpmu=ipc enables performance monitoring, but restricts the counters to the 18 | +most minimum set possible: instructions, cycles, and reference cycles. These 19 | +can be used to calculate instructions per cycle (IPC). 20 | + 21 | +vpmu=arch enables performance monitoring, but restricts the counters to the 22 | +pre-defined architectural events only. These are exposed by cpuid, and listed 23 | +in Table 18-1 from the Intel 64 and IA-32 Architectures Software Developer's 24 | +Manual, Volume 3B, System Programming Guide, Part 2. 25 | + 26 | Note that if **watchdog** option is also specified vpmu will be turned off. 27 | 28 | *Warning:* 29 | diff -ur xen-4.6.0-clean/xen/arch/x86/cpu/vpmu.c xen-4.6.0-brendan/xen/arch/x86/cpu/vpmu.c 30 | --- xen-4.6.0-clean/xen/arch/x86/cpu/vpmu.c 2015-10-05 07:33:39.000000000 -0700 31 | +++ xen-4.6.0-brendan/xen/arch/x86/cpu/vpmu.c 2015-11-20 15:29:50.847781176 -0800 32 | @@ -43,9 +43,11 @@ 33 | CHECK_pmu_params; 34 | 35 | /* 36 | - * "vpmu" : vpmu generally enabled 37 | - * "vpmu=off" : vpmu generally disabled 38 | - * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on. 39 | + * "vpmu" : vpmu generally enabled (all counters) 40 | + * "vpmu=off" : vpmu generally disabled 41 | + * "vpmu=bts" : vpmu enabled and Intel BTS feature switched on. 
42 | + * "vpmu=ipc" : vpmu enabled for IPC counters only (most restrictive) 43 | + * "vpmu=arch" : vpmu enabled for predef arch counters only (restrictive) 44 | */ 45 | static unsigned int __read_mostly opt_vpmu_enabled; 46 | unsigned int __read_mostly vpmu_mode = XENPMU_MODE_OFF; 47 | @@ -67,6 +69,10 @@ 48 | default: 49 | if ( !strcmp(s, "bts") ) 50 | vpmu_features |= XENPMU_FEATURE_INTEL_BTS; 51 | + else if ( !strcmp(s, "ipc") ) 52 | + vpmu_features |= XENPMU_FEATURE_IPC_ONLY; 53 | + else if ( !strcmp(s, "arch") ) 54 | + vpmu_features |= XENPMU_FEATURE_ARCH_ONLY; 55 | else if ( *s ) 56 | { 57 | printk("VPMU: unknown flag: %s - vpmu disabled!\n", s); 58 | diff -ur xen-4.6.0-clean/xen/arch/x86/cpu/vpmu_intel.c xen-4.6.0-brendan/xen/arch/x86/cpu/vpmu_intel.c 59 | --- xen-4.6.0-clean/xen/arch/x86/cpu/vpmu_intel.c 2015-10-05 07:33:39.000000000 -0700 60 | +++ xen-4.6.0-brendan/xen/arch/x86/cpu/vpmu_intel.c 2015-11-20 15:29:42.571781176 -0800 61 | @@ -166,10 +166,10 @@ 62 | */ 63 | static int core2_get_fixed_pmc_count(void) 64 | { 65 | - u32 eax; 66 | + u32 edx; 67 | 68 | - eax = cpuid_eax(0xa); 69 | - return MASK_EXTR(eax, PMU_FIXED_NR_MASK); 70 | + edx = cpuid_edx(0xa); 71 | + return MASK_EXTR(edx, PMU_FIXED_NR_MASK); 72 | } 73 | 74 | /* edx bits 5-12: Bit width of fixed-function performance counters */ 75 | @@ -652,12 +652,52 @@ 76 | tmp = msr - MSR_P6_EVNTSEL(0); 77 | if ( tmp >= 0 && tmp < arch_pmc_cnt ) 78 | { 79 | + int umaskevent, blocked = 0; 80 | struct xen_pmu_cntr_pair *xen_pmu_cntr_pair = 81 | vpmu_reg_pointer(core2_vpmu_cxt, arch_counters); 82 | 83 | if ( msr_content & ARCH_CTRL_MASK ) 84 | return -EINVAL; 85 | 86 | + /* PMC filters */ 87 | + umaskevent = msr_content & MSR_IA32_CMT_EVTSEL_UE_MASK; 88 | + if ( vpmu_features & XENPMU_FEATURE_IPC_ONLY || 89 | + vpmu_features & XENPMU_FEATURE_ARCH_ONLY ) 90 | + { 91 | + blocked = 1; 92 | + switch ( umaskevent ) 93 | + { 94 | + /* 95 | + * See Table 18-1 from the Intel 64 and IA-32 Architectures Software 96 | + 
* Developer's Manual, Volume 3B, System Programming Guide, Part 2. 97 | + */ 98 | + case 0x003c: /* unhalted core cycles */ 99 | + case 0x013c: /* unhalted ref cycles */ 100 | + case 0x00c0: /* instruction retired */ 101 | + blocked = 0; 102 | + default: 103 | + break; 104 | + } 105 | + } 106 | + 107 | + if ( vpmu_features & XENPMU_FEATURE_ARCH_ONLY ) 108 | + { 109 | + /* additional counters beyond IPC only; blocked already set */ 110 | + switch ( umaskevent ) 111 | + { 112 | + case 0x4f2e: /* LLC reference */ 113 | + case 0x412e: /* LLC misses */ 114 | + case 0x00c4: /* branch instruction retired */ 115 | + case 0x00c5: /* branch */ 116 | + blocked = 0; 117 | + default: 118 | + break; 119 | + } 120 | + } 121 | + 122 | + if ( blocked ) 123 | + return -EINVAL; 124 | + 125 | if ( has_hvm_container_vcpu(v) ) 126 | vmx_read_guest_msr(MSR_CORE_PERF_GLOBAL_CTRL, 127 | &core2_vpmu_cxt->global_ctrl); 128 | diff -ur xen-4.6.0-clean/xen/include/public/pmu.h xen-4.6.0-brendan/xen/include/public/pmu.h 129 | --- xen-4.6.0-clean/xen/include/public/pmu.h 2015-10-05 07:33:39.000000000 -0700 130 | +++ xen-4.6.0-brendan/xen/include/public/pmu.h 2015-11-20 15:30:08.887781176 -0800 131 | @@ -84,9 +84,17 @@ 132 | 133 | /* 134 | * PMU features: 135 | - * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) 136 | + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) 137 | + * - XENPMU_FEATURE_IPC_ONLY: Restrict PMC to the most minimum set possible. 138 | + * Instructions, cycles, and ref cycles. Can be 139 | + * used to calculate instructions-per-cycle (IPC). 140 | + * - XENPMU_FEATURE_ARCH_ONLY: Restrict PMCs to the Intel pre-defined 141 | + * architectural events exposed by cpuid and 142 | + * listed in Table 18-1 of the developer's manual.
143 | */ 144 | -#define XENPMU_FEATURE_INTEL_BTS 1 145 | +#define XENPMU_FEATURE_INTEL_BTS (1<<0) 146 | +#define XENPMU_FEATURE_IPC_ONLY (1<<1) 147 | +#define XENPMU_FEATURE_ARCH_ONLY (1<<2) 148 | 149 | /* 150 | * Shared PMU data between hypervisor and PV(H) domains. 151 | -------------------------------------------------------------------------------- /xen/xen-features.pl: --------------------------------------------------------------------------------
#!/usr/bin/perl -w
#
# xen-features.pl   print Linux Xen guest feature bits in human-readable form.
#
# This will get out of date. If you're a Xen developer, you are welcome to put
# this under xen/tools/misc, where others can update it.
#
# 05-May-2014 Brendan Gregg Created this.

use strict;

# The features file is read as one line and interpreted as a hexadecimal
# bitmask (see hex() below).
my $path = "/sys/hypervisor/properties/features";
open my $fh, '<', $path or die "ERROR open(): $!";
my $features = <$fh>;
close $fh;
defined $features or die "ERROR: $path is empty";
chomp $features;
my $decfeatures = hex $features;

print "Xen features: $features\n";

# Each data line has the form "#define XENFEAT_<name> <bit>"; print the name
# of every feature whose bit is set in the mask.
foreach (<DATA>) {
    my ($def, $feat, $bit) = split;
    # skip blank or malformed lines rather than warning on undef fields
    next unless defined $bit && $bit =~ /^\d+$/;
    $feat =~ s/^XENFEAT_//;
    print "enabled: $feat\n" if $decfeatures & (1 << $bit);
}

# The following are from include/xen/interface/features.h, and will need updating:

__DATA__
#define XENFEAT_writable_page_tables 0
#define XENFEAT_writable_descriptor_tables 1
#define XENFEAT_auto_translated_physmap 2
#define XENFEAT_supervisor_mode_kernel 3
#define XENFEAT_pae_pgdir_above_4gb 4
#define XENFEAT_mmu_pt_update_preserve_ad 5
#define XENFEAT_hvm_callback_vector 8
#define XENFEAT_hvm_safe_pvclock 9
#define XENFEAT_hvm_pirqs 10
#define XENFEAT_dom0 11
#define XENFEAT_grant_map_identity 12
#define XENFEAT_memory_op_vnode_supported 13
#define XENFEAT_ARM_SMCCC_supported 14
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # xen-features.py print Linux Xen guest feature bits in human. 4 | # 5 | # This will get out of date. If you're a Xen developer, you are welcome to put 6 | # this under xen/tools/misc, where others can update it. 7 | # 8 | # 05-May-2014 Brendan Gregg Created this. 9 | 10 | try: 11 | with open("/sys/hypervisor/properties/features", "r") as infile: 12 | features = infile.read().rstrip() 13 | except IOError as msg: 14 | print 'ERROR: reading Xen features (not a Xen guest?):', msg 15 | 16 | print "Xen features:", features 17 | decfeatures = int(features, 16) 18 | 19 | with open(__file__) as data: 20 | for line in data: 21 | if line.startswith('# __DATA__'): 22 | for line in data: 23 | a = line[1:].split() 24 | name = a[1] 25 | bit = int(a[2]) 26 | if decfeatures & (1 << bit): 27 | print "enabled:", name[8:] 28 | 29 | # __DATA__ 30 | #define XENFEAT_writable_page_tables 0 31 | #define XENFEAT_writable_descriptor_tables 1 32 | #define XENFEAT_auto_translated_physmap 2 33 | #define XENFEAT_supervisor_mode_kernel 3 34 | #define XENFEAT_pae_pgdir_above_4gb 4 35 | #define XENFEAT_mmu_pt_update_preserve_ad 5 36 | #define XENFEAT_hvm_callback_vector 8 37 | #define XENFEAT_hvm_safe_pvclock 9 38 | #define XENFEAT_hvm_pirqs 10 39 | #define XENFEAT_dom0 11 40 | #define XENFEAT_grant_map_identity 12 41 | #define XENFEAT_memory_op_vnode_supported 13 42 | #define XENFEAT_ARM_SMCCC_supported 14 43 | --------------------------------------------------------------------------------