├── .gitignore ├── .gitmodules ├── Makefile ├── README.md ├── avx.cpp ├── avx512.cpp ├── bdw.csv ├── bdw.log ├── bench.cpp ├── common.hpp ├── compare-result.py ├── gen.cpp ├── glm.log ├── hsw.log ├── knl.log ├── logs ├── linux │ ├── 11thGenIntel(R)Core(TM)i5-1135G7@2.40GHz.csv │ ├── 12thGenIntel(R)Core(TM)i7-12700K.csv │ ├── AMDRyzen52400GwithRadeonVegaGraphics.csv │ ├── AMDRyzen71700XEight-CoreProcessor.csv │ ├── AMDRyzen73700X8-CoreProcessor.csv │ ├── Intel(R)Celeron(R)CPUG3900@2.80GHz.csv │ ├── Intel(R)Celeron(R)CPUN2807@1.58GHz.csv │ ├── Intel(R)Celeron(R)CPUN3450@1.10GHz.csv │ ├── Intel(R)Core(TM)i5-8250UCPU@1.60GHz.csv │ ├── Intel(R)Core(TM)i7-4700MQCPU@2.40GHz.csv │ └── Intel(R)Core(TM)i7-6700CPU@3.40GHz.csv └── w32 │ └── .tmp ├── skl-sub.log ├── skl.log ├── slm.log ├── sse.cpp ├── test.cpp ├── znver1.log └── znver2.log /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.exe 3 | bench.asm 4 | *.ilk 5 | *.obj 6 | *.pdb 7 | *.d 8 | *.o 9 | out.bin 10 | bench 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "xbyak"] 2 | path = xbyak 3 | url = https://github.com/tanakamura/xbyak.git 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: bench 2 | 3 | CXXFLAGS=-DXBYAK_USE_MMAP_ALLOCATOR -DXBYAK_NO_OP_NAMES -std=c++0x -O2 -Ixbyak/xbyak -MMD 4 | 5 | 6 | bench: bench.o gen.o sse.o avx.o avx512.o 7 | #-cl /EHsc /MT /Zi /Ixbyak/xbyak /Fa /O2 bench.cpp /link /DYNAMICBASE:NO 8 | -g++ -o bench $^ 9 | 10 | clean: 11 | -del *~ bench.exe test.exe *.obj *.pdb *.ilk *.suo *.bin *.o bench 12 | -rm -f *~ bench.exe test.exe *.obj *.pdb *.ilk *.suo *.bin *.o bench 13 | 14 | -include *.d 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # instruction-bench 2 | Measures latency and throughput for each instruction. 3 | 4 | For precise measuring, running on Linux is strongly recommended. 5 | 6 | $ make 7 | $ ./bench 8 | 9 | # Results 10 | [Results](logs/linux/) 11 | 12 | [i7-6700](logs/linux/Intel(R)Core(TM)i7-6700CPU@3.40GHz.csv) 13 | 14 | [Ryzen7-3700X](logs/linux/AMDRyzen73700X8-CoreProcessor.csv) 15 | 16 | 17 | ## compare result 18 | 19 | $ python compare-result.py a.csv b.csv 20 | -------------------------------------------------------------------------------- /avx.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | /* Builds the Xmm operand that has the same register index as `ymm`. */ 3 | static Xbyak::Xmm 4 | ymm_to_xmm(Xbyak::Ymm ymm) 5 | { 6 | return Xbyak::Xmm(ymm.getIdx()); 7 | } 8 | /* Issues the GEN* cases for AVX, AVX2 and FMA instructions; each group is guarded by the matching info.have_* flag. */ 9 | void 10 | test_avx() 11 | { 12 | using namespace Xbyak; 13 | if (info.have_avx) { 14 | GEN_throughput_only(Ymm, "movaps [mem]", 15 | (g->vmovaps(dst, g->ptr[g->rdx])), 16 | false, OT_FP32); 17 | GEN_latency_only(Ymm, "movaps [mem] -> movq", 18 | (g->vmovaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, ymm_to_xmm(dst))); , 19 | false, OT_FP32); 20 | 21 | GEN_throughput_only(Ymm, "vmovdqu [mem+1]", 22 | (g->vmovdqu(dst, g->ptr[g->rdx + 1])), 23 | false, OT_FP32); 24 | GEN_latency_only(Ymm, "vmovdqu [mem+1] -> movq", 25 | (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + 1])); (g->movq(g->rdi, ymm_to_xmm(dst))); , 26 | false, OT_FP32); 27 | 28 | GEN_throughput_only(Ymm, "vmovdqu [mem+63] (cross cache)", 29 | (g->vmovdqu(dst, g->ptr[g->rdx + 63])), 30 | false, OT_FP32); 31 | GEN_latency_only(Ymm, "vmovdqu [mem+63] (cross cache) -> movq", 32 | (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + 63])); (g->movq(g->rdi, ymm_to_xmm(dst))); , 33 | false, OT_FP32); 34 | 35 | GEN_throughput_only(Ymm, "vmovdqu [mem+2MB-1] (cross page)", 36 | (g->vmovdqu(dst, g->ptr[g->rdx + 
(2048*1024-1)])), 37 | false, OT_FP32); 38 | GEN_latency_only(Ymm, "vmovdqu [mem+2MB-1] (cross page) -> movq", 39 | (g->vmovdqu(dst, g->ptr[g->rdx + g->rdi + (2048*1024-1)])); (g->movq(g->rdi, ymm_to_xmm(dst))); , 40 | false, OT_FP32); 41 | 42 | GEN(Ymm, "vxorps", (g->vxorps(dst, dst, src)), false, OT_FP32); 43 | GEN(Ymm, "vmulps", (g->vmulps(dst, dst, src)), false, OT_FP32); 44 | GEN(Ymm, "vaddps", (g->vaddps(dst, dst, src)), false, OT_FP32); 45 | GEN(Ymm, "vdivps", (g->vdivps(dst, dst, src)), false, OT_FP32); 46 | GEN(Ymm, "vdivpd", (g->vdivpd(dst, dst, src)), false, OT_FP64); 47 | GEN(Ymm, "vrsqrtps", (g->vrsqrtps(dst, dst)), false, OT_FP32); 48 | GEN(Ymm, "vrcpps", (g->vrcpps(dst, dst)), false, OT_FP32); 49 | GEN(Ymm, "vsqrtps", (g->vsqrtps(dst, dst)), false, OT_FP32); 50 | GEN(Ymm, "vperm2f128", (g->vperm2f128(dst,dst,src,0)), false, OT_FP32); 51 | } 52 | 53 | if (info.have_avx2) { 54 | GEN(Ymm, "vpxor", (g->vpxor(dst, dst, src)), false, OT_INT); 55 | GEN(Ymm, "vpaddd", (g->vpaddd(dst, dst, src)), false, OT_INT); 56 | GEN(Ymm, "vpermps", (g->vpermps(dst, dst, src)), false, OT_FP32); 57 | GEN(Ymm, "vpermpd", (g->vpermpd(dst, dst, 0)), false, OT_FP64); 58 | GEN(Ymm, "vpblendvb", (g->vpblendvb(dst, src, src, src)), false, OT_INT); 59 | GEN_throughput_only(Ymm, "vpmovmskb", (g->vpmovmskb(g->edx,g->ymm0)), false, OT_INT); 60 | 61 | 62 | GEN_latency(Ymm, "vpmovsxwd", 63 | (g->vpmovsxwd(g->ymm1,g->xmm0)), 64 | (g->vpmovsxwd(g->ymm0,g->xmm0)), 65 | false, OT_INT); 66 | 67 | GEN_latency(Ymm, "vpgatherdd", 68 | (g->vpgatherdd(g->ymm2, g->ptr[g->rdx + g->ymm0*1], g->ymm1)), 69 | (g->vpgatherdd(g->ymm2, g->ptr[g->rdx + g->ymm0*1], g->ymm1)); (g->vmovdqa(g->ymm0,g->ymm2)), 70 | false, OT_INT); 71 | 72 | GEN_latency(Ymm, "gather32(x8 + perm)", 73 | /* NOTE(review): every vpinsrd in this case writes lane 0 and vperm2i128 uses imm 0, so only the instruction mix is modelled, not the data layout of a gather - confirm this is intended. */ 74 | /* throughput */ 75 | (g->vmovd(g->xmm2, g->ptr[g->rdx])); 76 | (g->vmovd(g->xmm3, g->ptr[g->rdx])); 77 | (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 4], 0)); 78 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 4], 0)); 79 
| (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 8], 0)); 80 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 8], 0)); 81 | (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + 12], 0)); 82 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + 12], 0)); 83 | (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0));, 84 | 85 | /* latency */ 86 | (g->vmovd(g->xmm2, g->ptr[g->rdx + g->rdi])); 87 | (g->vmovd(g->xmm3, g->ptr[g->rdx + g->rdi])); 88 | (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 4], 0)); 89 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 4], 0)); 90 | (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 8], 0)); 91 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 8], 0)); 92 | (g->vpinsrd(g->xmm2, g->xmm2, g->ptr[g->rdx + g->rdi + 12], 0)); 93 | (g->vpinsrd(g->xmm3, g->xmm3, g->ptr[g->rdx + g->rdi + 12], 0)); 94 | (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0)); 95 | (g->vmovd(g->edi, g->xmm2)); 96 | 97 | ,false, OT_FP32); 98 | 99 | 100 | GEN_latency(Ymm, "vgatherdpd", 101 | (g->vgatherdpd(g->ymm2, g->ptr[g->rdx + g->xmm0*1], g->ymm1)), 102 | (g->vgatherdpd(g->ymm2, g->ptr[g->rdx + g->xmm0*1], g->ymm1)); (g->vmovdqa(g->ymm0,g->ymm2)), 103 | false, OT_INT); 104 | 105 | GEN_latency(Ymm, "gather64(x4 + perm)", 106 | 107 | /* throughput */ 108 | (g->vmovq(g->xmm2, g->ptr[g->rdx])); 109 | (g->vmovq(g->xmm3, g->ptr[g->rdx])); 110 | (g->vpinsrq(g->xmm2, g->xmm2, g->ptr[g->rdx + 8], 1)); 111 | (g->vpinsrq(g->xmm3, g->xmm3, g->ptr[g->rdx + 8], 1)); /* 64-bit insert, same as the xmm2 half (was vpinsrd) */ 112 | (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0));, 113 | 114 | /* latency */ 115 | (g->vmovq(g->xmm2, g->ptr[g->rdx + g->rdi])); 116 | (g->vmovq(g->xmm3, g->ptr[g->rdx + g->rdi])); 117 | (g->vpinsrq(g->xmm2, g->xmm2, g->ptr[g->rdx + 8], 1)); 118 | (g->vpinsrq(g->xmm3, g->xmm3, g->ptr[g->rdx + 8], 1)); /* 64-bit insert, same as the xmm2 half (was vpinsrd) */ 119 | (g->vperm2i128(g->ymm2,g->ymm2,g->ymm3,0)); 120 | (g->vmovd(g->edi, g->xmm2));, 121 | 122 | false, OT_FP32); 123 | 124 | GEN(Ymm, "vpshufb", (g->vpshufb(dst, src, src)), false, OT_INT); 125 | } 126 | 127 | if 
(info.have_fma) { 128 | GEN(Ymm, "vfmaps", (g->vfmadd132ps(dst, src, src)), false, OT_FP32); 129 | GEN(Ymm, "vfmapd", (g->vfmadd132pd(dst, src, src)), false, OT_FP64); 130 | GEN(Xmm, "vfmaps", (g->vfmadd132ps(dst, src, src)), false, OT_FP32); 131 | GEN(Xmm, "vfmapd", (g->vfmadd132pd(dst, src, src)), false, OT_FP64); 132 | } 133 | } -------------------------------------------------------------------------------- /avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | /* Builds the Ymm operand that has the same register index as `zmm`. */ 3 | static Xbyak::Ymm 4 | zmm_to_ymm(Xbyak::Zmm zmm) 5 | { 6 | return Xbyak::Ymm(zmm.getIdx()); 7 | } 8 | /* Issues the GEN cases for AVX-512 instructions; each group is guarded by the matching info.have_avx512* flag. */ 9 | void test_avx512() { 10 | if (info.have_avx512f) { 11 | GEN(Zmm, "vaddps", (g->vaddps(dst, src, src)), false, OT_FP32); 12 | GEN(Zmm, "vaddpd", (g->vaddpd(dst, src, src)), false, OT_FP64); 13 | GEN(Zmm, "vorps", (g->vorps(dst, src, src)), false, OT_FP32); 14 | GEN(Zmm, "vorpd", (g->vorpd(dst, src, src)), false, OT_FP64); 15 | GEN(Zmm, "vorps reg, reg, [mem]", (g->vorps(dst, src, g->ptr[g->rdx])), false, OT_FP32); 16 | GEN(Zmm, "vfmaps", (g->vfmadd132ps(dst, src, src)), false, OT_FP32); 17 | GEN(Zmm, "vfmapd", (g->vfmadd132pd(dst, src, src)), false, OT_FP64); 18 | GEN(Zmm, "vfmaps reg, reg, [mem]", (g->vfmadd132ps(dst, src, g->ptr[g->rdx])), false, OT_FP32); /* single precision, matching the label and OT_FP32 (was vfmadd132pd) */ 19 | GEN(Zmm, "vpexpandd", (g->vpexpandd(dst, src)), false, OT_FP32); 20 | GEN(Zmm, "vplzcntq", (g->vplzcntq(dst, src)), false, OT_FP32); /* emit the instruction the label names (was a second vpexpandd) */ 21 | GEN(Zmm, "vpconflictd", (g->vpconflictd(dst, src)), false, OT_FP32); 22 | GEN(Zmm, "vpermt2d", (g->vpermt2d(dst, src, src)), false, OT_FP32); 23 | GEN(Zmm, "vshufps", (g->vshufps(dst, src, src, 0)), false, OT_FP32); 24 | GEN(Zmm, "vrcp14pd", (g->vrcp14pd(dst, src)), false, OT_FP32); 25 | GEN(Zmm, "vpternlogd", (g->vpternlogd(dst, src, src, 0)), false, OT_FP32); 26 | 27 | } 28 | 29 | if (info.have_avx512er) { 30 | GEN(Zmm, "vrcp28pd", (g->vrcp28pd(dst, src)), false, OT_FP32); 31 | } 32 | 33 | if (info.have_avx512vnni) { 34 | GEN(Ymm, 
"vpdpwssds", (g->vpdpwssds(dst, src, src)), false, OT_FP32); 35 | GEN(Ymm, "vpdpwssd", (g->vpdpwssd(dst, src, src)), false, OT_FP32); 36 | 37 | GEN(Zmm, "vpdpwssds", (g->vpdpwssds(dst, src, src)), false, OT_FP32); 38 | GEN(Zmm, "vpdpwssd", (g->vpdpwssd(dst, src, src)), false, OT_FP32); 39 | } 40 | 41 | if (info.have_avx512bf16) { 42 | //GEN(Ymm, "vcvtne2ps2bf16", (g->vcvtne2ps2bf16(dst, src)), false, OT_FP32); 43 | //GEN(Zmm, "vcvtne2ps2bf16", (g->vcvtne2ps2bf16(dst, src)), false, OT_FP32); 44 | } 45 | } -------------------------------------------------------------------------------- /bdw.csv: -------------------------------------------------------------------------------- 1 | Intel(R) Xeon(R) CPU D-1571 @ 1.30GHz 2 | "reg64","add","latency","1.005435e+00","9.945939e-01" 3 | "reg64","add","throughput","2.673515e-01","3.740394e+00" 4 | "reg64","lea","latency","1.005118e+00","9.949084e-01" 5 | "reg64","lea","throughput","5.030286e-01","1.987958e+00" 6 | "reg64","xor dst,dst","latency","2.673577e-01","3.740307e+00" 7 | "reg64","xor dst,dst","throughput","2.673202e-01","3.740833e+00" 8 | "reg64","xor","latency","2.668270e-01","3.747747e+00" 9 | "reg64","xor","throughput","2.669234e-01","3.746393e+00" 10 | "reg64","load","latency","5.018156e+00","1.992764e-01" 11 | "reg64","load","throughput","6.271042e-01","1.594631e+00" 12 | "reg64","crc32","latency","3.008892e+00","3.323482e-01" 13 | "reg64","crc32","throughput","1.004368e+00","9.956510e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","1.229579e+01","8.132862e-02" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.263294e+00","7.915815e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","2.005469e+01","4.986365e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.403139e+01","7.126875e-02" 18 | "m128","pxor","latency","2.792659e-01","3.580816e+00" 19 | "m128","pxor","throughput","2.789258e-01","3.585183e+00" 20 | "m128","padd","latency","1.001859e+00","9.981446e-01" 21 | 
"m128","padd","throughput","5.019989e-01","1.992036e+00" 22 | "m128","pmuldq","latency","5.007596e+00","1.996966e-01" 23 | "m128","pmuldq","throughput","1.002865e+00","9.971433e-01" 24 | "m128","loadps","throughput","5.020837e-01","1.991700e+00" 25 | "m128","loadps->movq","latency","7.012644e+00","1.425996e-01" 26 | "m128","movq->movq","latency","2.005775e+00","4.985604e-01" 27 | "m128","movq->movq","throughput","1.004190e+00","9.958275e-01" 28 | "m128","xorps","latency","2.786104e-01","3.589241e+00" 29 | "m128","xorps","throughput","2.790360e-01","3.583767e+00" 30 | "m128","addps","latency","3.006787e+00","3.325809e-01" 31 | "m128","addps","throughput","1.003402e+00","9.966099e-01" 32 | "m128","mulps","latency","3.006605e+00","3.326011e-01" 33 | "m128","mulps","throughput","5.009242e-01","1.996310e+00" 34 | "m128","divps","latency","1.101913e+01","9.075130e-02" 35 | "m128","divps","throughput","5.018023e+00","1.992817e-01" 36 | "m128","divpd","latency","1.001664e+01","9.983390e-02" 37 | "m128","divpd","throughput","8.021761e+00","1.246609e-01" 38 | "m128","rsqrtps","latency","5.007502e+00","1.997004e-01" 39 | "m128","rsqrtps","throughput","1.003823e+00","9.961916e-01" 40 | "m128","rcpps","latency","5.008220e+00","1.996717e-01" 41 | "m128","rcpps","throughput","1.003651e+00","9.963626e-01" 42 | "m128","blendps","latency","1.003578e+00","9.964348e-01" 43 | "m128","blendps","throughput","3.341694e-01","2.992494e+00" 44 | "m128","blendvps","latency","2.006539e+00","4.983705e-01" 45 | "m128","blendvps","throughput","2.005635e+00","4.985952e-01" 46 | "m128","pshufb","latency","1.002921e+00","9.970875e-01" 47 | "m128","pshufb","throughput","1.001989e+00","9.980154e-01" 48 | "m128","shufps","latency","1.002982e+00","9.970270e-01" 49 | "m128","shufps","throughput","1.002853e+00","9.971551e-01" 50 | "m128","pmullw","latency","5.007810e+00","1.996881e-01" 51 | "m128","pmullw","throughput","1.002626e+00","9.973805e-01" 52 | 
"m128","phaddd","latency","3.007200e+00","3.325353e-01" 53 | "m128","phaddd","throughput","2.005810e+00","4.985518e-01" 54 | "m128","haddps","latency","3.007363e+00","3.325172e-01" 55 | "m128","haddps","throughput","2.005387e+00","4.986569e-01" 56 | "m128","pinsrd","latency","2.007249e+00","4.981942e-01" 57 | "m128","pinsrd","throughput","2.005805e+00","4.985529e-01" 58 | "m128","pinsrd->pexr","latency","4.008767e+00","2.494533e-01" 59 | "m128","dpps","latency","1.202906e+01","8.313203e-02" 60 | "m128","dpps","throughput","2.321211e+00","4.308097e-01" 61 | "m128","cvtps2dq","latency","3.006614e+00","3.326000e-01" 62 | "m128","cvtps2dq","throughput","1.002633e+00","9.973737e-01" 63 | "reg64","popcnt","latency","3.006188e+00","3.326472e-01" 64 | "reg64","popcnt","throughput","1.003134e+00","9.968755e-01" 65 | "m128","aesenc","latency","7.010418e+00","1.426448e-01" 66 | "m128","aesenc","throughput","1.002947e+00","9.970614e-01" 67 | "m128","aesenclast","latency","7.011492e+00","1.426230e-01" 68 | "m128","aesenclast","throughput","1.002983e+00","9.970260e-01" 69 | "m128","aesdec","latency","7.009893e+00","1.426555e-01" 70 | "m128","aesdec","throughput","1.003703e+00","9.963104e-01" 71 | "m128","aesdeclast","latency","7.010402e+00","1.426452e-01" 72 | "m128","aesdeclast","throughput","1.002656e+00","9.973512e-01" 73 | "m256","movaps [mem]","latency","1.003582e+00","9.964310e-01" 74 | "m256","movaps [mem]","throughput","5.022303e-01","1.991118e+00" 75 | "m256","vmovdqu [mem+1]","latency","1.003402e+00","9.966090e-01" 76 | "m256","vmovdqu [mem+1]","throughput","5.015736e-01","1.993725e+00" 77 | "m256","vmovdqu [mem+63] (cross cache)","latency","1.004246e+00","9.957720e-01" 78 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.004481e+00","9.955392e-01" 79 | "m256","vmovdqu [mem+2MB-1] (cross page)","latency","3.105519e+01","3.220074e-02" 80 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","3.104532e+01","3.221098e-02" 81 | 
"m256","xorps","latency","2.791684e-01","3.582067e+00" 82 | "m256","xorps","throughput","2.790542e-01","3.583533e+00" 83 | "m256","mulps","latency","3.007174e+00","3.325381e-01" 84 | "m256","mulps","throughput","5.015511e-01","1.993815e+00" 85 | "m256","addps","latency","3.007763e+00","3.324730e-01" 86 | "m256","addps","throughput","1.003600e+00","9.964127e-01" 87 | "m256","divps","latency","1.719467e+01","5.815756e-02" 88 | "m256","divps","throughput","1.009360e+01","9.907269e-02" 89 | "m256","divpd","latency","1.908511e+01","5.239688e-02" 90 | "m256","divpd","throughput","1.616323e+01","6.186883e-02" 91 | "m256","rsqrtps","latency","7.013317e+00","1.425859e-01" 92 | "m256","rsqrtps","throughput","2.006882e+00","4.982853e-01" 93 | "m256","rcpps","latency","7.013340e+00","1.425854e-01" 94 | "m256","rcpps","throughput","2.004963e+00","4.987623e-01" 95 | "m256","sqrtps","latency","1.808487e+01","5.529484e-02" 96 | "m256","sqrtps","throughput","1.409339e+01","7.095526e-02" 97 | "m256","vperm2f128","latency","3.006833e+00","3.325758e-01" 98 | "m256","vperm2f128","throughput","1.002868e+00","9.971400e-01" 99 | "m256","pxor","latency","2.788529e-01","3.586120e+00" 100 | "m256","pxor","throughput","2.780111e-01","3.596979e+00" 101 | "m256","paddd","latency","1.002739e+00","9.972681e-01" 102 | "m256","paddd","throughput","5.009634e-01","1.996154e+00" 103 | "m256","vpermps","latency","3.008312e+00","3.324123e-01" 104 | "m256","vpermps","throughput","1.003438e+00","9.965739e-01" 105 | "m256","vpermpd","latency","3.006010e+00","3.326669e-01" 106 | "m256","vpermpd","throughput","1.002575e+00","9.974315e-01" 107 | "m256","vpmovsxwd","latency","3.006030e+00","3.326647e-01" 108 | "m256","vpmovsxwd","throughput","1.003455e+00","9.965573e-01" 109 | "m256","vpgatherdd","latency","2.115726e+01","4.726510e-02" 110 | "m256","vpgatherdd","throughput","6.039028e+00","1.655896e-01" 111 | "m256","gather32(x8 + perm)","latency","1.703472e+01","5.870364e-02" 112 | "m256","gather32(x8 + 
perm)","throughput","8.018219e+00","1.247160e-01" 113 | "m256","vgatherdpd","latency","1.819990e+01","5.494535e-02" 114 | "m256","vgatherdpd","throughput","5.037717e+00","1.985026e-01" 115 | "m256","gather64(x4 + perm)","latency","1.217406e+01","8.214183e-02" 116 | "m256","gather64(x4 + perm)","throughput","4.007869e+00","2.495092e-01" 117 | "m256","vpshufb","latency","1.006102e+00","9.939352e-01" 118 | "m256","vpshufb","throughput","1.003021e+00","9.969881e-01" 119 | "m256","vfmaps","latency","5.008734e+00","1.996512e-01" 120 | "m256","vfmaps","throughput","5.014875e-01","1.994068e+00" 121 | "m256","vfmapd","latency","5.008636e+00","1.996551e-01" 122 | "m256","vfmapd","throughput","5.010047e-01","1.995989e+00" 123 | "m128","vfmaps","latency","5.008727e+00","1.996515e-01" 124 | "m128","vfmaps","throughput","5.016058e-01","1.993597e+00" 125 | "m128","vfmapd","latency","5.008312e+00","1.996681e-01" 126 | "m128","vfmapd","throughput","5.009094e-01","1.996369e+00" 127 | -------------------------------------------------------------------------------- /bdw.log: -------------------------------------------------------------------------------- 1 | Intel(R) Xeon(R) CPU D-1571 @ 1.30GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.01, IPC= 0.99 4 | reg64: add:throughput: CPI= 0.27, IPC= 3.75 5 | reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.50, IPC= 1.99 7 | reg64: xor dst,dst: latency: CPI= 0.27, IPC= 3.75 8 | reg64: xor dst,dst:throughput: CPI= 0.27, IPC= 3.75 9 | reg64: xor: latency: CPI= 0.27, IPC= 3.75 10 | reg64: xor:throughput: CPI= 0.27, IPC= 3.75 11 | reg64: load: latency: CPI= 5.02, IPC= 0.20 12 | reg64: load:throughput: CPI= 0.63, IPC= 1.59 13 | reg64: crc32: latency: CPI= 3.01, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.00, IPC= 1.00 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 12.31, IPC= 0.08 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 1.26, IPC= 0.79 17 | reg64: store [mem+0]->load[mem+1]: 
latency: CPI= 20.05, IPC= 0.05 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 14.03, IPC= 0.07 19 | m128: pxor: latency: CPI= 0.28, IPC= 3.57 20 | m128: pxor:throughput: CPI= 0.28, IPC= 3.58 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.50, IPC= 1.99 23 | m128: pmuldq: latency: CPI= 5.01, IPC= 0.20 24 | m128: pmuldq:throughput: CPI= 1.00, IPC= 1.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 1.99 26 | m128: loadps->movq: latency: CPI= 7.01, IPC= 0.14 27 | m128: movq->movq: latency: CPI= 2.01, IPC= 0.50 28 | m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.28, IPC= 3.59 30 | m128: xorps:throughput: CPI= 0.28, IPC= 3.60 31 | m128: addps: latency: CPI= 3.01, IPC= 0.33 32 | m128: addps:throughput: CPI= 1.00, IPC= 1.00 33 | m128: mulps: latency: CPI= 3.01, IPC= 0.33 34 | m128: mulps:throughput: CPI= 0.50, IPC= 2.00 35 | m128: divps: latency: CPI= 11.02, IPC= 0.09 36 | m128: divps:throughput: CPI= 5.01, IPC= 0.20 37 | m128: divpd: latency: CPI= 10.02, IPC= 0.10 38 | m128: divpd:throughput: CPI= 8.03, IPC= 0.12 39 | m128: rsqrtps: latency: CPI= 5.01, IPC= 0.20 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 5.01, IPC= 0.20 42 | m128: rcpps:throughput: CPI= 1.00, IPC= 1.00 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.33, IPC= 2.99 45 | m128: blendvps: latency: CPI= 2.01, IPC= 0.50 46 | m128: blendvps:throughput: CPI= 2.01, IPC= 0.50 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 1.00, IPC= 1.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 1.00, IPC= 1.00 51 | m128: pmullw: latency: CPI= 5.01, IPC= 0.20 52 | m128: pmullw:throughput: CPI= 1.00, IPC= 1.00 53 | m128: phaddd: latency: CPI= 3.01, IPC= 0.33 54 | m128: phaddd:throughput: CPI= 2.01, IPC= 0.50 55 | m128: haddps: latency: CPI= 3.01, IPC= 0.33 56 | m128: haddps:throughput: CPI= 2.01, 
IPC= 0.50 57 | m128: pinsrd: latency: CPI= 2.01, IPC= 0.50 58 | m128: pinsrd:throughput: CPI= 2.01, IPC= 0.50 59 | m128: pinsrd->pexr: latency: CPI= 4.01, IPC= 0.25 60 | m128: dpps: latency: CPI= 12.03, IPC= 0.08 61 | m128: dpps:throughput: CPI= 2.32, IPC= 0.43 62 | m128: cvtps2dq: latency: CPI= 3.01, IPC= 0.33 63 | m128: cvtps2dq:throughput: CPI= 1.00, IPC= 1.00 64 | reg64: popcnt: latency: CPI= 3.01, IPC= 0.33 65 | reg64: popcnt:throughput: CPI= 1.00, IPC= 1.00 66 | m128: aesenc: latency: CPI= 7.01, IPC= 0.14 67 | m128: aesenc:throughput: CPI= 1.00, IPC= 1.00 68 | m128: aesenclast: latency: CPI= 7.01, IPC= 0.14 69 | m128: aesenclast:throughput: CPI= 1.00, IPC= 1.00 70 | m128: aesdec: latency: CPI= 7.01, IPC= 0.14 71 | m128: aesdec:throughput: CPI= 1.00, IPC= 1.00 72 | m128: aesdeclast: latency: CPI= 7.01, IPC= 0.14 73 | m128: aesdeclast:throughput: CPI= 1.00, IPC= 1.00 74 | m256: movaps [mem]: latency: CPI= 1.01, IPC= 0.99 75 | m256: movaps [mem]:throughput: CPI= 0.50, IPC= 1.99 76 | m256: vmovdqu [mem+1]: latency: CPI= 1.00, IPC= 1.00 77 | m256: vmovdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 78 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.00, IPC= 1.00 79 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 80 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 31.05, IPC= 0.03 81 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 31.05, IPC= 0.03 82 | m256: xorps: latency: CPI= 0.28, IPC= 3.60 83 | m256: xorps:throughput: CPI= 0.28, IPC= 3.59 84 | m256: mulps: latency: CPI= 3.01, IPC= 0.33 85 | m256: mulps:throughput: CPI= 0.50, IPC= 1.99 86 | m256: addps: latency: CPI= 3.01, IPC= 0.33 87 | m256: addps:throughput: CPI= 1.00, IPC= 1.00 88 | m256: divps: latency: CPI= 17.20, IPC= 0.06 89 | m256: divps:throughput: CPI= 10.09, IPC= 0.10 90 | m256: divpd: latency: CPI= 19.09, IPC= 0.05 91 | m256: divpd:throughput: CPI= 16.16, IPC= 0.06 92 | m256: rsqrtps: latency: CPI= 7.01, IPC= 0.14 93 | m256: rsqrtps:throughput: CPI= 
2.01, IPC= 0.50 94 | m256: rcpps: latency: CPI= 7.01, IPC= 0.14 95 | m256: rcpps:throughput: CPI= 2.00, IPC= 0.50 96 | m256: sqrtps: latency: CPI= 18.08, IPC= 0.06 97 | m256: sqrtps:throughput: CPI= 14.09, IPC= 0.07 98 | m256: vperm2f128: latency: CPI= 3.01, IPC= 0.33 99 | m256: vperm2f128:throughput: CPI= 1.00, IPC= 1.00 100 | m256: pxor: latency: CPI= 0.28, IPC= 3.60 101 | m256: pxor:throughput: CPI= 0.28, IPC= 3.58 102 | m256: paddd: latency: CPI= 1.00, IPC= 1.00 103 | m256: paddd:throughput: CPI= 0.50, IPC= 2.00 104 | m256: vpermps: latency: CPI= 3.00, IPC= 0.33 105 | m256: vpermps:throughput: CPI= 1.00, IPC= 1.00 106 | m256: vpermpd: latency: CPI= 3.01, IPC= 0.33 107 | m256: vpermpd:throughput: CPI= 1.00, IPC= 1.00 108 | m256: vpmovsxwd: latency: CPI= 3.01, IPC= 0.33 109 | m256: vpmovsxwd:throughput: CPI= 1.00, IPC= 1.00 110 | m256: vpgatherdd: latency: CPI= 21.16, IPC= 0.05 111 | m256: vpgatherdd:throughput: CPI= 6.04, IPC= 0.17 112 | m256: gather32(x8 + perm): latency: CPI= 17.03, IPC= 0.06 113 | m256: gather32(x8 + perm):throughput: CPI= 8.02, IPC= 0.12 114 | m256: vgatherdpd: latency: CPI= 18.20, IPC= 0.05 115 | m256: vgatherdpd:throughput: CPI= 5.04, IPC= 0.20 116 | m256: gather64(x4 + perm): latency: CPI= 12.17, IPC= 0.08 117 | m256: gather64(x4 + perm):throughput: CPI= 4.01, IPC= 0.25 118 | m256: vpshufb: latency: CPI= 1.00, IPC= 1.00 119 | m256: vpshufb:throughput: CPI= 1.00, IPC= 1.00 120 | m256: vfmaps: latency: CPI= 5.01, IPC= 0.20 121 | m256: vfmaps:throughput: CPI= 0.50, IPC= 1.99 122 | m256: vfmapd: latency: CPI= 5.01, IPC= 0.20 123 | m256: vfmapd:throughput: CPI= 0.50, IPC= 2.00 124 | m128: vfmaps: latency: CPI= 5.01, IPC= 0.20 125 | m128: vfmaps:throughput: CPI= 0.50, IPC= 1.99 126 | m128: vfmapd: latency: CPI= 5.01, IPC= 0.20 127 | m128: vfmapd:throughput: CPI= 0.50, IPC= 2.00 128 | -------------------------------------------------------------------------------- /bench.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "common.hpp" 6 | 7 | bool output_csv = false; 8 | FILE *logs; 9 | cpuinfo info; 10 | 11 | /* x64 regisuter usage 12 | * http://msdn.microsoft.com/en-US/library/9z1stfyw(v=vs.80).aspx 13 | * RAX Volatile Return value register 14 | * RCX Volatile First integer argument 15 | * RDX Volatile Second integer argument 16 | * R8 Volatile Third integer argument 17 | * R9 Volatile Fourth integer argument 18 | * R10:R11 Volatile Must be preserved as needed by caller; used in syscall/sysret instructions 19 | * R12:R15 Nonvolatile Must be preserved by callee 20 | * RDI Nonvolatile Must be preserved by callee 21 | * RSI Nonvolatile Must be preserved by callee 22 | * RBX Nonvolatile Must be preserved by callee 23 | * RBP Nonvolatile May be used as a frame pointer; must be preserved by callee 24 | * RSP Nonvolatile Stack pointer 25 | * XMM0 Volatile First FP argument 26 | * XMM1 Volatile Second FP argument 27 | * XMM2 Volatile Third FP argument 28 | * XMM3 Volatile Fourth FP argument 29 | * XMM4:XMM5 Volatile Must be preserved as needed by caller 30 | * XMM6:XMM15 Nonvolatile Must be preserved as needed by callee. 
31 | */ 32 | 33 | #ifdef __linux 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | static int 42 | perf_event_open(struct perf_event_attr *hw_event, pid_t pid, 43 | int cpu, int group_fd, unsigned long flags ) 44 | { 45 | int ret; 46 | 47 | ret = syscall( __NR_perf_event_open, hw_event, pid, cpu, 48 | group_fd, flags ); 49 | return ret; 50 | } 51 | 52 | int perf_fd; 53 | 54 | static void 55 | cycle_counter_init(void) 56 | { 57 | struct perf_event_attr attr; 58 | memset(&attr, 0, sizeof(attr)); 59 | 60 | attr.type = PERF_TYPE_HARDWARE; 61 | attr.size = sizeof(attr); 62 | attr.config = PERF_COUNT_HW_CPU_CYCLES; 63 | attr.exclude_kernel = 1; 64 | 65 | perf_fd = perf_event_open(&attr, 0, -1, -1, 0); 66 | if (perf_fd == -1) { 67 | perror("perf_event_open"); 68 | exit(1); 69 | } 70 | } 71 | 72 | #else 73 | 74 | #define cycle_counter_init() ((void)0) 75 | 76 | #endif 77 | 78 | char MIE_ALIGN(2048*1024) zero_mem[4096*1024]; 79 | char MIE_ALIGN(2048*1024) data_mem[4096*1024]; 80 | 81 | int 82 | main(int argc, char **argv) 83 | { 84 | if (argc >= 2) { 85 | if (strcmp(argv[1],"--csv") == 0) { 86 | output_csv = true; 87 | } 88 | } 89 | 90 | cycle_counter_init(); 91 | 92 | #ifdef _WIN32 93 | #define x_cpuid(p,eax) __cpuid(p, eax) 94 | typedef int cpuid_t; 95 | #else 96 | #define x_cpuid(p,eax) __get_cpuid(eax, &(p)[0], &(p)[1], &(p)[2], &(p)[3]); 97 | typedef unsigned int cpuid_t; 98 | #endif 99 | 100 | #ifdef _WIN32 101 | std::string path = "logs/w32/"; 102 | #else 103 | std::string path = "logs/linux/"; 104 | 105 | #endif 106 | 107 | { 108 | cpuid_t data[4*3+1]; 109 | char data_nospace[4*3*4+1]; 110 | 111 | x_cpuid(data+4*0, 0x80000002); 112 | x_cpuid(data+4*1, 0x80000003); 113 | x_cpuid(data+4*2, 0x80000004); 114 | data[12] = 0; 115 | puts((char*)data); 116 | 117 | char *d0 = (char*)data; 118 | int out = 0; 119 | 120 | for (int i=0; i<4*3*4; i++) { 121 | if (d0[i] != ' ') { 122 | data_nospace[out++] = d0[i]; 123 | } 124 | } 125 | 
data_nospace[out] = '\0'; 126 | 127 | path += data_nospace; 128 | path += ".csv"; 129 | } 130 | 131 | logs = fopen(path.c_str(), "wb"); 132 | if (logs == NULL) { 133 | perror(path.c_str()); 134 | return 1; 135 | } 136 | fprintf(logs, 137 | "class,inst,l/t,cpi,ipc\n"); 138 | 139 | if (!output_csv) { 140 | printf("== latency/throughput ==\n"); 141 | } 142 | 143 | { 144 | int reg[4]; 145 | 146 | #ifdef _WIN32 147 | __cpuidex(reg, 7, 0); 148 | #else 149 | __cpuid_count(7, 0, reg[0], reg[1], reg[2], reg[3]); 150 | #endif 151 | 152 | if (reg[1] & (1<<5)) { 153 | info.have_avx2 = true; 154 | } 155 | 156 | if (reg[1] & (1<<16)) { 157 | info.have_avx512f = true; 158 | } 159 | 160 | if (reg[1] & (1<<27)) { 161 | info.have_avx512er = true; 162 | } 163 | 164 | #ifdef _WIN32 165 | __cpuid(reg, 1); 166 | #else 167 | __cpuid(1, reg[0], reg[1], reg[2], reg[3]); 168 | #endif 169 | if (reg[2] & (1<<1)) { 170 | info.have_pclmulqdq = true; 171 | } 172 | 173 | if (reg[2] & (1<<12)) { 174 | info.have_fma = true; 175 | } 176 | 177 | 178 | if (reg[2] & (1<<20)) { 179 | info.have_sse42 = true; 180 | } 181 | 182 | if (reg[2] & (1<<28)) { 183 | info.have_avx = true; 184 | } 185 | 186 | if (reg[2] & (1<<23)) { 187 | info.have_popcnt = true; 188 | } 189 | 190 | if (reg[2] & (1<<25)) { 191 | info.have_aes = true; 192 | } 193 | 194 | if (info.have_avx512f) { 195 | if (reg[2] & (1<<11)) { 196 | info.have_avx512vnni = true; 197 | } 198 | } 199 | 200 | #ifdef _WIN32 201 | __cpuidex(reg, 7, 1); 202 | #else 203 | __cpuid_count(7, 1, reg[0], reg[1], reg[2], reg[3]); 204 | #endif 205 | 206 | if (reg[0] & (1<<5)) { 207 | info.have_avx512bf16 = true; 208 | } 209 | 210 | } 211 | 212 | test_generic(); 213 | test_sse(); 214 | test_avx(); 215 | test_avx512(); 216 | 217 | if (info.have_popcnt) { 218 | GEN(Reg64, "popcnt", (g->popcnt(dst, src)), false, OT_INT); 219 | } 220 | 221 | if (info.have_aes) { 222 | GEN(Xmm, "aesenc", (g->aesenc(dst,src)), false, OT_INT); 223 | GEN(Xmm, "aesenclast", 
#!/usr/bin/python3
"""Compare two benchmark result CSV files and print them side by side.

Both inputs are CSV files with the columns ``class``, ``inst``, ``l/t``,
``cpi`` and ``ipc``.  Rows whose ``l/t`` column is ``latency`` go into the
latency table; every other row goes into the throughput table.
"""

import csv
import sys

# Placeholder printed when a value or a ratio cannot be shown.
_NA = 'N/A'

# Column caption line shared by the latency and throughput tables.
_COLUMN_HEADER = "%8s %32s | %7s%8s (%6s[%%]), %7s%8s (%6s[%%])" % (
    ' ', 'instruction', 'IPC', '', 'rel', 'CPI', '', 'rel')

# Horizontal rule printed under the column captions.
_RULE = "------------------------------------------+---------------------------------------------------------"


def usage():
    """Print the command-line synopsis and exit with status 1."""
    print("usage : compare-result.py a.csv b.csv")
    sys.exit(1)


class Result:
    """Rows of one CSV file, split by measurement kind.

    ``latency`` and ``throughput`` both map ``(class, inst)`` tuples to the
    row dictionary produced by ``csv.DictReader``.
    """

    def __init__(self):
        self.latency = {}
        self.throughput = {}


def load_csv(path, l_inst, t_inst):
    """Read one result file.

    path   -- CSV file to read.
    l_inst -- dict updated in place: every latency key seen is set to True.
    t_inst -- dict updated in place: every throughput key seen is set to True.

    Returns a ``Result`` holding the rows of this file.  ``l_inst`` and
    ``t_inst`` are shared between calls so that they end up holding the
    union of keys of all loaded files.
    """
    result = Result()

    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            key = (row['class'], row['inst'])

            if row['l/t'] == 'latency':
                result.latency[key] = row
                l_inst[key] = True
            else:
                result.throughput[key] = row
                t_inst[key] = True

    return result


def _metric(row, column):
    """Return ``row[column]`` as a float, or None when the row is missing."""
    if not row:
        return None
    return float(row[column])


def _format_pair(left, right):
    """Format ``left-right (rel[%])`` for one metric.

    ``left`` / ``right`` are floats or None.  The relative change is
    ``(left / right - 1) * 100``; it is shown as N/A when either side is
    missing or when ``right`` is zero (the division would be undefined).
    """
    left_txt = ("%7s" % _NA) if left is None else ("%7.2f" % left)
    right_txt = ("%-7s" % _NA) if right is None else ("%-7.2f" % right)

    if left is None or right is None or right == 0:
        rel_txt = "%6s" % _NA
    else:
        rel_txt = "%6.1f" % (((left / right) - 1) * 100)

    return "%s-%s (%s[%%])" % (left_txt, right_txt, rel_txt)


def dump_row(l_row, r_row, clas, inst):
    """Print one comparison line for instruction ``inst`` of class ``clas``.

    l_row / r_row -- row dictionaries of the left and right file, or None
                     when that file has no entry.  Nothing is printed when
                     both are missing.
    """
    if not l_row and not r_row:
        return

    ipc_txt = _format_pair(_metric(l_row, 'ipc'), _metric(r_row, 'ipc'))
    cpi_txt = _format_pair(_metric(l_row, 'cpi'), _metric(r_row, 'cpi'))

    print("%8s %32s | %s, %s" % (clas, inst, ipc_txt, cpi_txt))


def _print_header(banner):
    """Print a table banner followed by the column captions and a rule."""
    print(banner)
    print(_COLUMN_HEADER)
    print(_RULE)


def main():
    """Load the two files named on the command line and print both tables."""
    if len(sys.argv) < 3:
        usage()

    left_path = sys.argv[1]
    right_path = sys.argv[2]

    # Union of the keys seen in either file, per measurement kind.
    latency_keys = {}
    throughput_keys = {}

    left = load_csv(left_path, latency_keys, throughput_keys)
    right = load_csv(right_path, latency_keys, throughput_keys)

    _print_header("============= LATENCY ==============================================================================")
    for key in sorted(latency_keys):
        dump_row(left.latency.get(key), right.latency.get(key), key[0], key[1])

    print("\n")
    _print_header("============= THROUGHPUT ===========================================================================")
    for key in sorted(throughput_keys):
        dump_row(left.throughput.get(key), right.throughput.get(key), key[0], key[1])


if __name__ == '__main__':
    main()
r.throughput: 150 | r_row = r.throughput[i] 151 | 152 | dump_row(l_row, r_row, i[0], i[1]) 153 | 154 | 155 | 156 | if __name__ == '__main__': 157 | main() -------------------------------------------------------------------------------- /gen.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | 3 | void test_generic() 4 | { 5 | GEN(Reg64, "add", (g->add(dst, src)), false, OT_INT); 6 | GEN(Reg64, "lea", (g->lea(dst, g->ptr[src])), false, OT_INT); 7 | GEN(Reg64, "xor dst,dst", (g->xor_(dst, dst)), false, OT_INT); 8 | GEN(Reg64, "xor", (g->xor_(dst, src)), false, OT_INT); 9 | GEN(Reg64, "load", (g->mov(dst, g->ptr[src + g->rdx])), false, OT_INT); 10 | GEN(Reg64, "crc32", (g->crc32(dst, src)), false, OT_INT); 11 | 12 | GEN(Reg64, "store [mem+0]->load[mem+0]", 13 | (g->mov(g->ptr[src+g->rdx],g->rdi)) ; (g->mov(dst, g->ptr[g->rdx])), 14 | false, OT_INT); 15 | 16 | GEN(Reg64, "store [mem+0]->load[mem+1]", 17 | (g->mov(g->ptr[src+g->rdx],g->rdi)) ; (g->mov(dst, g->ptr[g->rdx + 1])), 18 | false, OT_INT); 19 | 20 | GEN(Xmm, "pxor", (g->pxor(dst, src)), false, OT_INT); 21 | GEN(Xmm, "padd", (g->paddd(dst, src)), false, OT_INT); 22 | GEN(Xmm, "pmuldq", (g->pmuldq(dst, src)), false, OT_INT); 23 | } -------------------------------------------------------------------------------- /glm.log: -------------------------------------------------------------------------------- 1 | Intel(R) Celeron(R) CPU N3450 @ 1.10GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.01, IPC= 0.99 4 | reg64: add:throughput: CPI= 0.35, IPC= 2.84 5 | reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.35, IPC= 2.84 7 | reg64: xor dst,dst: latency: CPI= 0.35, IPC= 2.84 8 | reg64: xor dst,dst:throughput: CPI= 0.35, IPC= 2.84 9 | reg64: xor: latency: CPI= 0.35, IPC= 2.84 10 | reg64: xor:throughput: CPI= 0.35, IPC= 2.84 11 | reg64: load: latency: CPI= 3.00, IPC= 0.33 12 | reg64: load:throughput: CPI= 1.00, IPC= 
1.00 13 | reg64: crc32: latency: CPI= 3.00, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.25, IPC= 0.80 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 5.00, IPC= 0.20 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 1.09, IPC= 0.91 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 13.02, IPC= 0.08 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 9.34, IPC= 0.11 19 | m128: pxor: latency: CPI= 0.50, IPC= 2.00 20 | m128: pxor:throughput: CPI= 0.50, IPC= 2.00 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.50, IPC= 2.00 23 | m128: pmuldq: latency: CPI= 4.00, IPC= 0.25 24 | m128: pmuldq:throughput: CPI= 1.00, IPC= 1.00 25 | m128: loadps:throughput: CPI= 1.00, IPC= 1.00 26 | m128: loadps->movq: latency: CPI= 8.00, IPC= 0.12 27 | m128: movq->movq: latency: CPI= 8.00, IPC= 0.12 28 | m128: movq->movq:throughput: CPI= 2.25, IPC= 0.44 29 | m128: xorps: latency: CPI= 0.50, IPC= 2.00 30 | m128: xorps:throughput: CPI= 0.50, IPC= 2.00 31 | m128: addps: latency: CPI= 3.00, IPC= 0.33 32 | m128: addps:throughput: CPI= 1.00, IPC= 1.00 33 | m128: mulps: latency: CPI= 4.00, IPC= 0.25 34 | m128: mulps:throughput: CPI= 1.00, IPC= 1.00 35 | m128: divps: latency: CPI= 20.00, IPC= 0.05 36 | m128: divps:throughput: CPI= 19.00, IPC= 0.05 37 | m128: divpd: latency: CPI= 20.00, IPC= 0.05 38 | m128: divpd:throughput: CPI= 19.00, IPC= 0.05 39 | m128: rsqrtps: latency: CPI= 9.00, IPC= 0.11 40 | m128: rsqrtps:throughput: CPI= 6.03, IPC= 0.17 41 | m128: rcpps: latency: CPI= 9.00, IPC= 0.11 42 | m128: rcpps:throughput: CPI= 6.03, IPC= 0.17 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.50, IPC= 2.00 45 | m128: blendvps: latency: CPI= 4.03, IPC= 0.25 46 | m128: blendvps:throughput: CPI= 5.97, IPC= 0.17 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 1.00, IPC= 1.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 0.50, IPC= 2.00 51 
| m128: pmullw: latency: CPI= 4.00, IPC= 0.25 52 | m128: pmullw:throughput: CPI= 1.00, IPC= 1.00 53 | m128: phaddd: latency: CPI= 4.08, IPC= 0.24 54 | m128: phaddd:throughput: CPI= 4.03, IPC= 0.25 55 | m128: haddps: latency: CPI= 4.03, IPC= 0.25 56 | m128: haddps:throughput: CPI= 4.03, IPC= 0.25 57 | m128: pinsrd: latency: CPI= 1.03, IPC= 0.97 58 | m128: pinsrd:throughput: CPI= 1.03, IPC= 0.97 59 | m128: pinsrd->pextr: latency: CPI= 8.00, IPC= 0.12 60 | m128: dpps: latency: CPI= 13.73, IPC= 0.07 61 | m128: dpps:throughput: CPI= 10.03, IPC= 0.10 62 | m128: cvtps2dq: latency: CPI= 4.00, IPC= 0.25 63 | m128: cvtps2dq:throughput: CPI= 1.00, IPC= 1.00 64 | m128: pmovmskb:throughput: CPI= 1.26, IPC= 0.79 65 | m128: pmovmskb->movq: latency: CPI= 8.00, IPC= 0.12 66 | m128: movq->movq: latency: CPI= 8.00, IPC= 0.12 67 | m128: movaps [mem]: latency: CPI= 8.00, IPC= 0.12 68 | m128: movaps [mem]:throughput: CPI= 1.00, IPC= 1.00 69 | m128: movdqu [mem+1]: latency: CPI= 8.00, IPC= 0.12 70 | m128: movdqu [mem+1]:throughput: CPI= 1.00, IPC= 1.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 14.00, IPC= 0.07 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 2.00, IPC= 0.50 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 17.35, IPC= 0.06 74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 12.35, IPC= 0.08 75 | m128: pcmpistri:throughput: CPI= 8.00, IPC= 0.12 76 | m128: pcmpistri->movq: latency: CPI= 18.00, IPC= 0.06 77 | m128: pcmpistrm:throughput: CPI= 7.00, IPC= 0.14 78 | m128: pcmpistrm: latency: CPI= 12.00, IPC= 0.08 79 | m128: pcmpestri:throughput: CPI= 13.00, IPC= 0.08 80 | m128: pcmpestri->movq: latency: CPI= 23.00, IPC= 0.04 81 | m128: pcmpestrm:throughput: CPI= 14.03, IPC= 0.07 82 | m128: pcmpestrm: latency: CPI= 17.02, IPC= 0.06 83 | reg64: popcnt: latency: CPI= 3.00, IPC= 0.33 84 | reg64: popcnt:throughput: CPI= 1.25, IPC= 0.80 85 | m128: aesenc: latency: CPI= 6.00, IPC= 0.17 86 | m128: aesenc:throughput: CPI= 2.00, IPC= 0.50 87 | m128: 
aesenclast: latency: CPI= 6.00, IPC= 0.17 88 | m128: aesenclast:throughput: CPI= 2.00, IPC= 0.50 89 | m128: aesdec: latency: CPI= 6.00, IPC= 0.17 90 | m128: aesdec:throughput: CPI= 2.00, IPC= 0.50 91 | m128: aesdeclast: latency: CPI= 6.00, IPC= 0.17 92 | m128: aesdeclast:throughput: CPI= 2.00, IPC= 0.50 93 | m128: pclmulqdq: latency: CPI= 6.76, IPC= 0.15 94 | m128: pclmulqdq:throughput: CPI= 4.03, IPC= 0.25 95 | -------------------------------------------------------------------------------- /hsw.log: -------------------------------------------------------------------------------- 1 | Intel(R) Core(TM) i7-4700MQ CPU @ 2.40GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.01, IPC= 0.99 4 | reg64: add:throughput: CPI= 0.27, IPC= 3.69 5 | reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.50, IPC= 1.99 7 | reg64: xor dst,dst: latency: CPI= 0.27, IPC= 3.76 8 | reg64: xor dst,dst:throughput: CPI= 0.27, IPC= 3.71 9 | reg64: xor: latency: CPI= 0.27, IPC= 3.76 10 | reg64: xor:throughput: CPI= 0.27, IPC= 3.76 11 | reg64: load: latency: CPI= 5.00, IPC= 0.20 12 | reg64: load:throughput: CPI= 0.63, IPC= 1.59 13 | reg64: crc32: latency: CPI= 3.00, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.00, IPC= 1.00 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 12.20, IPC= 0.08 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 1.27, IPC= 0.79 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 20.01, IPC= 0.05 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 14.01, IPC= 0.07 19 | m128: pxor: latency: CPI= 0.28, IPC= 3.60 20 | m128: pxor:throughput: CPI= 0.28, IPC= 3.60 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.50, IPC= 2.00 23 | m128: pmuldq: latency: CPI= 5.00, IPC= 0.20 24 | m128: pmuldq:throughput: CPI= 1.00, IPC= 1.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 2.00 26 | m128: loadps->movq: latency: CPI= 7.01, IPC= 0.14 27 | m128: movq->movq: latency: CPI= 2.00, IPC= 0.50 28 | 
m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.28, IPC= 3.60 30 | m128: xorps:throughput: CPI= 0.28, IPC= 3.60 31 | m128: addps: latency: CPI= 3.00, IPC= 0.33 32 | m128: addps:throughput: CPI= 1.00, IPC= 1.00 33 | m128: mulps: latency: CPI= 5.00, IPC= 0.20 34 | m128: mulps:throughput: CPI= 0.50, IPC= 1.98 35 | m128: divps: latency: CPI= 10.02, IPC= 0.10 36 | m128: divps:throughput: CPI= 7.04, IPC= 0.14 37 | m128: divpd: latency: CPI= 10.02, IPC= 0.10 38 | m128: divpd:throughput: CPI= 8.03, IPC= 0.12 39 | m128: rsqrtps: latency: CPI= 5.00, IPC= 0.20 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 5.00, IPC= 0.20 42 | m128: rcpps:throughput: CPI= 1.00, IPC= 1.00 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.34, IPC= 2.97 45 | m128: blendvps: latency: CPI= 2.00, IPC= 0.50 46 | m128: blendvps:throughput: CPI= 2.00, IPC= 0.50 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 1.00, IPC= 1.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 1.00, IPC= 1.00 51 | m128: pmullw: latency: CPI= 5.00, IPC= 0.20 52 | m128: pmullw:throughput: CPI= 1.00, IPC= 1.00 53 | m128: phaddd: latency: CPI= 3.00, IPC= 0.33 54 | m128: phaddd:throughput: CPI= 2.00, IPC= 0.50 55 | m128: haddps: latency: CPI= 3.00, IPC= 0.33 56 | m128: haddps:throughput: CPI= 2.00, IPC= 0.50 57 | m128: pinsrd: latency: CPI= 2.00, IPC= 0.50 58 | m128: pinsrd:throughput: CPI= 2.00, IPC= 0.50 59 | m128: pinsrd->pexr: latency: CPI= 4.00, IPC= 0.25 60 | m128: dpps: latency: CPI= 14.01, IPC= 0.07 61 | m128: dpps:throughput: CPI= 2.00, IPC= 0.50 62 | m128: cvtps2dq: latency: CPI= 3.00, IPC= 0.33 63 | m128: cvtps2dq:throughput: CPI= 1.00, IPC= 1.00 64 | reg64: popcnt: latency: CPI= 3.00, IPC= 0.33 65 | reg64: popcnt:throughput: CPI= 1.00, IPC= 1.00 66 | m128: aesenc: latency: CPI= 7.00, IPC= 0.14 67 | m128: aesenc:throughput: CPI= 1.02, 
IPC= 0.98 68 | m128: aesenclast: latency: CPI= 7.00, IPC= 0.14 69 | m128: aesenclast:throughput: CPI= 1.01, IPC= 0.99 70 | m128: aesdec: latency: CPI= 7.01, IPC= 0.14 71 | m128: aesdec:throughput: CPI= 1.00, IPC= 1.00 72 | m128: aesdeclast: latency: CPI= 7.01, IPC= 0.14 73 | m128: aesdeclast:throughput: CPI= 1.00, IPC= 1.00 74 | m256: movaps [mem]: latency: CPI= 1.00, IPC= 1.00 75 | m256: movaps [mem]:throughput: CPI= 0.50, IPC= 1.99 76 | m256: vmovdqu [mem+1]: latency: CPI= 1.00, IPC= 1.00 77 | m256: vmovdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 78 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.00, IPC= 1.00 79 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 80 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 31.02, IPC= 0.03 81 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 31.02, IPC= 0.03 82 | m256: xorps: latency: CPI= 0.28, IPC= 3.60 83 | m256: xorps:throughput: CPI= 0.28, IPC= 3.60 84 | m256: mulps: latency: CPI= 5.00, IPC= 0.20 85 | m256: mulps:throughput: CPI= 0.50, IPC= 2.00 86 | m256: addps: latency: CPI= 3.01, IPC= 0.33 87 | m256: addps:throughput: CPI= 1.00, IPC= 1.00 88 | m256: divps: latency: CPI= 18.10, IPC= 0.06 89 | m256: divps:throughput: CPI= 14.26, IPC= 0.07 90 | m256: divpd: latency: CPI= 19.07, IPC= 0.05 91 | m256: divpd:throughput: CPI= 16.32, IPC= 0.06 92 | m256: rsqrtps: latency: CPI= 7.01, IPC= 0.14 93 | m256: rsqrtps:throughput: CPI= 2.00, IPC= 0.50 94 | m256: rcpps: latency: CPI= 7.01, IPC= 0.14 95 | m256: rcpps:throughput: CPI= 2.00, IPC= 0.50 96 | m256: sqrtps: latency: CPI= 18.10, IPC= 0.06 97 | m256: sqrtps:throughput: CPI= 14.10, IPC= 0.07 98 | m256: vperm2f128: latency: CPI= 3.00, IPC= 0.33 99 | m256: vperm2f128:throughput: CPI= 1.00, IPC= 1.00 100 | m256: pxor: latency: CPI= 0.28, IPC= 3.60 101 | m256: pxor:throughput: CPI= 0.28, IPC= 3.56 102 | m256: paddd: latency: CPI= 1.00, IPC= 1.00 103 | m256: paddd:throughput: CPI= 0.50, IPC= 2.00 104 | m256: vpermps: latency: CPI= 
3.00, IPC= 0.33 105 | m256: vpermps:throughput: CPI= 1.00, IPC= 1.00 106 | m256: vpermpd: latency: CPI= 3.00, IPC= 0.33 107 | m256: vpermpd:throughput: CPI= 1.00, IPC= 1.00 108 | m256: vpmovsxwd: latency: CPI= 3.00, IPC= 0.33 109 | m256: vpmovsxwd:throughput: CPI= 1.00, IPC= 1.00 110 | m256: vpgatherdd: latency: CPI= 20.14, IPC= 0.05 111 | m256: vpgatherdd:throughput: CPI= 11.09, IPC= 0.09 112 | m256: gather32(x8 + perm): latency: CPI= 17.01, IPC= 0.06 113 | m256: gather32(x8 + perm):throughput: CPI= 8.02, IPC= 0.12 114 | m256: vgatherdpd: latency: CPI= 15.15, IPC= 0.07 115 | m256: vgatherdpd:throughput: CPI= 8.09, IPC= 0.12 116 | m256: gather64(x4 + perm): latency: CPI= 12.02, IPC= 0.08 117 | m256: gather64(x4 + perm):throughput: CPI= 4.00, IPC= 0.25 118 | m256: vpshufb: latency: CPI= 1.00, IPC= 1.00 119 | m256: vpshufb:throughput: CPI= 1.00, IPC= 1.00 120 | m256: vfmaps: latency: CPI= 5.00, IPC= 0.20 121 | m256: vfmaps:throughput: CPI= 0.50, IPC= 2.00 122 | m256: vfmapd: latency: CPI= 5.00, IPC= 0.20 123 | m256: vfmapd:throughput: CPI= 0.50, IPC= 2.00 124 | m128: vfmaps: latency: CPI= 5.00, IPC= 0.20 125 | m128: vfmaps:throughput: CPI= 0.50, IPC= 2.00 126 | m128: vfmapd: latency: CPI= 5.00, IPC= 0.20 127 | m128: vfmapd:throughput: CPI= 0.50, IPC= 2.00 128 | -------------------------------------------------------------------------------- /knl.log: -------------------------------------------------------------------------------- 1 | Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.08, IPC= 0.93 4 | reg64: add:throughput: CPI= 0.51, IPC= 1.96 5 | reg64: lea: latency: CPI= 1.02, IPC= 0.98 6 | reg64: lea:throughput: CPI= 0.51, IPC= 1.95 7 | reg64: xor dst,dst: latency: CPI= 1.07, IPC= 0.93 8 | reg64: xor dst,dst:throughput: CPI= 0.51, IPC= 1.96 9 | reg64: xor: latency: CPI= 1.07, IPC= 0.94 10 | reg64: xor:throughput: CPI= 0.51, IPC= 1.97 11 | reg64: load: latency: CPI= 4.03, IPC= 0.25 12 | reg64: load:throughput: 
CPI= 0.90, IPC= 1.12 13 | reg64: crc32: latency: CPI= 6.01, IPC= 0.17 14 | reg64: crc32:throughput: CPI= 5.34, IPC= 0.19 15 | m128: pxor: latency: CPI= 2.01, IPC= 0.50 16 | m128: pxor:throughput: CPI= 0.60, IPC= 1.68 17 | m128: padd: latency: CPI= 2.01, IPC= 0.50 18 | m128: padd:throughput: CPI= 0.59, IPC= 1.68 19 | m128: pmuldq: latency: CPI= 6.04, IPC= 0.17 20 | m128: pmuldq:throughput: CPI= 4.16, IPC= 0.24 21 | m128: loadps: latency: CPI= 9.06, IPC= 0.11 22 | m128: loadps:throughput: CPI= 0.56, IPC= 1.77 23 | m128: xorps: latency: CPI= 2.02, IPC= 0.50 24 | m128: xorps:throughput: CPI= 0.59, IPC= 1.68 25 | m128: addps: latency: CPI= 6.04, IPC= 0.17 26 | m128: addps:throughput: CPI= 0.60, IPC= 1.67 27 | m128: mulps: latency: CPI= 6.03, IPC= 0.17 28 | m128: mulps:throughput: CPI= 0.60, IPC= 1.67 29 | m128: divps: latency: CPI= 38.92, IPC= 0.03 30 | m128: divps:throughput: CPI= 16.16, IPC= 0.06 31 | m128: divpd: latency: CPI= 37.63, IPC= 0.03 32 | m128: divpd:throughput: CPI= 16.13, IPC= 0.06 33 | m128: rsqrtps: latency: CPI= 8.06, IPC= 0.12 34 | m128: rsqrtps:throughput: CPI= 3.02, IPC= 0.33 35 | m128: rcpps: latency: CPI= 8.06, IPC= 0.12 36 | m128: rcpps:throughput: CPI= 3.03, IPC= 0.33 37 | m128: blendps: latency: CPI= 6.01, IPC= 0.17 38 | m128: blendps:throughput: CPI= 4.17, IPC= 0.24 39 | m128: blendvps: latency: CPI= 10.10, IPC= 0.10 40 | m128: blendvps:throughput: CPI= 12.06, IPC= 0.08 41 | m128: pshufb: latency: CPI= 13.09, IPC= 0.08 42 | m128: pshufb:throughput: CPI= 11.11, IPC= 0.09 43 | m128: shufps: latency: CPI= 4.03, IPC= 0.25 44 | m128: shufps:throughput: CPI= 2.02, IPC= 0.50 45 | m128: pmullw: latency: CPI= 7.05, IPC= 0.14 46 | m128: pmullw:throughput: CPI= 2.03, IPC= 0.49 47 | m128: phaddd: latency: CPI= 11.10, IPC= 0.09 48 | m128: phaddd:throughput: CPI= 10.09, IPC= 0.10 49 | m128: haddps: latency: CPI= 11.11, IPC= 0.09 50 | m128: haddps:throughput: CPI= 10.09, IPC= 0.10 51 | m128: pinsrd:throughput: CPI= 4.17, IPC= 0.24 52 | m128: pinsrd->pexr: 
latency: CPI= 16.13, IPC= 0.06 53 | m128: dpps: latency: CPI= 36.24, IPC= 0.03 54 | m128: dpps:throughput: CPI= 15.14, IPC= 0.07 55 | m128: cvtps2dq: latency: CPI= 2.03, IPC= 0.49 56 | m128: cvtps2dq:throughput: CPI= 1.01, IPC= 0.99 57 | reg64: popcnt: latency: CPI= 3.08, IPC= 0.32 58 | reg64: popcnt:throughput: CPI= 0.98, IPC= 1.02 59 | m128: aesenc: latency: CPI= 6.02, IPC= 0.17 60 | m128: aesenc:throughput: CPI= 4.18, IPC= 0.24 61 | m128: aesenclast: latency: CPI= 6.02, IPC= 0.17 62 | m128: aesenclast:throughput: CPI= 4.17, IPC= 0.24 63 | m128: aesdec: latency: CPI= 6.01, IPC= 0.17 64 | m128: aesdec:throughput: CPI= 4.17, IPC= 0.24 65 | m128: aesdeclast: latency: CPI= 6.01, IPC= 0.17 66 | m128: aesdeclast:throughput: CPI= 4.17, IPC= 0.24 67 | m256: movaps [mem]: latency: CPI= 1.06, IPC= 0.94 68 | m256: movaps [mem]:throughput: CPI= 0.57, IPC= 1.77 69 | m256: vmovdqu [mem+1]: latency: CPI= 1.07, IPC= 0.94 70 | m256: vmovdqu [mem+1]:throughput: CPI= 0.57, IPC= 1.77 71 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.06, IPC= 0.94 72 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.01, IPC= 0.99 73 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 14.13, IPC= 0.07 74 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 14.13, IPC= 0.07 75 | m256: xorps: latency: CPI= 0.60, IPC= 1.68 76 | m256: xorps:throughput: CPI= 0.60, IPC= 1.68 77 | m256: mulps: latency: CPI= 6.05, IPC= 0.17 78 | m256: mulps:throughput: CPI= 0.59, IPC= 1.68 79 | m256: addps: latency: CPI= 6.04, IPC= 0.17 80 | m256: addps:throughput: CPI= 0.60, IPC= 1.68 81 | m256: divps: latency: CPI= 38.97, IPC= 0.03 82 | m256: divps:throughput: CPI= 16.16, IPC= 0.06 83 | m256: divpd: latency: CPI= 37.57, IPC= 0.03 84 | m256: divpd:throughput: CPI= 16.13, IPC= 0.06 85 | m256: rsqrtps: latency: CPI= 8.04, IPC= 0.12 86 | m256: rsqrtps:throughput: CPI= 3.02, IPC= 0.33 87 | m256: rcpps: latency: CPI= 8.04, IPC= 0.12 88 | m256: rcpps:throughput: CPI= 3.03, IPC= 0.33 89 | m256: sqrtps: 
latency: CPI= 38.28, IPC= 0.03 90 | m256: sqrtps:throughput: CPI= 16.15, IPC= 0.06 91 | m256: vperm2f128: latency: CPI= 4.03, IPC= 0.25 92 | m256: vperm2f128:throughput: CPI= 2.02, IPC= 0.49 93 | m256: pxor: latency: CPI= 0.60, IPC= 1.67 94 | m256: pxor:throughput: CPI= 0.60, IPC= 1.67 95 | m256: paddd: latency: CPI= 2.02, IPC= 0.50 96 | m256: paddd:throughput: CPI= 0.60, IPC= 1.68 97 | m256: vpermps: latency: CPI= 3.03, IPC= 0.33 98 | m256: vpermps:throughput: CPI= 1.01, IPC= 0.99 99 | m256: vpermpd: latency: CPI= 3.02, IPC= 0.33 100 | m256: vpermpd:throughput: CPI= 1.01, IPC= 0.99 101 | m256: vpmovsxwd: latency: CPI= 8.06, IPC= 0.12 102 | m256: vpmovsxwd:throughput: CPI= 7.07, IPC= 0.14 103 | m256: vpgatherdd: latency: CPI= 19.13, IPC= 0.05 104 | m256: vpgatherdd:throughput: CPI= 9.09, IPC= 0.11 105 | m256: gather32(x8 + perm): latency: CPI= 24.16, IPC= 0.04 106 | m256: gather32(x8 + perm):throughput: CPI= 8.06, IPC= 0.12 107 | m256: vgatherdpd: latency: CPI= 18.12, IPC= 0.06 108 | m256: vgatherdpd:throughput: CPI= 9.10, IPC= 0.11 109 | m256: gather64(x4 + perm): latency: CPI= 4.03, IPC= 0.25 110 | m256: gather64(x4 + perm):throughput: CPI= 4.03, IPC= 0.25 111 | m256: vfmaps: latency: CPI= 6.05, IPC= 0.17 112 | m256: vfmaps:throughput: CPI= 0.60, IPC= 1.68 113 | m256: vfmapd: latency: CPI= 6.06, IPC= 0.17 114 | m256: vfmapd:throughput: CPI= 0.60, IPC= 1.68 115 | m512: vfmaps: latency: CPI= 6.05, IPC= 0.17 116 | m512: vfmaps:throughput: CPI= 0.59, IPC= 1.69 117 | m512: vfmapd: latency: CPI= 6.04, IPC= 0.17 118 | m512: vfmapd:throughput: CPI= 0.60, IPC= 1.68 119 | m512: vfmaps reg, reg, [mem]: latency: CPI= 6.04, IPC= 0.17 120 | m512: vfmaps reg, reg, [mem]:throughput: CPI= 0.60, IPC= 1.67 121 | m512: vpexpandd: latency: CPI= 3.02, IPC= 0.33 122 | m512: vpexpandd:throughput: CPI= 1.01, IPC= 0.99 123 | m512: vplzcntq: latency: CPI= 3.02, IPC= 0.33 124 | m512: vplzcntq:throughput: CPI= 1.01, IPC= 0.99 125 | m512: vpconflictd: latency: CPI= 3.03, IPC= 0.33 126 | m512: 
vpconflictd:throughput: CPI= 1.02, IPC= 0.98 127 | m512: vpermt2d: latency: CPI= 4.03, IPC= 0.25 128 | m512: vpermt2d:throughput: CPI= 2.02, IPC= 0.50 129 | m512: vshufps: latency: CPI= 4.03, IPC= 0.25 130 | m512: vshufps:throughput: CPI= 2.01, IPC= 0.50 131 | m512: vrcp28pd: latency: CPI= 7.05, IPC= 0.14 132 | m512: vrcp28pd:throughput: CPI= 2.01, IPC= 0.50 133 | m512: vrcp14pd: latency: CPI= 7.05, IPC= 0.14 134 | m512: vrcp14pd:throughput: CPI= 2.01, IPC= 0.50 135 | m512: vpternlogd: latency: CPI= 2.01, IPC= 0.50 136 | m512: vpternlogd:throughput: CPI= 0.60, IPC= 1.68 137 | -------------------------------------------------------------------------------- /logs/linux/11thGenIntel(R)Core(TM)i5-1135G7@2.40GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000012e+00","9.999881e-01" 3 | "reg64","add","throughput","2.556185e-01","3.912081e+00" 4 | "reg64","lea","latency","1.000012e+00","9.999881e-01" 5 | "reg64","lea","throughput","2.557554e-01","3.909985e+00" 6 | "reg64","xor dst,dst","latency","2.031676e-01","4.922046e+00" 7 | "reg64","xor dst,dst","throughput","2.031581e-01","4.922274e+00" 8 | "reg64","xor","latency","2.031461e-01","4.922566e+00" 9 | "reg64","xor","throughput","2.031608e-01","4.922210e+00" 10 | "reg64","load","latency","5.000406e+00","1.999838e-01" 11 | "reg64","load","throughput","6.250165e-01","1.599958e+00" 12 | "reg64","crc32","latency","3.004097e+00","3.328788e-01" 13 | "reg64","crc32","throughput","1.000214e+00","9.997865e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","7.501003e+00","1.333155e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.254951e+00","7.968439e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","2.200145e+01","4.545154e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.700187e+01","5.881706e-02" 18 | "m128","pxor","latency","2.222684e-01","4.499065e+00" 19 | 
"m128","pxor","throughput","2.222566e-01","4.499305e+00" 20 | "m128","padd","latency","1.000717e+00","9.992838e-01" 21 | "m128","padd","throughput","3.333645e-01","2.999720e+00" 22 | "m128","pmuldq","latency","5.000292e+00","1.999883e-01" 23 | "m128","pmuldq","throughput","5.003967e-01","1.998414e+00" 24 | "m128","loadps","throughput","5.002694e-01","1.998923e+00" 25 | "m128","loadps->movq","latency","8.000911e+00","1.249858e-01" 26 | "m128","movq->movq","latency","4.000279e+00","2.499826e-01" 27 | "m128","movq->movq","throughput","1.000036e+00","9.999642e-01" 28 | "m128","xorps","latency","2.222780e-01","4.498872e+00" 29 | "m128","xorps","throughput","2.222672e-01","4.499090e+00" 30 | "m128","addps","latency","4.001110e+00","2.499307e-01" 31 | "m128","addps","throughput","5.000415e-01","1.999834e+00" 32 | "m128","mulps","latency","4.000150e+00","2.499906e-01" 33 | "m128","mulps","throughput","5.000288e-01","1.999885e+00" 34 | "m128","divps","latency","1.100171e+01","9.089493e-02" 35 | "m128","divps","throughput","3.000283e+00","3.333019e-01" 36 | "m128","divpd","latency","1.300201e+01","7.691119e-02" 37 | "m128","divpd","throughput","4.000372e+00","2.499768e-01" 38 | "m128","rsqrtps","latency","4.000625e+00","2.499610e-01" 39 | "m128","rsqrtps","throughput","1.000031e+00","9.999691e-01" 40 | "m128","rcpps","latency","4.000406e+00","2.499747e-01" 41 | "m128","rcpps","throughput","1.000042e+00","9.999585e-01" 42 | "m128","blendps","latency","1.000664e+00","9.993360e-01" 43 | "m128","blendps","throughput","3.333817e-01","2.999565e+00" 44 | "m128","blendvps","latency","1.000223e+00","9.997769e-01" 45 | "m128","blendvps","throughput","4.167423e-01","2.399564e+00" 46 | "m128","pshufb","latency","1.000520e+00","9.994804e-01" 47 | "m128","pshufb","throughput","5.000430e-01","1.999828e+00" 48 | "m128","shufps","latency","1.000881e+00","9.991194e-01" 49 | "m128","shufps","throughput","5.001625e-01","1.999350e+00" 50 | "m128","pmullw","latency","5.000871e+00","1.999651e-01" 
51 | "m128","pmullw","throughput","5.004084e-01","1.998368e+00" 52 | "m128","phaddd","latency","2.083235e+00","4.800226e-01" 53 | "m128","phaddd","throughput","1.070864e+00","9.338253e-01" 54 | "m128","haddps","latency","2.083128e+00","4.800474e-01" 55 | "m128","haddps","throughput","1.070873e+00","9.338175e-01" 56 | "m128","pinsrd","latency","1.088122e+00","9.190142e-01" 57 | "m128","pinsrd","throughput","1.083936e+00","9.225633e-01" 58 | "m128","pinsrd->pextr","latency","6.000528e+00","1.666520e-01" 59 | "m128","dpps","latency","1.400102e+01","7.142338e-02" 60 | "m128","dpps","throughput","4.028108e+00","2.482555e-01" 61 | "m128","cvtps2dq","latency","4.000224e+00","2.499860e-01" 62 | "m128","cvtps2dq","throughput","5.000500e-01","1.999800e+00" 63 | "m128","pmovmskb","throughput","1.000030e+00","9.999703e-01" 64 | "m128","pmovmskb->movq","latency","4.000147e+00","2.499908e-01" 65 | "m128","movq->movq","latency","4.000272e+00","2.499830e-01" 66 | "m128","movaps [mem]","latency","8.000663e+00","1.249896e-01" 67 | "m128","movaps [mem]","throughput","5.014066e-01","1.994390e+00" 68 | "m128","movdqu [mem+1]","latency","8.000915e+00","1.249857e-01" 69 | "m128","movdqu [mem+1]","throughput","5.000479e-01","1.999808e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400187e+01","7.141905e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000042e+00","9.999576e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.500276e+01","6.665440e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","4.421937e+00","2.261453e-01" 74 | "m128","pcmpistri","throughput","3.000619e+00","3.332645e-01" 75 | "m128","pcmpistri->movq","latency","1.200453e+01","8.330187e-02" 76 | "m128","pcmpistrm","throughput","3.000023e+00","3.333308e-01" 77 | "m128","pcmpistrm","latency","8.906077e+00","1.122829e-01" 78 | "m128","pcmpestri","throughput","4.028193e+00","2.482502e-01" 79 | "m128","pcmpestri->movq","latency","1.300086e+01","7.691797e-02" 80 | 
"m128","pcmpestrm","throughput","5.028006e+00","1.988860e-01" 81 | "m128","pcmpestrm","latency","1.079436e+01","9.264095e-02" 82 | "m256","movaps [mem]","throughput","5.000403e-01","1.999839e+00" 83 | "m256","movaps [mem] -> movq","latency","9.009906e+00","1.109890e-01" 84 | "m256","vmovdqu [mem+1]","throughput","5.002986e-01","1.998806e+00" 85 | "m256","vmovdqu [mem+1] -> movq","latency","9.000399e+00","1.111062e-01" 86 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.000034e+00","9.999663e-01" 87 | "m256","vmovdqu [mem+63] (cross cache) -> movq","latency","1.400137e+01","7.142157e-02" 88 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","4.400345e+00","2.272549e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page) -> movq","latency","1.600718e+01","6.247197e-02" 90 | "m256","vxorps","latency","2.222697e-01","4.499039e+00" 91 | "m256","vxorps","throughput","2.222585e-01","4.499266e+00" 92 | "m256","vmulps","latency","4.000867e+00","2.499458e-01" 93 | "m256","vmulps","throughput","5.000345e-01","1.999862e+00" 94 | "m256","vaddps","latency","4.000403e+00","2.499748e-01" 95 | "m256","vaddps","throughput","5.000269e-01","1.999892e+00" 96 | "m256","vdivps","latency","1.100874e+01","9.083693e-02" 97 | "m256","vdivps","throughput","5.000423e+00","1.999831e-01" 98 | "m256","vdivpd","latency","1.300109e+01","7.691665e-02" 99 | "m256","vdivpd","throughput","8.001340e+00","1.249791e-01" 100 | "m256","vrsqrtps","latency","4.009785e+00","2.493899e-01" 101 | "m256","vrsqrtps","throughput","1.002040e+00","9.979639e-01" 102 | "m256","vrcpps","latency","4.000269e+00","2.499832e-01" 103 | "m256","vrcpps","throughput","1.001389e+00","9.986127e-01" 104 | "m256","vsqrtps","latency","1.200040e+01","8.333056e-02" 105 | "m256","vsqrtps","throughput","6.000285e+00","1.666587e-01" 106 | "m256","vperm2f128","latency","3.000031e+00","3.333299e-01" 107 | "m256","vperm2f128","throughput","1.000017e+00","9.999830e-01" 108 | "m256","vpxor","latency","2.222642e-01","4.499150e+00" 109 
| "m256","vpxor","throughput","2.231992e-01","4.480303e+00" 110 | "m256","vpaddd","latency","1.000022e+00","9.999784e-01" 111 | "m256","vpaddd","throughput","3.333649e-01","2.999716e+00" 112 | "m256","vpermps","latency","3.000303e+00","3.332997e-01" 113 | "m256","vpermps","throughput","1.000034e+00","9.999661e-01" 114 | "m256","vpermpd","latency","3.000274e+00","3.333029e-01" 115 | "m256","vpermpd","throughput","1.000023e+00","9.999773e-01" 116 | "m256","vpblendvb","latency","2.000404e+00","4.998991e-01" 117 | "m256","vpblendvb","throughput","1.000042e+00","9.999583e-01" 118 | "m256","vpmovmskb","throughput","1.000024e+00","9.999758e-01" 119 | "m256","vpmovsxwd","latency","3.000685e+00","3.332572e-01" 120 | "m256","vpmovsxwd","throughput","1.000021e+00","9.999792e-01" 121 | "m256","vpgatherdd","latency","2.200211e+01","4.545018e-02" 122 | "m256","vpgatherdd","throughput","5.002063e+00","1.999175e-01" 123 | "m256","gather32(x8 + perm)","latency","1.636300e+01","6.111349e-02" 124 | "m256","gather32(x8 + perm)","throughput","4.000278e+00","2.499826e-01" 125 | "m256","vgatherdpd","latency","2.000926e+01","4.997687e-02" 126 | "m256","vgatherdpd","throughput","3.000183e+00","3.333130e-01" 127 | "m256","gather64(x4 + perm)","latency","1.245277e+01","8.030341e-02" 128 | "m256","gather64(x4 + perm)","throughput","2.000110e+00","4.999724e-01" 129 | "m256","vpshufb","latency","1.000088e+00","9.999121e-01" 130 | "m256","vpshufb","throughput","5.000280e-01","1.999888e+00" 131 | "m256","vfmaps","latency","4.000145e+00","2.499910e-01" 132 | "m256","vfmaps","throughput","5.000242e-01","1.999903e+00" 133 | "m256","vfmapd","latency","4.000841e+00","2.499475e-01" 134 | "m256","vfmapd","throughput","5.000191e-01","1.999924e+00" 135 | "m128","vfmaps","latency","4.002147e+00","2.498659e-01" 136 | "m128","vfmaps","throughput","5.014265e-01","1.994310e+00" 137 | "m128","vfmapd","latency","4.001518e+00","2.499052e-01" 138 | "m128","vfmapd","throughput","5.000231e-01","1.999908e+00" 139 | 
"m512","vaddps","latency","4.000508e+00","2.499683e-01" 140 | "m512","vaddps","throughput","1.000027e+00","9.999727e-01" 141 | "m512","vaddpd","latency","4.000145e+00","2.499909e-01" 142 | "m512","vaddpd","throughput","1.000033e+00","9.999674e-01" 143 | "m512","vorps","latency","1.000027e+00","9.999731e-01" 144 | "m512","vorps","throughput","5.000212e-01","1.999915e+00" 145 | "m512","vorpd","latency","1.000029e+00","9.999708e-01" 146 | "m512","vorpd","throughput","5.009081e-01","1.996374e+00" 147 | "m512","vorps reg, reg, [mem]","latency","1.000024e+00","9.999765e-01" 148 | "m512","vorps reg, reg, [mem]","throughput","5.004467e-01","1.998215e+00" 149 | "m512","vfmaps","latency","4.006269e+00","2.496088e-01" 150 | "m512","vfmaps","throughput","1.000514e+00","9.994868e-01" 151 | "m512","vfmapd","latency","4.000405e+00","2.499747e-01" 152 | "m512","vfmapd","throughput","1.000569e+00","9.994309e-01" 153 | "m512","vfmaps reg, reg, [mem]","latency","4.000274e+00","2.499829e-01" 154 | "m512","vfmaps reg, reg, [mem]","throughput","1.000033e+00","9.999674e-01" 155 | "m512","vpexpandd","latency","3.000275e+00","3.333028e-01" 156 | "m512","vpexpandd","throughput","2.000600e+00","4.998500e-01" 157 | "m512","vplzcntq","latency","3.000276e+00","3.333027e-01" 158 | "m512","vplzcntq","throughput","2.001058e+00","4.997357e-01" 159 | "m512","vpconflictd","latency","2.644490e+01","3.781448e-02" 160 | "m512","vpconflictd","throughput","1.939633e+01","5.155616e-02" 161 | "m512","vpermt2d","latency","3.000617e+00","3.332648e-01" 162 | "m512","vpermt2d","throughput","1.000019e+00","9.999809e-01" 163 | "m512","vshufps","latency","1.000071e+00","9.999286e-01" 164 | "m512","vshufps","throughput","1.000996e+00","9.990045e-01" 165 | "m512","vrcp14pd","latency","6.000232e+00","1.666602e-01" 166 | "m512","vrcp14pd","throughput","2.000177e+00","4.999557e-01" 167 | "m512","vpternlogd","latency","1.000021e+00","9.999790e-01" 168 | "m512","vpternlogd","throughput","5.002694e-01","1.998923e+00" 169 
| "m256","vpdpwssds","latency","5.000825e+00","1.999670e-01" 170 | "m256","vpdpwssds","throughput","5.006330e-01","1.997471e+00" 171 | "m256","vpdpwssd","latency","5.000394e+00","1.999842e-01" 172 | "m256","vpdpwssd","throughput","5.003736e-01","1.998507e+00" 173 | "m512","vpdpwssds","latency","5.001241e+00","1.999504e-01" 174 | "m512","vpdpwssds","throughput","1.000029e+00","9.999705e-01" 175 | "m512","vpdpwssd","latency","5.000382e+00","1.999847e-01" 176 | "m512","vpdpwssd","throughput","1.000406e+00","9.995943e-01" 177 | "reg64","popcnt","latency","3.001043e+00","3.332175e-01" 178 | "reg64","popcnt","throughput","1.000796e+00","9.992043e-01" 179 | "m128","aesenc","latency","7.000413e+00","1.428487e-01" 180 | "m128","aesenc","throughput","1.000249e+00","9.997510e-01" 181 | "m128","aesenclast","latency","7.001016e+00","1.428364e-01" 182 | "m128","aesenclast","throughput","1.000038e+00","9.999625e-01" 183 | "m128","aesdec","latency","7.000900e+00","1.428388e-01" 184 | "m128","aesdec","throughput","1.000390e+00","9.996100e-01" 185 | "m128","aesdeclast","latency","7.000788e+00","1.428411e-01" 186 | "m128","aesdeclast","throughput","1.000036e+00","9.999642e-01" 187 | "m128","pclmulqdq","latency","6.000985e+00","1.666393e-01" 188 | "m128","pclmulqdq","throughput","2.000534e+00","4.998664e-01" 189 | -------------------------------------------------------------------------------- /logs/linux/12thGenIntel(R)Core(TM)i7-12700K.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000297e+00","9.997033e-01" 3 | "reg64","add","throughput","2.125294e-01","4.705231e+00" 4 | "reg64","lea","latency","1.718954e-01","5.817492e+00" 5 | "reg64","lea","throughput","1.718993e-01","5.817359e+00" 6 | "reg64","xor dst,dst","latency","1.697102e-01","5.892399e+00" 7 | "reg64","xor dst,dst","throughput","1.694911e-01","5.900016e+00" 8 | "reg64","xor","latency","1.793859e-01","5.574574e+00" 9 | 
"reg64","xor","throughput","1.717637e-01","5.821953e+00" 10 | "reg64","load","latency","5.003329e+00","1.998669e-01" 11 | "reg64","load","throughput","6.251451e-01","1.599629e+00" 12 | "reg64","crc32","latency","3.001437e+00","3.331737e-01" 13 | "reg64","crc32","throughput","1.000009e+00","9.999906e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","7.426915e+00","1.346454e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.241793e+00","8.052871e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","2.004522e+01","4.988721e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.500030e+01","6.666534e-02" 18 | "m128","pxor","latency","2.063372e-01","4.846436e+00" 19 | "m128","pxor","throughput","1.805962e-01","5.537214e+00" 20 | "m128","padd","latency","1.000741e+00","9.992596e-01" 21 | "m128","padd","throughput","3.334391e-01","2.999049e+00" 22 | "m128","pmuldq","latency","5.000553e+00","1.999779e-01" 23 | "m128","pmuldq","throughput","5.000229e-01","1.999908e+00" 24 | "m128","loadps","throughput","5.033739e-01","1.986595e+00" 25 | "m128","loadps->movq","latency","8.001011e+00","1.249842e-01" 26 | "m128","movq->movq","latency","4.001302e+00","2.499186e-01" 27 | "m128","movq->movq","throughput","1.000235e+00","9.997648e-01" 28 | "m128","xorps","latency","1.713344e-01","5.836540e+00" 29 | "m128","xorps","throughput","1.806024e-01","5.537025e+00" 30 | "m128","addps","latency","2.000382e+00","4.999044e-01" 31 | "m128","addps","throughput","5.000322e-01","1.999871e+00" 32 | "m128","mulps","latency","4.033630e+00","2.479157e-01" 33 | "m128","mulps","throughput","5.002833e-01","1.998867e+00" 34 | "m128","divps","latency","1.102414e+01","9.070998e-02" 35 | "m128","divps","throughput","3.000101e+00","3.333221e-01" 36 | "m128","divpd","latency","1.300096e+01","7.691740e-02" 37 | "m128","divpd","throughput","4.000167e+00","2.499896e-01" 38 | "m128","rsqrtps","latency","4.000464e+00","2.499710e-01" 39 | 
"m128","rsqrtps","throughput","1.000018e+00","9.999822e-01" 40 | "m128","rcpps","latency","4.000090e+00","2.499944e-01" 41 | "m128","rcpps","throughput","1.025604e+00","9.750356e-01" 42 | "m128","blendps","latency","1.000268e+00","9.997320e-01" 43 | "m128","blendps","throughput","3.333662e-01","2.999704e+00" 44 | "m128","blendvps","latency","1.000036e+00","9.999644e-01" 45 | "m128","blendvps","throughput","3.611535e-01","2.768906e+00" 46 | "m128","pshufb","latency","1.000036e+00","9.999644e-01" 47 | "m128","pshufb","throughput","5.000218e-01","1.999913e+00" 48 | "m128","shufps","latency","1.000025e+00","9.999754e-01" 49 | "m128","shufps","throughput","5.000206e-01","1.999918e+00" 50 | "m128","pmullw","latency","5.000160e+00","1.999936e-01" 51 | "m128","pmullw","throughput","5.000218e-01","1.999913e+00" 52 | "m128","phaddd","latency","2.072281e+00","4.825601e-01" 53 | "m128","phaddd","throughput","1.053396e+00","9.493108e-01" 54 | "m128","haddps","latency","2.072739e+00","4.824534e-01" 55 | "m128","haddps","throughput","1.053440e+00","9.492713e-01" 56 | "m128","pinsrd","latency","1.055546e+00","9.473772e-01" 57 | "m128","pinsrd","throughput","1.057414e+00","9.457032e-01" 58 | "m128","pinsrd->pextr","latency","6.006960e+00","1.664736e-01" 59 | "m128","dpps","latency","1.400069e+01","7.142507e-02" 60 | "m128","dpps","throughput","4.028213e+00","2.482490e-01" 61 | "m128","cvtps2dq","latency","4.000091e+00","2.499943e-01" 62 | "m128","cvtps2dq","throughput","5.289453e-01","1.890555e+00" 63 | "m128","pmovmskb","throughput","1.000474e+00","9.995261e-01" 64 | "m128","pmovmskb->movq","latency","4.000358e+00","2.499776e-01" 65 | "m128","movq->movq","latency","4.029157e+00","2.481908e-01" 66 | "m128","movaps [mem]","latency","8.030424e+00","1.245264e-01" 67 | "m128","movaps [mem]","throughput","5.033970e-01","1.986504e+00" 68 | "m128","movdqu [mem+1]","latency","8.027951e+00","1.245648e-01" 69 | "m128","movdqu [mem+1]","throughput","5.033304e-01","1.986766e+00" 70 | 
"m128","movdqu [mem+63] (cross cache)","latency","1.500082e+01","6.666301e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.064669e+00","9.392594e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.500086e+01","6.666285e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","3.310987e+00","3.020247e-01" 74 | "m128","pcmpistri","throughput","3.000102e+00","3.333220e-01" 75 | "m128","pcmpistri->movq","latency","1.200067e+01","8.332867e-02" 76 | "m128","pcmpistrm","throughput","3.000479e+00","3.332801e-01" 77 | "m128","pcmpistrm","latency","8.795205e+00","1.136983e-01" 78 | "m128","pcmpestri","throughput","4.046442e+00","2.471307e-01" 79 | "m128","pcmpestri->movq","latency","1.299891e+01","7.692953e-02" 80 | "m128","pcmpestrm","throughput","5.110236e+00","1.956857e-01" 81 | "m128","pcmpestrm","latency","1.070724e+01","9.339476e-02" 82 | "m256","movaps [mem]","throughput","5.032779e-01","1.986974e+00" 83 | "m256","movaps [mem] -> movq","latency","9.000714e+00","1.111023e-01" 84 | "m256","vmovdqu [mem+1]","throughput","5.033188e-01","1.986812e+00" 85 | "m256","vmovdqu [mem+1] -> movq","latency","9.000474e+00","1.111053e-01" 86 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.064051e+00","9.398045e-01" 87 | "m256","vmovdqu [mem+63] (cross cache) -> movq","latency","1.600078e+01","6.249693e-02" 88 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","3.309666e+00","3.021453e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page) -> movq","latency","1.600064e+01","6.249751e-02" 90 | "m256","vxorps","latency","2.563364e-01","3.901123e+00" 91 | "m256","vxorps","throughput","1.805975e-01","5.537175e+00" 92 | "m256","vmulps","latency","4.000170e+00","2.499894e-01" 93 | "m256","vmulps","throughput","5.000208e-01","1.999917e+00" 94 | "m256","vaddps","latency","2.046626e+00","4.886091e-01" 95 | "m256","vaddps","throughput","5.003257e-01","1.998698e+00" 96 | "m256","vdivps","latency","1.102704e+01","9.068620e-02" 97 | 
"m256","vdivps","throughput","5.028982e+00","1.988474e-01" 98 | "m256","vdivpd","latency","1.302524e+01","7.677399e-02" 99 | "m256","vdivpd","throughput","8.001091e+00","1.249830e-01" 100 | "m256","vrsqrtps","latency","4.023346e+00","2.485494e-01" 101 | "m256","vrsqrtps","throughput","1.000483e+00","9.995170e-01" 102 | "m256","vrcpps","latency","4.000350e+00","2.499781e-01" 103 | "m256","vrcpps","throughput","1.000023e+00","9.999773e-01" 104 | "m256","vsqrtps","latency","1.200070e+01","8.332845e-02" 105 | "m256","vsqrtps","throughput","6.000417e+00","1.666551e-01" 106 | "m256","vperm2f128","latency","3.000432e+00","3.332854e-01" 107 | "m256","vperm2f128","throughput","1.000015e+00","9.999847e-01" 108 | "m256","vpxor","latency","1.805918e-01","5.537350e+00" 109 | "m256","vpxor","throughput","1.805844e-01","5.537578e+00" 110 | "m256","vpaddd","latency","1.000021e+00","9.999794e-01" 111 | "m256","vpaddd","throughput","3.333615e-01","2.999746e+00" 112 | "m256","vpermps","latency","3.001713e+00","3.331431e-01" 113 | "m256","vpermps","throughput","1.000520e+00","9.994802e-01" 114 | "m256","vpermpd","latency","3.000170e+00","3.333144e-01" 115 | "m256","vpermpd","throughput","1.000387e+00","9.996127e-01" 116 | "m256","vpblendvb","latency","3.009427e+00","3.322892e-01" 117 | "m256","vpblendvb","throughput","1.033340e+00","9.677359e-01" 118 | "m256","vpmovmskb","throughput","1.000075e+00","9.999248e-01" 119 | "m256","vpmovsxwd","latency","3.000538e+00","3.332736e-01" 120 | "m256","vpmovsxwd","throughput","1.000146e+00","9.998538e-01" 121 | "m256","vpgatherdd","latency","2.400084e+01","4.166520e-02" 122 | "m256","vpgatherdd","throughput","3.033261e+00","3.296782e-01" 123 | "m256","gather32(x8 + perm)","latency","1.624801e+01","6.154600e-02" 124 | "m256","gather32(x8 + perm)","throughput","3.500381e+00","2.856832e-01" 125 | "m256","vgatherdpd","latency","2.300064e+01","4.347705e-02" 126 | "m256","vgatherdpd","throughput","2.001638e+00","4.995909e-01" 127 | "m256","gather64(x4 
+ perm)","latency","1.242894e+01","8.045739e-02" 128 | "m256","gather64(x4 + perm)","throughput","1.501580e+00","6.659650e-01" 129 | "m256","vpshufb","latency","1.000326e+00","9.996746e-01" 130 | "m256","vpshufb","throughput","5.000170e-01","1.999932e+00" 131 | "m256","vfmaps","latency","4.000091e+00","2.499943e-01" 132 | "m256","vfmaps","throughput","5.000157e-01","1.999937e+00" 133 | "m256","vfmapd","latency","4.006047e+00","2.496226e-01" 134 | "m256","vfmapd","throughput","5.000280e-01","1.999888e+00" 135 | "m128","vfmaps","latency","4.000302e+00","2.499812e-01" 136 | "m128","vfmaps","throughput","5.000208e-01","1.999917e+00" 137 | "m128","vfmapd","latency","4.000153e+00","2.499904e-01" 138 | "m128","vfmapd","throughput","5.000424e-01","1.999830e+00" 139 | "reg64","popcnt","latency","3.015148e+00","3.316587e-01" 140 | "reg64","popcnt","throughput","1.000010e+00","9.999897e-01" 141 | "m128","aesenc","latency","3.000300e+00","3.333000e-01" 142 | "m128","aesenc","throughput","5.000570e-01","1.999772e+00" 143 | "m128","aesenclast","latency","3.034904e+00","3.294997e-01" 144 | "m128","aesenclast","throughput","5.001204e-01","1.999519e+00" 145 | "m128","aesdec","latency","3.000024e+00","3.333307e-01" 146 | "m128","aesdec","throughput","5.000403e-01","1.999839e+00" 147 | "m128","aesdeclast","latency","3.035157e+00","3.294723e-01" 148 | "m128","aesdeclast","throughput","5.003471e-01","1.998612e+00" 149 | "m128","pclmulqdq","latency","3.000084e+00","3.333240e-01" 150 | "m128","pclmulqdq","throughput","1.000106e+00","9.998936e-01" 151 | -------------------------------------------------------------------------------- /logs/linux/AMDRyzen52400GwithRadeonVegaGraphics.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000156e+00","9.998441e-01" 3 | "reg64","add","throughput","3.983821e-01","2.510153e+00" 4 | "reg64","lea","latency","1.000091e+00","9.999088e-01" 5 | 
"reg64","lea","throughput","2.578309e-01","3.878512e+00" 6 | "reg64","xor dst,dst","latency","2.578326e-01","3.878485e+00" 7 | "reg64","xor dst,dst","throughput","2.578307e-01","3.878513e+00" 8 | "reg64","xor","latency","2.579951e-01","3.876042e+00" 9 | "reg64","xor","throughput","2.578340e-01","3.878465e+00" 10 | "reg64","load","latency","4.000354e+00","2.499779e-01" 11 | "reg64","load","throughput","6.252091e-01","1.599465e+00" 12 | "reg64","crc32","latency","3.015986e+00","3.315666e-01" 13 | "reg64","crc32","throughput","3.015822e+00","3.315846e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","3.177721e+01","3.146909e-02" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","2.745000e+00","3.642988e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","3.700222e+01","2.702541e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.700382e+01","5.881030e-02" 18 | "m128","pxor","latency","2.502956e-01","3.995275e+00" 19 | "m128","pxor","throughput","2.500424e-01","3.999322e+00" 20 | "m128","padd","latency","1.000023e+00","9.999767e-01" 21 | "m128","padd","throughput","3.333605e-01","2.999756e+00" 22 | "m128","pmuldq","latency","3.000340e+00","3.332956e-01" 23 | "m128","pmuldq","throughput","1.000018e+00","9.999816e-01" 24 | "m128","loadps","throughput","5.000331e-01","1.999868e+00" 25 | "m128","loadps->movq","latency","9.097036e+00","1.099259e-01" 26 | "m128","movq->movq","latency","6.001968e+00","1.666120e-01" 27 | "m128","movq->movq","throughput","1.000389e+00","9.996111e-01" 28 | "m128","xorps","latency","2.500454e-01","3.999274e+00" 29 | "m128","xorps","throughput","2.500517e-01","3.999173e+00" 30 | "m128","addps","latency","3.000175e+00","3.333139e-01" 31 | "m128","addps","throughput","5.000303e-01","1.999879e+00" 32 | "m128","mulps","latency","3.000439e+00","3.332846e-01" 33 | "m128","mulps","throughput","5.000231e-01","1.999908e+00" 34 | "m128","divps","latency","1.000086e+01","9.999137e-02" 35 | 
"m128","divps","throughput","3.000362e+00","3.332931e-01" 36 | "m128","divpd","latency","8.000488e+00","1.249924e-01" 37 | "m128","divpd","throughput","4.000357e+00","2.499777e-01" 38 | "m128","rsqrtps","latency","5.000958e+00","1.999617e-01" 39 | "m128","rsqrtps","throughput","1.000324e+00","9.996765e-01" 40 | "m128","rcpps","latency","5.000295e+00","1.999882e-01" 41 | "m128","rcpps","throughput","1.000326e+00","9.996739e-01" 42 | "m128","blendps","latency","1.000319e+00","9.996812e-01" 43 | "m128","blendps","throughput","5.000288e-01","1.999885e+00" 44 | "m128","blendvps","latency","1.000019e+00","9.999809e-01" 45 | "m128","blendvps","throughput","5.000379e-01","1.999848e+00" 46 | "m128","pshufb","latency","1.000025e+00","9.999746e-01" 47 | "m128","pshufb","throughput","5.000271e-01","1.999891e+00" 48 | "m128","shufps","latency","1.000023e+00","9.999769e-01" 49 | "m128","shufps","throughput","5.004787e-01","1.998087e+00" 50 | "m128","pmullw","latency","3.000161e+00","3.333154e-01" 51 | "m128","pmullw","throughput","1.000022e+00","9.999780e-01" 52 | "m128","phaddd","latency","2.000191e+00","4.999524e-01" 53 | "m128","phaddd","throughput","2.000461e+00","4.998847e-01" 54 | "m128","haddps","latency","2.000181e+00","4.999547e-01" 55 | "m128","haddps","throughput","2.000041e+00","4.999898e-01" 56 | "m128","pinsrd","latency","1.637952e+00","6.105185e-01" 57 | "m128","pinsrd","throughput","1.310634e+00","7.629898e-01" 58 | "m128","pinsrd->pexr","latency","8.000570e+00","1.249911e-01" 59 | "m128","dpps","latency","1.500072e+01","6.666346e-02" 60 | "m128","dpps","throughput","4.000471e+00","2.499706e-01" 61 | "m128","cvtps2dq","latency","4.000338e+00","2.499789e-01" 62 | "m128","cvtps2dq","throughput","1.000405e+00","9.995947e-01" 63 | "reg64","popcnt","latency","1.000016e+00","9.999844e-01" 64 | "reg64","popcnt","throughput","2.578347e-01","3.878454e+00" 65 | "m128","aesenc","latency","4.000161e+00","2.499900e-01" 66 | 
"m128","aesenc","throughput","5.000280e-01","1.999888e+00" 67 | "m128","aesenclast","latency","4.000314e+00","2.499804e-01" 68 | "m128","aesenclast","throughput","5.000318e-01","1.999873e+00" 69 | "m128","aesdec","latency","4.000164e+00","2.499897e-01" 70 | "m128","aesdec","throughput","5.000295e-01","1.999882e+00" 71 | "m128","aesdeclast","latency","4.000485e+00","2.499697e-01" 72 | "m128","aesdeclast","throughput","5.004705e-01","1.998120e+00" 73 | "m256","movaps [mem]","latency","1.000345e+00","9.996551e-01" 74 | "m256","movaps [mem]","throughput","1.000315e+00","9.996854e-01" 75 | "m256","vmovdqu [mem+1]","latency","1.500044e+00","6.666470e-01" 76 | "m256","vmovdqu [mem+1]","throughput","1.500181e+00","6.665861e-01" 77 | "m256","vmovdqu [mem+63] (cross cache)","latency","1.500570e+00","6.664135e-01" 78 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.500042e+00","6.666478e-01" 79 | "m256","vmovdqu [mem+2MB-1] (cross page)","latency","1.500412e+00","6.664835e-01" 80 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","1.500045e+00","6.666469e-01" 81 | "m256","xorps","latency","5.000407e-01","1.999837e+00" 82 | "m256","xorps","throughput","5.005629e-01","1.997751e+00" 83 | "m256","mulps","latency","3.000166e+00","3.333149e-01" 84 | "m256","mulps","throughput","1.000021e+00","9.999788e-01" 85 | "m256","addps","latency","3.000290e+00","3.333011e-01" 86 | "m256","addps","throughput","1.000328e+00","9.996725e-01" 87 | "m256","divps","latency","1.000058e+01","9.999418e-02" 88 | "m256","divps","throughput","6.000354e+00","1.666568e-01" 89 | "m256","divpd","latency","8.000800e+00","1.249875e-01" 90 | "m256","divpd","throughput","8.000667e+00","1.249896e-01" 91 | "m256","rsqrtps","latency","5.000446e+00","1.999822e-01" 92 | "m256","rsqrtps","throughput","2.000303e+00","4.999243e-01" 93 | "m256","rcpps","latency","5.000389e+00","1.999844e-01" 94 | "m256","rcpps","throughput","2.001960e+00","4.995104e-01" 95 | 
"m256","sqrtps","latency","8.003994e+00","1.249376e-01" 96 | "m256","sqrtps","throughput","8.000455e+00","1.249929e-01" 97 | "m256","vperm2f128","latency","3.002846e+00","3.330174e-01" 98 | "m256","vperm2f128","throughput","3.000183e+00","3.333130e-01" 99 | "m256","pxor","latency","5.000390e-01","1.999844e+00" 100 | "m256","pxor","throughput","5.000379e-01","1.999848e+00" 101 | "m256","paddd","latency","1.001350e+00","9.986514e-01" 102 | "m256","paddd","throughput","6.670384e-01","1.499164e+00" 103 | "m256","vpermps","latency","5.000856e+00","1.999658e-01" 104 | "m256","vpermps","throughput","4.000613e+00","2.499617e-01" 105 | "m256","vpermpd","latency","2.001621e+00","4.995951e-01" 106 | "m256","vpermpd","throughput","2.000335e+00","4.999162e-01" 107 | "m256","vpmovsxwd","latency","2.000231e+00","4.999422e-01" 108 | "m256","vpmovsxwd","throughput","2.006091e+00","4.984819e-01" 109 | "m256","vpgatherdd","latency","2.098003e+01","4.766438e-02" 110 | "m256","vpgatherdd","throughput","2.000948e+01","4.997631e-02" 111 | "m256","gather32(x8 + perm)","latency","1.735889e+01","5.760737e-02" 112 | "m256","gather32(x8 + perm)","throughput","5.028299e+00","1.988744e-01" 113 | "m256","vgatherdpd","latency","1.580182e+01","6.328386e-02" 114 | "m256","vgatherdpd","throughput","1.200506e+01","8.329824e-02" 115 | "m256","gather64(x4 + perm)","latency","1.291982e+01","7.740046e-02" 116 | "m256","gather64(x4 + perm)","throughput","3.030365e+00","3.299933e-01" 117 | "m256","vpshufb","latency","1.001292e+00","9.987095e-01" 118 | "m256","vpshufb","throughput","1.000028e+00","9.999716e-01" 119 | "m256","vfmaps","latency","5.005436e+00","1.997828e-01" 120 | "m256","vfmaps","throughput","1.000034e+00","9.999659e-01" 121 | "m256","vfmapd","latency","5.005187e+00","1.997927e-01" 122 | "m256","vfmapd","throughput","1.000028e+00","9.999716e-01" 123 | "m128","vfmaps","latency","5.000966e+00","1.999614e-01" 124 | "m128","vfmaps","throughput","5.000282e-01","1.999887e+00" 125 | 
"m128","vfmapd","latency","5.001020e+00","1.999592e-01" 126 | "m128","vfmapd","throughput","5.000305e-01","1.999878e+00" 127 | -------------------------------------------------------------------------------- /logs/linux/AMDRyzen71700XEight-CoreProcessor.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.049214e+00","9.530948e-01" 3 | "reg64","add","throughput","2.578324e-01","3.878488e+00" 4 | "reg64","lea","latency","1.001316e+00","9.986853e-01" 5 | "reg64","lea","throughput","2.578300e-01","3.878524e+00" 6 | "reg64","xor dst,dst","latency","2.578338e-01","3.878467e+00" 7 | "reg64","xor dst,dst","throughput","2.580744e-01","3.874852e+00" 8 | "reg64","xor","latency","2.578309e-01","3.878512e+00" 9 | "reg64","xor","throughput","2.709609e-01","3.690569e+00" 10 | "reg64","load","latency","4.000268e+00","2.499833e-01" 11 | "reg64","load","throughput","6.252173e-01","1.599444e+00" 12 | "reg64","crc32","latency","3.015817e+00","3.315851e-01" 13 | "reg64","crc32","throughput","3.015818e+00","3.315850e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","2.617534e+01","3.820390e-02" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","2.745052e+00","3.642918e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","3.132426e+01","3.192413e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.700187e+01","5.881707e-02" 18 | "m128","pxor","latency","2.500386e-01","3.999383e+00" 19 | "m128","pxor","throughput","2.500377e-01","3.999397e+00" 20 | "m128","padd","latency","1.000258e+00","9.997419e-01" 21 | "m128","padd","throughput","3.333666e-01","2.999701e+00" 22 | "m128","pmuldq","latency","3.002322e+00","3.330755e-01" 23 | "m128","pmuldq","throughput","1.001453e+00","9.985493e-01" 24 | "m128","loadps","throughput","5.000367e-01","1.999853e+00" 25 | "m128","loadps->movq","latency","9.000456e+00","1.111055e-01" 26 | 
"m128","movq->movq","latency","6.027003e+00","1.659200e-01" 27 | "m128","movq->movq","throughput","1.000237e+00","9.997635e-01" 28 | "m128","xorps","latency","2.500341e-01","3.999454e+00" 29 | "m128","xorps","throughput","2.500405e-01","3.999352e+00" 30 | "m128","addps","latency","3.000170e+00","3.333145e-01" 31 | "m128","addps","throughput","5.007428e-01","1.997033e+00" 32 | "m128","mulps","latency","3.008931e+00","3.323440e-01" 33 | "m128","mulps","throughput","5.001918e-01","1.999233e+00" 34 | "m128","divps","latency","1.000089e+01","9.999114e-02" 35 | "m128","divps","throughput","3.000165e+00","3.333150e-01" 36 | "m128","divpd","latency","8.001370e+00","1.249786e-01" 37 | "m128","divpd","throughput","4.000181e+00","2.499887e-01" 38 | "m128","rsqrtps","latency","5.000315e+00","1.999874e-01" 39 | "m128","rsqrtps","throughput","1.000021e+00","9.999788e-01" 40 | "m128","rcpps","latency","5.006612e+00","1.997359e-01" 41 | "m128","rcpps","throughput","1.009147e+00","9.909363e-01" 42 | "m128","blendps","latency","1.000189e+00","9.998112e-01" 43 | "m128","blendps","throughput","5.000265e-01","1.999894e+00" 44 | "m128","blendvps","latency","1.000018e+00","9.999824e-01" 45 | "m128","blendvps","throughput","5.000392e-01","1.999843e+00" 46 | "m128","pshufb","latency","1.000024e+00","9.999763e-01" 47 | "m128","pshufb","throughput","5.003348e-01","1.998662e+00" 48 | "m128","shufps","latency","1.000411e+00","9.995895e-01" 49 | "m128","shufps","throughput","5.000178e-01","1.999929e+00" 50 | "m128","pmullw","latency","3.001651e+00","3.331500e-01" 51 | "m128","pmullw","throughput","1.000020e+00","9.999797e-01" 52 | "m128","phaddd","latency","2.000193e+00","4.999516e-01" 53 | "m128","phaddd","throughput","2.000187e+00","4.999532e-01" 54 | "m128","haddps","latency","2.000044e+00","4.999890e-01" 55 | "m128","haddps","throughput","2.003110e+00","4.992238e-01" 56 | "m128","pinsrd","latency","1.703294e+00","5.870978e-01" 57 | "m128","pinsrd","throughput","1.310640e+00","7.629862e-01" 
58 | "m128","pinsrd->pextr","latency","8.000448e+00","1.249930e-01" 59 | "m128","dpps","latency","1.500205e+01","6.665755e-02" 60 | "m128","dpps","throughput","4.000327e+00","2.499796e-01" 61 | "m128","cvtps2dq","latency","4.000165e+00","2.499897e-01" 62 | "m128","cvtps2dq","throughput","1.000015e+00","9.999854e-01" 63 | "m128","pmovmskb","throughput","1.000020e+00","9.999803e-01" 64 | "m128","pmovmskb->movq","latency","6.000334e+00","1.666574e-01" 65 | "m128","movq->movq","latency","6.013564e+00","1.662907e-01" 66 | "m128","movaps [mem]","latency","9.000813e+00","1.111011e-01" 67 | "m128","movaps [mem]","throughput","5.004332e-01","1.998269e+00" 68 | "m128","movdqu [mem+1]","latency","1.000060e+01","9.999396e-02" 69 | "m128","movdqu [mem+1]","throughput","5.000309e-01","1.999876e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.100710e+01","9.085045e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000028e+00","9.999716e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.102931e+01","9.066751e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","1.000233e+00","9.997674e-01" 74 | "m128","pcmpistri","throughput","2.000187e+00","4.999532e-01" 75 | "m128","pcmpistri->movq","latency","1.100452e+01","9.087179e-02" 76 | "m128","pcmpistrm","throughput","2.000044e+00","4.999890e-01" 77 | "m128","pcmpistrm","latency","7.000454e+00","1.428479e-01" 78 | "m128","pcmpestri","throughput","3.000192e+00","3.333120e-01" 79 | "m128","pcmpestri->movq","latency","1.100231e+01","9.088999e-02" 80 | "m128","pcmpestrm","throughput","3.013018e+00","3.318931e-01" 81 | "m128","pcmpestrm","latency","8.500591e+00","1.176389e-01" 82 | "m256","movaps [mem]","latency","1.000338e+00","9.996619e-01" 83 | "m256","movaps [mem]","throughput","1.007306e+00","9.927468e-01" 84 | "m256","vmovdqu [mem+1]","latency","1.500034e+00","6.666514e-01" 85 | "m256","vmovdqu [mem+1]","throughput","1.500188e+00","6.665830e-01" 86 | "m256","vmovdqu [mem+63] (cross 
cache)","latency","1.500285e+00","6.665399e-01" 87 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.500184e+00","6.665849e-01" 88 | "m256","vmovdqu [mem+2MB-1] (cross page)","latency","1.500412e+00","6.664836e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","1.500037e+00","6.666503e-01" 90 | "m256","vxorps","latency","5.000343e-01","1.999863e+00" 91 | "m256","vxorps","throughput","5.002458e-01","1.999017e+00" 92 | "m256","vmulps","latency","3.001214e+00","3.331985e-01" 93 | "m256","vmulps","throughput","1.013203e+00","9.869692e-01" 94 | "m256","vaddps","latency","3.000244e+00","3.333062e-01" 95 | "m256","vaddps","throughput","1.000027e+00","9.999727e-01" 96 | "m256","vdivps","latency","1.000059e+01","9.999407e-02" 97 | "m256","vdivps","throughput","6.000537e+00","1.666518e-01" 98 | "m256","vdivpd","latency","8.000481e+00","1.249925e-01" 99 | "m256","vdivpd","throughput","8.005878e+00","1.249082e-01" 100 | "m256","vrsqrtps","latency","5.000400e+00","1.999840e-01" 101 | "m256","vrsqrtps","throughput","2.000166e+00","4.999586e-01" 102 | "m256","vrcpps","latency","5.000306e+00","1.999878e-01" 103 | "m256","vrcpps","throughput","2.000163e+00","4.999593e-01" 104 | "m256","vsqrtps","latency","8.001563e+00","1.249756e-01" 105 | "m256","vsqrtps","throughput","8.000445e+00","1.249930e-01" 106 | "m256","vperm2f128","latency","3.000194e+00","3.333117e-01" 107 | "m256","vperm2f128","throughput","3.209610e+00","3.115644e-01" 108 | "m256","vpxor","latency","5.000352e-01","1.999859e+00" 109 | "m256","vpxor","throughput","5.000345e-01","1.999862e+00" 110 | "m256","vpaddd","latency","1.000030e+00","9.999699e-01" 111 | "m256","vpaddd","throughput","6.666995e-01","1.499926e+00" 112 | "m256","vpermps","latency","5.000301e+00","1.999880e-01" 113 | "m256","vpermps","throughput","4.000216e+00","2.499865e-01" 114 | "m256","vpermpd","latency","2.006287e+00","4.984331e-01" 115 | "m256","vpermpd","throughput","2.000239e+00","4.999401e-01" 116 | 
"m256","vpblendvb","latency","2.000933e+00","4.997669e-01" 117 | "m256","vpblendvb","throughput","2.000168e+00","4.999580e-01" 118 | "m256","vpmovmskb","throughput","1.000380e+00","9.996199e-01" 119 | "m256","vpmovsxwd","latency","2.000175e+00","4.999562e-01" 120 | "m256","vpmovsxwd","throughput","2.000040e+00","4.999899e-01" 121 | "m256","vpgatherdd","latency","2.130687e+01","4.693321e-02" 122 | "m256","vpgatherdd","throughput","2.000122e+01","4.999696e-02" 123 | "m256","gather32(x8 + perm)","latency","1.739013e+01","5.750389e-02" 124 | "m256","gather32(x8 + perm)","throughput","5.028104e+00","1.988821e-01" 125 | "m256","vgatherdpd","latency","1.814049e+01","5.512532e-02" 126 | "m256","vgatherdpd","throughput","1.200097e+01","8.332660e-02" 127 | "m256","gather64(x4 + perm)","latency","1.301593e+01","7.682891e-02" 128 | "m256","gather64(x4 + perm)","throughput","3.028116e+00","3.302383e-01" 129 | "m256","vpshufb","latency","1.000281e+00","9.997191e-01" 130 | "m256","vpshufb","throughput","1.000412e+00","9.995880e-01" 131 | "m256","vfmaps","latency","5.002667e+00","1.998934e-01" 132 | "m256","vfmaps","throughput","1.033206e+00","9.678609e-01" 133 | "m256","vfmapd","latency","5.000989e+00","1.999605e-01" 134 | "m256","vfmapd","throughput","1.000024e+00","9.999761e-01" 135 | "m128","vfmaps","latency","5.000469e+00","1.999813e-01" 136 | "m128","vfmaps","throughput","5.000328e-01","1.999869e+00" 137 | "m128","vfmapd","latency","5.000348e+00","1.999861e-01" 138 | "m128","vfmapd","throughput","5.000254e-01","1.999898e+00" 139 | "reg64","popcnt","latency","1.000103e+00","9.998974e-01" 140 | "reg64","popcnt","throughput","2.578312e-01","3.878506e+00" 141 | "m128","aesenc","latency","4.000334e+00","2.499791e-01" 142 | "m128","aesenc","throughput","5.000301e-01","1.999880e+00" 143 | "m128","aesenclast","latency","4.000321e+00","2.499799e-01" 144 | "m128","aesenclast","throughput","5.000280e-01","1.999888e+00" 145 | "m128","aesdec","latency","4.000404e+00","2.499748e-01" 146 | 
"m128","aesdec","throughput","5.116995e-01","1.954272e+00" 147 | "m128","aesdeclast","latency","4.000398e+00","2.499751e-01" 148 | "m128","aesdeclast","throughput","5.000301e-01","1.999880e+00" 149 | "m128","pclmulqdq","latency","4.500374e+00","2.222037e-01" 150 | "m128","pclmulqdq","throughput","2.000182e+00","4.999545e-01" 151 | -------------------------------------------------------------------------------- /logs/linux/AMDRyzen73700X8-CoreProcessor.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.001567e+00","9.984354e-01" 3 | "reg64","add","throughput","2.590998e-01","3.859516e+00" 4 | "reg64","lea","latency","1.000290e+00","9.997099e-01" 5 | "reg64","lea","throughput","2.617524e-01","3.820405e+00" 6 | "reg64","xor dst,dst","latency","2.578328e-01","3.878483e+00" 7 | "reg64","xor dst,dst","throughput","2.578261e-01","3.878583e+00" 8 | "reg64","xor","latency","2.587028e-01","3.865440e+00" 9 | "reg64","xor","throughput","2.589144e-01","3.862281e+00" 10 | "reg64","load","latency","4.002343e+00","2.498537e-01" 11 | "reg64","load","throughput","6.254854e-01","1.598758e+00" 12 | "reg64","crc32","latency","3.000190e+00","3.333123e-01" 13 | "reg64","crc32","throughput","1.003069e+00","9.969403e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","3.755702e+01","2.662618e-02" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","3.911105e+00","2.556822e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","3.738368e+01","2.674964e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.400283e+01","7.141415e-02" 18 | "m128","pxor","latency","2.500379e-01","3.999393e+00" 19 | "m128","pxor","throughput","2.500299e-01","3.999522e+00" 20 | "m128","padd","latency","1.000014e+00","9.999864e-01" 21 | "m128","padd","throughput","3.333530e-01","2.999823e+00" 22 | "m128","pmuldq","latency","3.000024e+00","3.333307e-01" 23 | 
"m128","pmuldq","throughput","1.000006e+00","9.999936e-01" 24 | "m128","loadps","throughput","5.000371e-01","1.999852e+00" 25 | "m128","loadps->movq","latency","9.000187e+00","1.111088e-01" 26 | "m128","movq->movq","latency","6.000163e+00","1.666621e-01" 27 | "m128","movq->movq","throughput","1.000023e+00","9.999769e-01" 28 | "m128","xorps","latency","2.500390e-01","3.999376e+00" 29 | "m128","xorps","throughput","2.500286e-01","3.999542e+00" 30 | "m128","addps","latency","3.000188e+00","3.333124e-01" 31 | "m128","addps","throughput","5.000235e-01","1.999906e+00" 32 | "m128","mulps","latency","3.000178e+00","3.333135e-01" 33 | "m128","mulps","throughput","5.000189e-01","1.999925e+00" 34 | "m128","divps","latency","1.000158e+01","9.998419e-02" 35 | "m128","divps","throughput","3.500239e+00","2.856948e-01" 36 | "m128","divpd","latency","1.300150e+01","7.691419e-02" 37 | "m128","divpd","throughput","5.000184e+00","1.999927e-01" 38 | "m128","rsqrtps","latency","5.000175e+00","1.999930e-01" 39 | "m128","rsqrtps","throughput","1.000683e+00","9.993170e-01" 40 | "m128","rcpps","latency","5.000022e+00","1.999991e-01" 41 | "m128","rcpps","throughput","1.000007e+00","9.999928e-01" 42 | "m128","blendps","latency","1.001020e+00","9.989808e-01" 43 | "m128","blendps","throughput","3.333609e-01","2.999752e+00" 44 | "m128","blendvps","latency","1.000014e+00","9.999860e-01" 45 | "m128","blendvps","throughput","5.000401e-01","1.999840e+00" 46 | "m128","pshufb","latency","1.000015e+00","9.999847e-01" 47 | "m128","pshufb","throughput","5.000212e-01","1.999915e+00" 48 | "m128","shufps","latency","1.000024e+00","9.999756e-01" 49 | "m128","shufps","throughput","5.000242e-01","1.999903e+00" 50 | "m128","pmullw","latency","3.000105e+00","3.333217e-01" 51 | "m128","pmullw","throughput","1.000027e+00","9.999729e-01" 52 | "m128","phaddd","latency","2.000134e+00","4.999664e-01" 53 | "m128","phaddd","throughput","2.000201e+00","4.999497e-01" 54 | 
"m128","haddps","latency","2.000036e+00","4.999910e-01" 55 | "m128","haddps","throughput","2.000439e+00","4.998904e-01" 56 | "m128","pinsrd","latency","1.625045e+00","6.153677e-01" 57 | "m128","pinsrd","throughput","1.280720e+00","7.808108e-01" 58 | "m128","pinsrd->pextr","latency","8.000406e+00","1.249937e-01" 59 | "m128","dpps","latency","1.500143e+01","6.666030e-02" 60 | "m128","dpps","throughput","4.000031e+00","2.499981e-01" 61 | "m128","cvtps2dq","latency","3.000020e+00","3.333311e-01" 62 | "m128","cvtps2dq","throughput","1.000015e+00","9.999852e-01" 63 | "m128","pmovmskb","throughput","1.000021e+00","9.999790e-01" 64 | "m128","pmovmskb->movq","latency","6.000898e+00","1.666417e-01" 65 | "m128","movq->movq","latency","6.000184e+00","1.666616e-01" 66 | "m128","movaps [mem]","latency","9.000192e+00","1.111087e-01" 67 | "m128","movaps [mem]","throughput","5.000307e-01","1.999877e+00" 68 | "m128","movdqu [mem+1]","latency","1.000170e+01","9.998296e-02" 69 | "m128","movdqu [mem+1]","throughput","5.000390e-01","1.999844e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.100016e+01","9.090778e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000030e+00","9.999701e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.100053e+01","9.090470e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","1.000031e+00","9.999686e-01" 74 | "m128","pcmpistri","throughput","2.000018e+00","4.999955e-01" 75 | "m128","pcmpistri->movq","latency","1.100017e+01","9.090770e-02" 76 | "m128","pcmpistrm","throughput","2.000042e+00","4.999895e-01" 77 | "m128","pcmpistrm","latency","7.000353e+00","1.428499e-01" 78 | "m128","pcmpestri","throughput","3.000219e+00","3.333090e-01" 79 | "m128","pcmpestri->movq","latency","1.100015e+01","9.090782e-02" 80 | "m128","pcmpestrm","throughput","3.000220e+00","3.333089e-01" 81 | "m128","pcmpestrm","latency","7.333523e+00","1.363601e-01" 82 | "m256","movaps [mem]","throughput","5.000381e-01","1.999847e+00" 83 | 
"m256","movaps [mem] -> movq","latency","9.000158e+00","1.111092e-01" 84 | "m256","vmovdqu [mem+1]","throughput","5.002961e-01","1.998816e+00" 85 | "m256","vmovdqu [mem+1] -> movq","latency","1.000015e+01","9.999853e-02" 86 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.000033e+00","9.999672e-01" 87 | "m256","vmovdqu [mem+63] (cross cache) -> movq","latency","1.100016e+01","9.090779e-02" 88 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","1.000035e+00","9.999655e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page) -> movq","latency","1.100018e+01","9.090758e-02" 90 | "m256","vxorps","latency","2.500320e-01","3.999488e+00" 91 | "m256","vxorps","throughput","2.500441e-01","3.999295e+00" 92 | "m256","vmulps","latency","3.000024e+00","3.333306e-01" 93 | "m256","vmulps","throughput","5.000129e-01","1.999948e+00" 94 | "m256","vaddps","latency","3.000014e+00","3.333318e-01" 95 | "m256","vaddps","throughput","5.000131e-01","1.999947e+00" 96 | "m256","vdivps","latency","1.000016e+01","9.999839e-02" 97 | "m256","vdivps","throughput","3.500014e+00","2.857131e-01" 98 | "m256","vdivpd","latency","1.300015e+01","7.692218e-02" 99 | "m256","vdivpd","throughput","5.000017e+00","1.999993e-01" 100 | "m256","vrsqrtps","latency","5.000014e+00","1.999994e-01" 101 | "m256","vrsqrtps","throughput","1.000007e+00","9.999930e-01" 102 | "m256","vrcpps","latency","5.000158e+00","1.999937e-01" 103 | "m256","vrcpps","throughput","1.000007e+00","9.999930e-01" 104 | "m256","vsqrtps","latency","1.400030e+01","7.142704e-02" 105 | "m256","vsqrtps","throughput","5.500180e+00","1.818122e-01" 106 | "m256","vperm2f128","latency","3.000207e+00","3.333103e-01" 107 | "m256","vperm2f128","throughput","1.000007e+00","9.999926e-01" 108 | "m256","vpxor","latency","2.500326e-01","3.999478e+00" 109 | "m256","vpxor","throughput","2.500286e-01","3.999542e+00" 110 | "m256","vpaddd","latency","1.000013e+00","9.999873e-01" 111 | "m256","vpaddd","throughput","3.333528e-01","2.999825e+00" 112 | 
"m256","vpermps","latency","8.000145e+00","1.249977e-01" 113 | "m256","vpermps","throughput","2.000246e+00","4.999385e-01" 114 | "m256","vpermpd","latency","6.000173e+00","1.666619e-01" 115 | "m256","vpermpd","throughput","1.309286e+00","7.637749e-01" 116 | "m256","vpblendvb","latency","1.000189e+00","9.998112e-01" 117 | "m256","vpblendvb","throughput","1.000015e+00","9.999850e-01" 118 | "m256","vpmovmskb","throughput","1.000015e+00","9.999852e-01" 119 | "m256","vpmovsxwd","latency","4.000177e+00","2.499890e-01" 120 | "m256","vpmovsxwd","throughput","1.150022e+00","8.695484e-01" 121 | "m256","vpgatherdd","latency","2.300447e+01","4.346981e-02" 122 | "m256","vpgatherdd","throughput","1.600529e+01","6.247934e-02" 123 | "m256","gather32(x8 + perm)","latency","1.875247e+01","5.332630e-02" 124 | "m256","gather32(x8 + perm)","throughput","4.006449e+00","2.495976e-01" 125 | "m256","vgatherdpd","latency","1.900315e+01","5.262285e-02" 126 | "m256","vgatherdpd","throughput","9.020558e+00","1.108579e-01" 127 | "m256","gather64(x4 + perm)","latency","1.482739e+01","6.744274e-02" 128 | "m256","gather64(x4 + perm)","throughput","2.003266e+00","4.991849e-01" 129 | "m256","vpshufb","latency","1.000014e+00","9.999858e-01" 130 | "m256","vpshufb","throughput","5.000155e-01","1.999938e+00" 131 | "m256","vfmaps","latency","5.000030e+00","1.999988e-01" 132 | "m256","vfmaps","throughput","5.000267e-01","1.999893e+00" 133 | "m256","vfmapd","latency","5.001647e+00","1.999341e-01" 134 | "m256","vfmapd","throughput","5.000184e-01","1.999926e+00" 135 | "m128","vfmaps","latency","5.001112e+00","1.999555e-01" 136 | "m128","vfmaps","throughput","5.000201e-01","1.999919e+00" 137 | "m128","vfmapd","latency","5.000026e+00","1.999990e-01" 138 | "m128","vfmapd","throughput","5.000102e-01","1.999959e+00" 139 | "reg64","popcnt","latency","1.000025e+00","9.999750e-01" 140 | "reg64","popcnt","throughput","2.588030e-01","3.863943e+00" 141 | "m128","aesenc","latency","4.000739e+00","2.499538e-01" 142 | 
"m128","aesenc","throughput","5.000197e-01","1.999921e+00" 143 | "m128","aesenclast","latency","4.001445e+00","2.499097e-01" 144 | "m128","aesenclast","throughput","5.000276e-01","1.999890e+00" 145 | "m128","aesdec","latency","4.000883e+00","2.499448e-01" 146 | "m128","aesdec","throughput","5.000210e-01","1.999916e+00" 147 | "m128","aesdeclast","latency","4.000383e+00","2.499761e-01" 148 | "m128","aesdeclast","throughput","5.000146e-01","1.999942e+00" 149 | "m128","pclmulqdq","latency","4.578195e+00","2.184267e-01" 150 | "m128","pclmulqdq","throughput","2.003239e+00","4.991915e-01" 151 | -------------------------------------------------------------------------------- /logs/linux/Intel(R)Celeron(R)CPUG3900@2.80GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000119e+00","9.998809e-01" 3 | "reg64","add","throughput","2.539297e-01","3.938097e+00" 4 | "reg64","lea","latency","1.000082e+00","9.999178e-01" 5 | "reg64","lea","throughput","5.001687e-01","1.999326e+00" 6 | "reg64","xor dst,dst","latency","2.539347e-01","3.938020e+00" 7 | "reg64","xor dst,dst","throughput","2.539333e-01","3.938042e+00" 8 | "reg64","xor","latency","2.539316e-01","3.938068e+00" 9 | "reg64","xor","throughput","2.539319e-01","3.938064e+00" 10 | "reg64","load","latency","5.000207e+00","1.999917e-01" 11 | "reg64","load","throughput","6.251367e-01","1.599650e+00" 12 | "reg64","crc32","latency","3.000139e+00","3.333179e-01" 13 | "reg64","crc32","throughput","1.000078e+00","9.999218e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","7.748479e+00","1.290576e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.349518e+00","7.410055e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","1.900092e+01","5.262902e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.300067e+01","7.691911e-02" 18 | "m128","pxor","latency","2.569947e-01","3.891131e+00" 19 | 
"m128","pxor","throughput","2.572746e-01","3.886897e+00" 20 | "m128","padd","latency","1.000324e+00","9.996765e-01" 21 | "m128","padd","throughput","3.333658e-01","2.999708e+00" 22 | "m128","pmuldq","latency","5.000254e+00","1.999898e-01" 23 | "m128","pmuldq","throughput","5.000362e-01","1.999855e+00" 24 | "m128","loadps","throughput","5.000432e-01","1.999827e+00" 25 | "m128","loadps->movq","latency","8.000384e+00","1.249940e-01" 26 | "m128","movq->movq","latency","4.000250e+00","2.499844e-01" 27 | "m128","movq->movq","throughput","1.000299e+00","9.997015e-01" 28 | "m128","xorps","latency","2.569864e-01","3.891256e+00" 29 | "m128","xorps","throughput","2.569856e-01","3.891269e+00" 30 | "m128","addps","latency","4.000245e+00","2.499847e-01" 31 | "m128","addps","throughput","5.000288e-01","1.999885e+00" 32 | "m128","mulps","latency","4.000247e+00","2.499846e-01" 33 | "m128","mulps","throughput","5.000305e-01","1.999878e+00" 34 | "m128","divps","latency","1.100059e+01","9.090425e-02" 35 | "m128","divps","throughput","3.000259e+00","3.333046e-01" 36 | "m128","divpd","latency","1.300070e+01","7.691891e-02" 37 | "m128","divpd","throughput","4.000137e+00","2.499914e-01" 38 | "m128","rsqrtps","latency","4.000247e+00","2.499845e-01" 39 | "m128","rsqrtps","throughput","1.000293e+00","9.997074e-01" 40 | "m128","rcpps","latency","4.000247e+00","2.499845e-01" 41 | "m128","rcpps","throughput","1.000030e+00","9.999697e-01" 42 | "m128","blendps","latency","1.000023e+00","9.999773e-01" 43 | "m128","blendps","throughput","3.336739e-01","2.996938e+00" 44 | "m128","blendvps","latency","1.000025e+00","9.999750e-01" 45 | "m128","blendvps","throughput","5.069909e-01","1.972422e+00" 46 | "m128","pshufb","latency","1.000029e+00","9.999710e-01" 47 | "m128","pshufb","throughput","1.000027e+00","9.999733e-01" 48 | "m128","shufps","latency","1.000028e+00","9.999725e-01" 49 | "m128","shufps","throughput","1.000291e+00","9.997093e-01" 50 | "m128","pmullw","latency","5.000778e+00","1.999689e-01" 
51 | "m128","pmullw","throughput","5.000284e-01","1.999886e+00" 52 | "m128","phaddd","latency","3.000152e+00","3.333164e-01" 53 | "m128","phaddd","throughput","2.000288e+00","4.999281e-01" 54 | "m128","haddps","latency","3.000150e+00","3.333167e-01" 55 | "m128","haddps","throughput","2.000157e+00","4.999608e-01" 56 | "m128","pinsrd","latency","2.000140e+00","4.999650e-01" 57 | "m128","pinsrd","throughput","2.000025e+00","4.999937e-01" 58 | "m128","pinsrd->pextr","latency","6.000374e+00","1.666563e-01" 59 | "m128","dpps","latency","1.300070e+01","7.691892e-02" 60 | "m128","dpps","throughput","1.598587e+00","6.255524e-01" 61 | "m128","cvtps2dq","latency","4.000249e+00","2.499844e-01" 62 | "m128","cvtps2dq","throughput","5.000248e-01","1.999901e+00" 63 | "m128","pmovmskb","throughput","1.000027e+00","9.999733e-01" 64 | "m128","pmovmskb->movq","latency","4.000257e+00","2.499840e-01" 65 | "m128","movq->movq","latency","4.000245e+00","2.499847e-01" 66 | "m128","movaps [mem]","latency","8.000373e+00","1.249942e-01" 67 | "m128","movaps [mem]","throughput","5.003295e-01","1.998683e+00" 68 | "m128","movdqu [mem+1]","latency","8.000449e+00","1.249930e-01" 69 | "m128","movdqu [mem+1]","throughput","5.000761e-01","1.999696e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400072e+01","7.142491e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000327e+00","9.996735e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.500087e+01","6.666280e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","3.771873e+00","2.651203e-01" 74 | "m128","pcmpistri","throughput","3.000340e+00","3.332955e-01" 75 | "m128","pcmpistri->movq","latency","1.200059e+01","8.332925e-02" 76 | "m128","pcmpistrm","throughput","3.000256e+00","3.333049e-01" 77 | "m128","pcmpistrm","latency","8.848392e+00","1.130149e-01" 78 | "m128","pcmpestri","throughput","4.028063e+00","2.482583e-01" 79 | "m128","pcmpestri->movq","latency","1.200059e+01","8.332921e-02" 80 | 
"m128","pcmpestrm","throughput","5.028056e+00","1.988840e-01" 81 | "m128","pcmpestrm","latency","9.000479e+00","1.111052e-01" 82 | "reg64","popcnt","latency","3.000202e+00","3.333109e-01" 83 | "reg64","popcnt","throughput","1.000165e+00","9.998355e-01" 84 | "m128","aesenc","latency","4.000248e+00","2.499845e-01" 85 | "m128","aesenc","throughput","1.000424e+00","9.995761e-01" 86 | "m128","aesenclast","latency","4.000134e+00","2.499916e-01" 87 | "m128","aesenclast","throughput","1.000027e+00","9.999733e-01" 88 | "m128","aesdec","latency","4.000247e+00","2.499845e-01" 89 | "m128","aesdec","throughput","1.000297e+00","9.997028e-01" 90 | "m128","aesdeclast","latency","4.000250e+00","2.499844e-01" 91 | "m128","aesdeclast","throughput","1.000025e+00","9.999752e-01" 92 | "m128","pclmulqdq","latency","7.000359e+00","1.428498e-01" 93 | "m128","pclmulqdq","throughput","1.000024e+00","9.999765e-01" 94 | -------------------------------------------------------------------------------- /logs/linux/Intel(R)Celeron(R)CPUN2807@1.58GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.028321e+00","9.724590e-01" 3 | "reg64","add","throughput","5.315419e-01","1.881319e+00" 4 | "reg64","lea","latency","1.005761e+00","9.942716e-01" 5 | "reg64","lea","throughput","5.313487e-01","1.882003e+00" 6 | "reg64","xor dst,dst","latency","1.028310e+00","9.724694e-01" 7 | "reg64","xor dst,dst","throughput","5.316453e-01","1.880953e+00" 8 | "reg64","xor","latency","1.028483e+00","9.723056e-01" 9 | "reg64","xor","throughput","5.315483e-01","1.881297e+00" 10 | "reg64","load","latency","3.000618e+00","3.332647e-01" 11 | "reg64","load","throughput","1.000683e+00","9.993173e-01" 12 | "reg64","crc32","latency","5.984970e+00","1.670852e-01" 13 | "reg64","crc32","throughput","5.985072e+00","1.670824e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","4.000382e+00","2.499761e-01" 15 | "reg64","store 
[mem+0]->load[mem+0]","throughput","2.000764e+00","4.998092e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","1.400162e+01","7.142031e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.160263e+01","8.618734e-02" 18 | "m128","pxor","latency","5.557671e-01","1.799315e+00" 19 | "m128","pxor","throughput","5.561805e-01","1.797977e+00" 20 | "m128","padd","latency","1.000501e+00","9.994988e-01" 21 | "m128","padd","throughput","5.557198e-01","1.799468e+00" 22 | "m128","pmuldq","latency","5.973042e+00","1.674189e-01" 23 | "m128","pmuldq","throughput","4.139433e+00","2.415790e-01" 24 | "m128","loadps","throughput","1.000492e+00","9.995079e-01" 25 | "m128","loadps->movq","latency","8.000939e+00","1.249853e-01" 26 | "m128","movq->movq","latency","7.000837e+00","1.428401e-01" 27 | "m128","movq->movq","throughput","7.000857e+00","1.428396e-01" 28 | "m128","xorps","latency","5.562657e-01","1.797702e+00" 29 | "m128","xorps","throughput","5.557408e-01","1.799400e+00" 30 | "m128","addps","latency","3.000614e+00","3.332651e-01" 31 | "m128","addps","throughput","1.000480e+00","9.995198e-01" 32 | "m128","mulps","latency","5.000687e+00","1.999725e-01" 33 | "m128","mulps","throughput","2.000454e+00","4.998866e-01" 34 | "m128","divps","latency","2.700320e+01","3.703264e-02" 35 | "m128","divps","throughput","2.700290e+01","3.703306e-02" 36 | "m128","divpd","latency","2.700289e+01","3.703307e-02" 37 | "m128","divpd","throughput","2.700293e+01","3.703302e-02" 38 | "m128","rsqrtps","latency","9.001052e+00","1.110981e-01" 39 | "m128","rsqrtps","throughput","8.000885e+00","1.249862e-01" 40 | "m128","rcpps","latency","9.001118e+00","1.110973e-01" 41 | "m128","rcpps","throughput","8.000845e+00","1.249868e-01" 42 | "m128","blendps","latency","5.972919e+00","1.674223e-01" 43 | "m128","blendps","throughput","4.139342e+00","2.415843e-01" 44 | "m128","blendvps","latency","7.028747e+00","1.422729e-01" 45 | "m128","blendvps","throughput","8.973247e+00","1.114424e-01" 46 | 
"m128","pshufb","latency","8.028865e+00","1.245506e-01" 47 | "m128","pshufb","throughput","7.028687e+00","1.422741e-01" 48 | "m128","shufps","latency","1.000183e+00","9.998169e-01" 49 | "m128","shufps","throughput","1.000683e+00","9.993176e-01" 50 | "m128","pmullw","latency","5.000558e+00","1.999777e-01" 51 | "m128","pmullw","throughput","2.000323e+00","4.999192e-01" 52 | "m128","phaddd","latency","9.028890e+00","1.107556e-01" 53 | "m128","phaddd","throughput","8.028685e+00","1.245534e-01" 54 | "m128","haddps","latency","9.028869e+00","1.107558e-01" 55 | "m128","haddps","throughput","8.028851e+00","1.245508e-01" 56 | "m128","pinsrd","latency","5.972923e+00","1.674222e-01" 57 | "m128","pinsrd","throughput","4.139449e+00","2.415781e-01" 58 | "m128","pinsrd->pextr","latency","1.302917e+01","7.675089e-02" 59 | "m128","dpps","latency","1.500189e+01","6.665825e-02" 60 | "m128","dpps","throughput","1.302940e+01","7.674952e-02" 61 | "m128","cvtps2dq","latency","5.000626e+00","1.999750e-01" 62 | "m128","cvtps2dq","throughput","2.000366e+00","4.999085e-01" 63 | "m128","pmovmskb","throughput","1.333693e+00","7.497975e-01" 64 | "m128","pmovmskb->movq","latency","7.000712e+00","1.428426e-01" 65 | "m128","movq->movq","latency","7.000925e+00","1.428383e-01" 66 | "m128","movaps [mem]","latency","8.000980e+00","1.249847e-01" 67 | "m128","movaps [mem]","throughput","1.000170e+00","9.998296e-01" 68 | "m128","movdqu [mem+1]","latency","8.001162e+00","1.249819e-01" 69 | "m128","movdqu [mem+1]","throughput","1.000476e+00","9.995244e-01" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400203e+01","7.141820e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","3.001225e+00","3.331973e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.700249e+01","5.881493e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","9.529294e+00","1.049396e-01" 74 | "m128","pcmpistri","throughput","1.700175e+01","5.881748e-02" 75 | 
"m128","pcmpistri->movq","latency","1.800192e+01","5.554964e-02" 76 | "m128","pcmpistrm","throughput","1.300149e+01","7.691426e-02" 77 | "m128","pcmpistrm","latency","1.300150e+01","7.691422e-02" 78 | "m128","pcmpestri","throughput","2.100219e+01","4.761409e-02" 79 | "m128","pcmpestri->movq","latency","2.300247e+01","4.347359e-02" 80 | "m128","pcmpestrm","throughput","1.700194e+01","5.881683e-02" 81 | "m128","pcmpestrm","latency","1.700191e+01","5.881691e-02" 82 | "reg64","popcnt","latency","3.033586e+00","3.296429e-01" 83 | "reg64","popcnt","throughput","1.140946e+00","8.764656e-01" 84 | "m128","pclmulqdq","latency","1.202934e+01","8.313010e-02" 85 | "m128","pclmulqdq","throughput","1.102907e+01","9.066948e-02" 86 | -------------------------------------------------------------------------------- /logs/linux/Intel(R)Celeron(R)CPUN3450@1.10GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.007032e+00","9.930174e-01" 3 | "reg64","add","throughput","3.515793e-01","2.844308e+00" 4 | "reg64","lea","latency","1.000146e+00","9.998545e-01" 5 | "reg64","lea","throughput","3.517376e-01","2.843028e+00" 6 | "reg64","xor dst,dst","latency","3.516530e-01","2.843713e+00" 7 | "reg64","xor dst,dst","throughput","3.515844e-01","2.844267e+00" 8 | "reg64","xor","latency","3.517035e-01","2.843304e+00" 9 | "reg64","xor","throughput","3.516030e-01","2.844117e+00" 10 | "reg64","load","latency","3.000242e+00","3.333065e-01" 11 | "reg64","load","throughput","1.000135e+00","9.998654e-01" 12 | "reg64","crc32","latency","3.000229e+00","3.333079e-01" 13 | "reg64","crc32","throughput","1.245751e+00","8.027289e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","5.000406e+00","1.999838e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.094181e+00","9.139255e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","1.302316e+01","7.678626e-02" 17 | "reg64","store 
[mem+0]->load[mem+1]","throughput","9.344440e+00","1.070155e-01" 18 | "m128","pxor","latency","5.000551e-01","1.999780e+00" 19 | "m128","pxor","throughput","5.000447e-01","1.999821e+00" 20 | "m128","padd","latency","1.000364e+00","9.996360e-01" 21 | "m128","padd","throughput","5.000333e-01","1.999867e+00" 22 | "m128","pmuldq","latency","4.000534e+00","2.499667e-01" 23 | "m128","pmuldq","throughput","1.000375e+00","9.996252e-01" 24 | "m128","loadps","throughput","1.000284e+00","9.997163e-01" 25 | "m128","loadps->movq","latency","8.000585e+00","1.249909e-01" 26 | "m128","movq->movq","latency","8.000462e+00","1.249928e-01" 27 | "m128","movq->movq","throughput","2.250298e+00","4.443856e-01" 28 | "m128","xorps","latency","5.000331e-01","1.999868e+00" 29 | "m128","xorps","throughput","5.004205e-01","1.998320e+00" 30 | "m128","addps","latency","3.000260e+00","3.333045e-01" 31 | "m128","addps","throughput","1.000063e+00","9.999368e-01" 32 | "m128","mulps","latency","4.000248e+00","2.499845e-01" 33 | "m128","mulps","throughput","1.000056e+00","9.999436e-01" 34 | "m128","divps","latency","2.000133e+01","4.999667e-02" 35 | "m128","divps","throughput","1.900127e+01","5.262805e-02" 36 | "m128","divpd","latency","2.000137e+01","4.999657e-02" 37 | "m128","divpd","throughput","1.900141e+01","5.262766e-02" 38 | "m128","rsqrtps","latency","9.000630e+00","1.111033e-01" 39 | "m128","rsqrtps","throughput","6.028249e+00","1.658857e-01" 40 | "m128","rcpps","latency","9.000598e+00","1.111037e-01" 41 | "m128","rcpps","throughput","6.028185e+00","1.658874e-01" 42 | "m128","blendps","latency","1.000269e+00","9.997311e-01" 43 | "m128","blendps","throughput","5.000369e-01","1.999853e+00" 44 | "m128","blendvps","latency","4.028202e+00","2.482497e-01" 45 | "m128","blendvps","throughput","5.972618e+00","1.674308e-01" 46 | "m128","pshufb","latency","1.000256e+00","9.997443e-01" 47 | "m128","pshufb","throughput","1.000221e+00","9.997792e-01" 48 | 
"m128","shufps","latency","1.000049e+00","9.999510e-01" 49 | "m128","shufps","throughput","5.003173e-01","1.998732e+00" 50 | "m128","pmullw","latency","4.000310e+00","2.499806e-01" 51 | "m128","pmullw","throughput","1.000185e+00","9.998150e-01" 52 | "m128","phaddd","latency","4.083267e+00","2.449020e-01" 53 | "m128","phaddd","throughput","4.034765e+00","2.478459e-01" 54 | "m128","haddps","latency","4.030989e+00","2.480781e-01" 55 | "m128","haddps","throughput","4.030980e+00","2.480786e-01" 56 | "m128","pinsrd","latency","1.027933e+00","9.728261e-01" 57 | "m128","pinsrd","throughput","1.027827e+00","9.729260e-01" 58 | "m128","pinsrd->pextr","latency","8.000925e+00","1.249855e-01" 59 | "m128","dpps","latency","1.373245e+01","7.282020e-02" 60 | "m128","dpps","throughput","1.002853e+01","9.971550e-02" 61 | "m128","cvtps2dq","latency","4.000254e+00","2.499841e-01" 62 | "m128","cvtps2dq","throughput","1.000417e+00","9.995831e-01" 63 | "m128","pmovmskb","throughput","1.258920e+00","7.943318e-01" 64 | "m128","pmovmskb->movq","latency","8.000628e+00","1.249902e-01" 65 | "m128","movq->movq","latency","8.000462e+00","1.249928e-01" 66 | "m128","movaps [mem]","latency","8.000560e+00","1.249913e-01" 67 | "m128","movaps [mem]","throughput","1.000470e+00","9.995300e-01" 68 | "m128","movdqu [mem+1]","latency","8.000648e+00","1.249899e-01" 69 | "m128","movdqu [mem+1]","throughput","1.000045e+00","9.999551e-01" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400098e+01","7.142359e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","2.000180e+00","4.999551e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.735308e+01","5.762667e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","1.235277e+01","8.095349e-02" 74 | "m128","pcmpistri","throughput","8.000857e+00","1.249866e-01" 75 | "m128","pcmpistri->movq","latency","1.800149e+01","5.555095e-02" 76 | "m128","pcmpistrm","throughput","7.001250e+00","1.428316e-01" 77 | 
"m128","pcmpistrm","latency","1.200225e+01","8.331768e-02" 78 | "m128","pcmpestri","throughput","1.300111e+01","7.691652e-02" 79 | "m128","pcmpestri->movq","latency","2.300176e+01","4.347493e-02" 80 | "m128","pcmpestrm","throughput","1.402905e+01","7.128067e-02" 81 | "m128","pcmpestrm","latency","1.701866e+01","5.875903e-02" 82 | "reg64","popcnt","latency","3.001136e+00","3.332072e-01" 83 | "reg64","popcnt","throughput","1.248057e+00","8.012454e-01" 84 | "m128","aesenc","latency","6.002019e+00","1.666106e-01" 85 | "m128","aesenc","throughput","2.000284e+00","4.999289e-01" 86 | "m128","aesenclast","latency","6.000530e+00","1.666520e-01" 87 | "m128","aesenclast","throughput","2.000319e+00","4.999204e-01" 88 | "m128","aesdec","latency","6.000445e+00","1.666543e-01" 89 | "m128","aesdec","throughput","2.000785e+00","4.998039e-01" 90 | "m128","aesdeclast","latency","6.000695e+00","1.666474e-01" 91 | "m128","aesdeclast","throughput","2.000209e+00","4.999477e-01" 92 | "m128","pclmulqdq","latency","6.757296e+00","1.479882e-01" 93 | "m128","pclmulqdq","throughput","4.028118e+00","2.482549e-01" 94 | -------------------------------------------------------------------------------- /logs/linux/Intel(R)Core(TM)i5-8250UCPU@1.60GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000572e+00","9.994279e-01" 3 | "reg64","add","throughput","2.540511e-01","3.936216e+00" 4 | "reg64","lea","latency","1.000022e+00","9.999783e-01" 5 | "reg64","lea","throughput","5.000192e-01","1.999923e+00" 6 | "reg64","xor dst,dst","latency","2.539327e-01","3.938051e+00" 7 | "reg64","xor dst,dst","throughput","2.539347e-01","3.938020e+00" 8 | "reg64","xor","latency","2.539340e-01","3.938031e+00" 9 | "reg64","xor","throughput","2.539349e-01","3.938018e+00" 10 | "reg64","load","latency","5.000271e+00","1.999892e-01" 11 | "reg64","load","throughput","6.250216e-01","1.599945e+00" 12 | 
"reg64","crc32","latency","3.000280e+00","3.333022e-01" 13 | "reg64","crc32","throughput","1.000180e+00","9.998205e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","7.875058e+00","1.269832e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.349259e+00","7.411477e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","1.900094e+01","5.262896e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.300067e+01","7.691909e-02" 18 | "m128","pxor","latency","2.569987e-01","3.891070e+00" 19 | "m128","pxor","throughput","2.570135e-01","3.890846e+00" 20 | "m128","padd","latency","1.000040e+00","9.999595e-01" 21 | "m128","padd","throughput","3.333641e-01","2.999723e+00" 22 | "m128","pmuldq","latency","5.000251e+00","1.999900e-01" 23 | "m128","pmuldq","throughput","5.002602e-01","1.998960e+00" 24 | "m128","loadps","throughput","5.000441e-01","1.999824e+00" 25 | "m128","loadps->movq","latency","8.000376e+00","1.249941e-01" 26 | "m128","movq->movq","latency","4.000185e+00","2.499884e-01" 27 | "m128","movq->movq","throughput","1.000032e+00","9.999684e-01" 28 | "m128","xorps","latency","2.570404e-01","3.890438e+00" 29 | "m128","xorps","throughput","2.569892e-01","3.891215e+00" 30 | "m128","addps","latency","4.000415e+00","2.499741e-01" 31 | "m128","addps","throughput","5.000839e-01","1.999664e+00" 32 | "m128","mulps","latency","4.000250e+00","2.499844e-01" 33 | "m128","mulps","throughput","5.000286e-01","1.999886e+00" 34 | "m128","divps","latency","1.100049e+01","9.090506e-02" 35 | "m128","divps","throughput","3.000174e+00","3.333140e-01" 36 | "m128","divpd","latency","1.300060e+01","7.691955e-02" 37 | "m128","divpd","throughput","4.000171e+00","2.499893e-01" 38 | "m128","rsqrtps","latency","4.000260e+00","2.499838e-01" 39 | "m128","rsqrtps","throughput","1.000323e+00","9.996771e-01" 40 | "m128","rcpps","latency","4.000148e+00","2.499908e-01" 41 | "m128","rcpps","throughput","1.000031e+00","9.999693e-01" 42 | 
"m128","blendps","latency","1.000025e+00","9.999754e-01" 43 | "m128","blendps","throughput","3.336826e-01","2.996860e+00" 44 | "m128","blendvps","latency","1.000436e+00","9.995645e-01" 45 | "m128","blendvps","throughput","5.070523e-01","1.972183e+00" 46 | "m128","pshufb","latency","1.000030e+00","9.999699e-01" 47 | "m128","pshufb","throughput","1.000031e+00","9.999686e-01" 48 | "m128","shufps","latency","1.000030e+00","9.999701e-01" 49 | "m128","shufps","throughput","1.000377e+00","9.996229e-01" 50 | "m128","pmullw","latency","5.000262e+00","1.999895e-01" 51 | "m128","pmullw","throughput","5.000405e-01","1.999838e+00" 52 | "m128","phaddd","latency","3.000162e+00","3.333153e-01" 53 | "m128","phaddd","throughput","2.000153e+00","4.999619e-01" 54 | "m128","haddps","latency","3.000145e+00","3.333172e-01" 55 | "m128","haddps","throughput","2.000157e+00","4.999607e-01" 56 | "m128","pinsrd","latency","2.000037e+00","4.999908e-01" 57 | "m128","pinsrd","throughput","2.000154e+00","4.999614e-01" 58 | "m128","pinsrd->pextr","latency","6.000298e+00","1.666584e-01" 59 | "m128","dpps","latency","1.300062e+01","7.691939e-02" 60 | "m128","dpps","throughput","1.591809e+00","6.282160e-01" 61 | "m128","cvtps2dq","latency","4.000168e+00","2.499895e-01" 62 | "m128","cvtps2dq","throughput","5.000511e-01","1.999796e+00" 63 | "m128","pmovmskb","throughput","1.000552e+00","9.994478e-01" 64 | "m128","pmovmskb->movq","latency","4.000297e+00","2.499814e-01" 65 | "m128","movq->movq","latency","4.000450e+00","2.499719e-01" 66 | "m128","movaps [mem]","latency","8.000376e+00","1.249941e-01" 67 | "m128","movaps [mem]","throughput","5.000460e-01","1.999816e+00" 68 | "m128","movdqu [mem+1]","latency","8.000503e+00","1.249921e-01" 69 | "m128","movdqu [mem+1]","throughput","5.001424e-01","1.999431e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400071e+01","7.142494e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000094e+00","9.999061e-01" 72 | "m128","movdqu [mem+2MB-1] 
(cross page)","latency","1.500085e+01","6.666290e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","3.790094e+00","2.638457e-01" 74 | "m128","pcmpistri","throughput","3.000348e+00","3.332947e-01" 75 | "m128","pcmpistri->movq","latency","1.200059e+01","8.332922e-02" 76 | "m128","pcmpistrm","throughput","3.000206e+00","3.333104e-01" 77 | "m128","pcmpistrm","latency","8.843095e+00","1.130826e-01" 78 | "m128","pcmpestri","throughput","4.028120e+00","2.482547e-01" 79 | "m128","pcmpestri->movq","latency","1.222113e+01","8.182546e-02" 80 | "m128","pcmpestrm","throughput","5.028100e+00","1.988823e-01" 81 | "m128","pcmpestrm","latency","9.000483e+00","1.111051e-01" 82 | "m256","movaps [mem]","throughput","5.000769e-01","1.999692e+00" 83 | "m256","movaps [mem] -> movq","latency","9.000519e+00","1.111047e-01" 84 | "m256","vmovdqu [mem+1]","throughput","5.000727e-01","1.999709e+00" 85 | "m256","vmovdqu [mem+1] -> movq","latency","9.000529e+00","1.111046e-01" 86 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.000410e+00","9.995901e-01" 87 | "m256","vmovdqu [mem+63] (cross cache) -> movq","latency","1.400071e+01","7.142495e-02" 88 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","3.779146e+00","2.646100e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page) -> movq","latency","1.600177e+01","6.249307e-02" 90 | "m256","vxorps","latency","2.570243e-01","3.890682e+00" 91 | "m256","vxorps","throughput","2.570322e-01","3.890563e+00" 92 | "m256","vmulps","latency","4.000289e+00","2.499819e-01" 93 | "m256","vmulps","throughput","5.000335e-01","1.999866e+00" 94 | "m256","vaddps","latency","4.000249e+00","2.499845e-01" 95 | "m256","vaddps","throughput","5.000305e-01","1.999878e+00" 96 | "m256","vdivps","latency","1.100066e+01","9.090361e-02" 97 | "m256","vdivps","throughput","5.000296e+00","1.999882e-01" 98 | "m256","vdivpd","latency","1.300074e+01","7.691869e-02" 99 | "m256","vdivpd","throughput","8.000452e+00","1.249929e-01" 100 | 
"m256","vrsqrtps","latency","4.000327e+00","2.499796e-01" 101 | "m256","vrsqrtps","throughput","1.000382e+00","9.996183e-01" 102 | "m256","vrcpps","latency","4.000168e+00","2.499895e-01" 103 | "m256","vrcpps","throughput","1.000066e+00","9.999339e-01" 104 | "m256","vsqrtps","latency","1.200064e+01","8.332892e-02" 105 | "m256","vsqrtps","throughput","6.000417e+00","1.666551e-01" 106 | "m256","vperm2f128","latency","3.000172e+00","3.333142e-01" 107 | "m256","vperm2f128","throughput","1.000059e+00","9.999411e-01" 108 | "m256","vpxor","latency","2.573901e-01","3.885153e+00" 109 | "m256","vpxor","throughput","2.570265e-01","3.890650e+00" 110 | "m256","vpaddd","latency","1.000558e+00","9.994425e-01" 111 | "m256","vpaddd","throughput","3.334179e-01","2.999239e+00" 112 | "m256","vpermps","latency","3.000142e+00","3.333175e-01" 113 | "m256","vpermps","throughput","1.000496e+00","9.995041e-01" 114 | "m256","vpermpd","latency","3.000169e+00","3.333146e-01" 115 | "m256","vpermpd","throughput","1.000066e+00","9.999343e-01" 116 | "m256","vpblendvb","latency","2.000192e+00","4.999520e-01" 117 | "m256","vpblendvb","throughput","1.000040e+00","9.999604e-01" 118 | "m256","vpmovmskb","throughput","1.000061e+00","9.999390e-01" 119 | "m256","vpmovsxwd","latency","3.000250e+00","3.333055e-01" 120 | "m256","vpmovsxwd","throughput","1.000360e+00","9.996396e-01" 121 | "m256","vpgatherdd","latency","2.200110e+01","4.545227e-02" 122 | "m256","vpgatherdd","throughput","5.000291e+00","1.999884e-01" 123 | "m256","gather32(x8 + perm)","latency","1.822330e+01","5.487481e-02" 124 | "m256","gather32(x8 + perm)","throughput","7.000430e+00","1.428484e-01" 125 | "m256","vgatherdpd","latency","2.000098e+01","4.999755e-02" 126 | "m256","vgatherdpd","throughput","4.000281e+00","2.499824e-01" 127 | "m256","gather64(x4 + perm)","latency","1.300076e+01","7.691856e-02" 128 | "m256","gather64(x4 + perm)","throughput","3.001265e+00","3.331928e-01" 129 | "m256","vpshufb","latency","1.000063e+00","9.999371e-01" 
130 | "m256","vpshufb","throughput","1.000056e+00","9.999443e-01" 131 | "m256","vfmaps","latency","4.000254e+00","2.499841e-01" 132 | "m256","vfmaps","throughput","5.000632e-01","1.999747e+00" 133 | "m256","vfmapd","latency","4.000286e+00","2.499821e-01" 134 | "m256","vfmapd","throughput","5.000761e-01","1.999696e+00" 135 | "m128","vfmaps","latency","4.000250e+00","2.499844e-01" 136 | "m128","vfmaps","throughput","5.000244e-01","1.999903e+00" 137 | "m128","vfmapd","latency","4.000352e+00","2.499780e-01" 138 | "m128","vfmapd","throughput","5.000307e-01","1.999877e+00" 139 | "reg64","popcnt","latency","3.000151e+00","3.333165e-01" 140 | "reg64","popcnt","throughput","1.000082e+00","9.999185e-01" 141 | "m128","aesenc","latency","4.000251e+00","2.499843e-01" 142 | "m128","aesenc","throughput","1.000029e+00","9.999708e-01" 143 | "m128","aesenclast","latency","4.000256e+00","2.499840e-01" 144 | "m128","aesenclast","throughput","1.000032e+00","9.999680e-01" 145 | "m128","aesdec","latency","4.000257e+00","2.499839e-01" 146 | "m128","aesdec","throughput","1.000269e+00","9.997316e-01" 147 | "m128","aesdeclast","latency","4.000144e+00","2.499910e-01" 148 | "m128","aesdeclast","throughput","1.000030e+00","9.999697e-01" 149 | "m128","pclmulqdq","latency","7.000372e+00","1.428496e-01" 150 | "m128","pclmulqdq","throughput","1.000024e+00","9.999763e-01" 151 | -------------------------------------------------------------------------------- /logs/linux/Intel(R)Core(TM)i7-4700MQCPU@2.40GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000219e+00","9.997815e-01" 3 | "reg64","add","throughput","2.656757e-01","3.763988e+00" 4 | "reg64","lea","latency","1.000019e+00","9.999806e-01" 5 | "reg64","lea","throughput","5.000532e-01","1.999787e+00" 6 | "reg64","xor dst,dst","latency","2.661717e-01","3.756973e+00" 7 | "reg64","xor dst,dst","throughput","2.659013e-01","3.760794e+00" 8 | 
"reg64","xor","latency","2.656692e-01","3.764079e+00" 9 | "reg64","xor","throughput","2.662120e-01","3.756405e+00" 10 | "reg64","load","latency","5.010743e+00","1.995712e-01" 11 | "reg64","load","throughput","6.251683e-01","1.599569e+00" 12 | "reg64","crc32","latency","3.000149e+00","3.333167e-01" 13 | "reg64","crc32","throughput","1.000017e+00","9.999827e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","1.219199e+01","8.202110e-02" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.270070e+00","7.873581e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","2.000366e+01","4.999085e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.400065e+01","7.142527e-02" 18 | "m128","pxor","latency","2.779562e-01","3.597689e+00" 19 | "m128","pxor","throughput","2.778522e-01","3.599036e+00" 20 | "m128","padd","latency","1.000526e+00","9.994741e-01" 21 | "m128","padd","throughput","5.001977e-01","1.999209e+00" 22 | "m128","pmuldq","latency","5.000263e+00","1.999895e-01" 23 | "m128","pmuldq","throughput","1.000072e+00","9.999282e-01" 24 | "m128","loadps","throughput","5.001884e-01","1.999247e+00" 25 | "m128","loadps->movq","latency","7.000783e+00","1.428412e-01" 26 | "m128","movq->movq","latency","2.000164e+00","4.999590e-01" 27 | "m128","movq->movq","throughput","1.000105e+00","9.998949e-01" 28 | "m128","xorps","latency","2.779717e-01","3.597489e+00" 29 | "m128","xorps","throughput","2.779439e-01","3.597848e+00" 30 | "m128","addps","latency","3.000144e+00","3.333173e-01" 31 | "m128","addps","throughput","1.000313e+00","9.996873e-01" 32 | "m128","mulps","latency","5.000260e+00","1.999896e-01" 33 | "m128","mulps","throughput","5.001286e-01","1.999486e+00" 34 | "m128","divps","latency","1.000843e+01","9.991579e-02" 35 | "m128","divps","throughput","7.034067e+00","1.421653e-01" 36 | "m128","divpd","latency","1.000920e+01","9.990812e-02" 37 | "m128","divpd","throughput","8.019140e+00","1.247017e-01" 38 | 
"m128","rsqrtps","latency","5.000263e+00","1.999895e-01" 39 | "m128","rsqrtps","throughput","1.000215e+00","9.997852e-01" 40 | "m128","rcpps","latency","5.000263e+00","1.999895e-01" 41 | "m128","rcpps","throughput","1.000246e+00","9.997536e-01" 42 | "m128","blendps","latency","1.000251e+00","9.997491e-01" 43 | "m128","blendps","throughput","3.334940e-01","2.998555e+00" 44 | "m128","blendvps","latency","2.000229e+00","4.999427e-01" 45 | "m128","blendvps","throughput","2.000036e+00","4.999909e-01" 46 | "m128","pshufb","latency","1.000888e+00","9.991128e-01" 47 | "m128","pshufb","throughput","1.000065e+00","9.999345e-01" 48 | "m128","shufps","latency","1.000062e+00","9.999383e-01" 49 | "m128","shufps","throughput","1.000093e+00","9.999074e-01" 50 | "m128","pmullw","latency","5.000262e+00","1.999895e-01" 51 | "m128","pmullw","throughput","1.000091e+00","9.999091e-01" 52 | "m128","phaddd","latency","3.000480e+00","3.332800e-01" 53 | "m128","phaddd","throughput","2.000186e+00","4.999535e-01" 54 | "m128","haddps","latency","3.000158e+00","3.333157e-01" 55 | "m128","haddps","throughput","2.000041e+00","4.999898e-01" 56 | "m128","pinsrd","latency","2.000191e+00","4.999524e-01" 57 | "m128","pinsrd","throughput","2.000156e+00","4.999610e-01" 58 | "m128","pinsrd->pexr","latency","4.000277e+00","2.499827e-01" 59 | "m128","dpps","latency","1.401425e+01","7.135593e-02" 60 | "m128","dpps","throughput","2.000167e+00","4.999582e-01" 61 | "m128","cvtps2dq","latency","3.000262e+00","3.333042e-01" 62 | "m128","cvtps2dq","throughput","1.000062e+00","9.999381e-01" 63 | "reg64","popcnt","latency","3.000237e+00","3.333070e-01" 64 | "reg64","popcnt","throughput","1.000123e+00","9.998775e-01" 65 | "m128","aesenc","latency","7.000410e+00","1.428488e-01" 66 | "m128","aesenc","throughput","1.000065e+00","9.999347e-01" 67 | "m128","aesenclast","latency","7.000379e+00","1.428494e-01" 68 | "m128","aesenclast","throughput","1.000276e+00","9.997244e-01" 69 | 
"m128","aesdec","latency","7.000263e+00","1.428518e-01" 70 | "m128","aesdec","throughput","1.000096e+00","9.999038e-01" 71 | "m128","aesdeclast","latency","7.000263e+00","1.428518e-01" 72 | "m128","aesdeclast","throughput","1.000134e+00","9.998659e-01" 73 | "m256","movaps [mem]","latency","1.000940e+00","9.990608e-01" 74 | "m256","movaps [mem]","throughput","5.001163e-01","1.999535e+00" 75 | "m256","vmovdqu [mem+1]","latency","1.000076e+00","9.999241e-01" 76 | "m256","vmovdqu [mem+1]","throughput","5.010683e-01","1.995736e+00" 77 | "m256","vmovdqu [mem+63] (cross cache)","latency","1.000083e+00","9.999167e-01" 78 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.000128e+00","9.998720e-01" 79 | "m256","vmovdqu [mem+2MB-1] (cross page)","latency","3.100160e+01","3.225640e-02" 80 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","3.100160e+01","3.225640e-02" 81 | "m256","xorps","latency","2.779607e-01","3.597631e+00" 82 | "m256","xorps","throughput","2.779371e-01","3.597936e+00" 83 | "m256","mulps","latency","5.000263e+00","1.999895e-01" 84 | "m256","mulps","throughput","5.000718e-01","1.999713e+00" 85 | "m256","addps","latency","3.000144e+00","3.333174e-01" 86 | "m256","addps","throughput","1.000639e+00","9.993617e-01" 87 | "m256","divps","latency","1.809844e+01","5.525338e-02" 88 | "m256","divps","throughput","1.410729e+01","7.088532e-02" 89 | "m256","divpd","latency","1.907993e+01","5.241109e-02" 90 | "m256","divpd","throughput","1.613453e+01","6.197888e-02" 91 | "m256","rsqrtps","latency","7.004499e+00","1.427654e-01" 92 | "m256","rsqrtps","throughput","2.000162e+00","4.999595e-01" 93 | "m256","rcpps","latency","7.001399e+00","1.428286e-01" 94 | "m256","rcpps","throughput","2.001819e+00","4.995457e-01" 95 | "m256","sqrtps","latency","1.808644e+01","5.529004e-02" 96 | "m256","sqrtps","throughput","1.410156e+01","7.091412e-02" 97 | "m256","vperm2f128","latency","3.002276e+00","3.330807e-01" 98 | 
"m256","vperm2f128","throughput","1.000279e+00","9.997212e-01" 99 | "m256","pxor","latency","2.780134e-01","3.596948e+00" 100 | "m256","pxor","throughput","2.779922e-01","3.597223e+00" 101 | "m256","paddd","latency","1.000516e+00","9.994840e-01" 102 | "m256","paddd","throughput","5.001782e-01","1.999287e+00" 103 | "m256","vpermps","latency","3.000144e+00","3.333174e-01" 104 | "m256","vpermps","throughput","1.000099e+00","9.999013e-01" 105 | "m256","vpermpd","latency","3.000812e+00","3.332432e-01" 106 | "m256","vpermpd","throughput","1.000895e+00","9.991063e-01" 107 | "m256","vpmovsxwd","latency","3.001752e+00","3.331388e-01" 108 | "m256","vpmovsxwd","throughput","1.000093e+00","9.999074e-01" 109 | "m256","vpgatherdd","latency","2.012555e+01","4.968808e-02" 110 | "m256","vpgatherdd","throughput","1.108836e+01","9.018468e-02" 111 | "m256","gather32(x8 + perm)","latency","1.700188e+01","5.881703e-02" 112 | "m256","gather32(x8 + perm)","throughput","8.013089e+00","1.247958e-01" 113 | "m256","vgatherdpd","latency","1.514010e+01","6.604976e-02" 114 | "m256","vgatherdpd","throughput","8.087612e+00","1.236459e-01" 115 | "m256","gather64(x4 + perm)","latency","1.201304e+01","8.324284e-02" 116 | "m256","gather64(x4 + perm)","throughput","4.000316e+00","2.499802e-01" 117 | "m256","vpshufb","latency","1.000215e+00","9.997847e-01" 118 | "m256","vpshufb","throughput","1.000261e+00","9.997390e-01" 119 | "m256","vfmaps","latency","5.001401e+00","1.999440e-01" 120 | "m256","vfmaps","throughput","5.011008e-01","1.995607e+00" 121 | "m256","vfmapd","latency","5.000264e+00","1.999895e-01" 122 | "m256","vfmapd","throughput","5.002753e-01","1.998899e+00" 123 | "m128","vfmaps","latency","5.000296e+00","1.999881e-01" 124 | "m128","vfmaps","throughput","5.010043e-01","1.995991e+00" 125 | "m128","vfmapd","latency","5.000297e+00","1.999881e-01" 126 | "m128","vfmapd","throughput","5.002647e-01","1.998942e+00" 127 | 
-------------------------------------------------------------------------------- /logs/linux/Intel(R)Core(TM)i7-6700CPU@3.40GHz.csv: -------------------------------------------------------------------------------- 1 | class,inst,l/t,cpi,ipc 2 | "reg64","add","latency","1.000020e+00","9.999802e-01" 3 | "reg64","add","throughput","2.540855e-01","3.935682e+00" 4 | "reg64","lea","latency","1.000575e+00","9.994257e-01" 5 | "reg64","lea","throughput","5.000175e-01","1.999930e+00" 6 | "reg64","xor dst,dst","latency","2.545247e-01","3.928892e+00" 7 | "reg64","xor dst,dst","throughput","2.539338e-01","3.938034e+00" 8 | "reg64","xor","latency","2.539352e-01","3.938012e+00" 9 | "reg64","xor","throughput","2.542155e-01","3.933671e+00" 10 | "reg64","load","latency","5.000976e+00","1.999610e-01" 11 | "reg64","load","throughput","6.258216e-01","1.597899e+00" 12 | "reg64","crc32","latency","3.000742e+00","3.332509e-01" 13 | "reg64","crc32","throughput","1.000013e+00","9.999871e-01" 14 | "reg64","store [mem+0]->load[mem+0]","latency","7.748371e+00","1.290594e-01" 15 | "reg64","store [mem+0]->load[mem+0]","throughput","1.349531e+00","7.409984e-01" 16 | "reg64","store [mem+0]->load[mem+1]","latency","1.900308e+01","5.262306e-02" 17 | "reg64","store [mem+0]->load[mem+1]","throughput","1.300233e+01","7.690931e-02" 18 | "m128","pxor","latency","2.569936e-01","3.891147e+00" 19 | "m128","pxor","throughput","2.569970e-01","3.891096e+00" 20 | "m128","padd","latency","1.000039e+00","9.999612e-01" 21 | "m128","padd","throughput","3.333666e-01","2.999701e+00" 22 | "m128","pmuldq","latency","5.000141e+00","1.999943e-01" 23 | "m128","pmuldq","throughput","5.000324e-01","1.999870e+00" 24 | "m128","loadps","throughput","5.000481e-01","1.999808e+00" 25 | "m128","loadps->movq","latency","8.003585e+00","1.249440e-01" 26 | "m128","movq->movq","latency","4.000140e+00","2.499912e-01" 27 | "m128","movq->movq","throughput","1.000037e+00","9.999629e-01" 28 | 
"m128","xorps","latency","2.569900e-01","3.891202e+00" 29 | "m128","xorps","throughput","2.569875e-01","3.891240e+00" 30 | "m128","addps","latency","4.000186e+00","2.499884e-01" 31 | "m128","addps","throughput","5.000316e-01","1.999874e+00" 32 | "m128","mulps","latency","4.001615e+00","2.498991e-01" 33 | "m128","mulps","throughput","5.000318e-01","1.999873e+00" 34 | "m128","divps","latency","1.100285e+01","9.088551e-02" 35 | "m128","divps","throughput","3.001339e+00","3.331846e-01" 36 | "m128","divpd","latency","1.300140e+01","7.691480e-02" 37 | "m128","divpd","throughput","4.001186e+00","2.499259e-01" 38 | "m128","rsqrtps","latency","4.001809e+00","2.498870e-01" 39 | "m128","rsqrtps","throughput","1.000234e+00","9.997665e-01" 40 | "m128","rcpps","latency","4.000138e+00","2.499914e-01" 41 | "m128","rcpps","throughput","1.000035e+00","9.999652e-01" 42 | "m128","blendps","latency","1.000021e+00","9.999786e-01" 43 | "m128","blendps","throughput","3.333778e-01","2.999600e+00" 44 | "m128","blendvps","latency","1.000026e+00","9.999737e-01" 45 | "m128","blendvps","throughput","5.069906e-01","1.972423e+00" 46 | "m128","pshufb","latency","1.000025e+00","9.999746e-01" 47 | "m128","pshufb","throughput","1.000025e+00","9.999750e-01" 48 | "m128","shufps","latency","1.000530e+00","9.994707e-01" 49 | "m128","shufps","throughput","1.000024e+00","9.999756e-01" 50 | "m128","pmullw","latency","5.001155e+00","1.999538e-01" 51 | "m128","pmullw","throughput","5.000261e-01","1.999896e+00" 52 | "m128","phaddd","latency","3.001378e+00","3.331803e-01" 53 | "m128","phaddd","throughput","2.000195e+00","4.999513e-01" 54 | "m128","haddps","latency","3.000156e+00","3.333161e-01" 55 | "m128","haddps","throughput","2.000036e+00","4.999910e-01" 56 | "m128","pinsrd","latency","2.000031e+00","4.999923e-01" 57 | "m128","pinsrd","throughput","2.000147e+00","4.999632e-01" 58 | "m128","pinsrd->pextr","latency","6.001041e+00","1.666378e-01" 59 | "m128","dpps","latency","1.300147e+01","7.691435e-02" 60 | 
"m128","dpps","throughput","1.598178e+00","6.257124e-01" 61 | "m128","cvtps2dq","latency","4.002718e+00","2.498302e-01" 62 | "m128","cvtps2dq","throughput","5.000326e-01","1.999869e+00" 63 | "m128","pmovmskb","throughput","1.000967e+00","9.990343e-01" 64 | "m128","pmovmskb->movq","latency","4.001719e+00","2.498926e-01" 65 | "m128","movq->movq","latency","4.001336e+00","2.499165e-01" 66 | "m128","movaps [mem]","latency","8.001709e+00","1.249733e-01" 67 | "m128","movaps [mem]","throughput","5.000579e-01","1.999769e+00" 68 | "m128","movdqu [mem+1]","latency","8.001207e+00","1.249811e-01" 69 | "m128","movdqu [mem+1]","throughput","5.000439e-01","1.999825e+00" 70 | "m128","movdqu [mem+63] (cross cache)","latency","1.400232e+01","7.141676e-02" 71 | "m128","movdqu [mem+63] (cross cache)","throughput","1.000043e+00","9.999570e-01" 72 | "m128","movdqu [mem+2MB-1] (cross page)","latency","1.500256e+01","6.665527e-02" 73 | "m128","movdqu [mem+2MB-1] (cross page)","throughput","3.898752e+00","2.564924e-01" 74 | "m128","pcmpistri","throughput","3.002261e+00","3.330822e-01" 75 | "m128","pcmpistri->movq","latency","1.200223e+01","8.331787e-02" 76 | "m128","pcmpistrm","throughput","3.000148e+00","3.333169e-01" 77 | "m128","pcmpistrm","latency","8.852356e+00","1.129643e-01" 78 | "m128","pcmpestri","throughput","4.027953e+00","2.482650e-01" 79 | "m128","pcmpestri->movq","latency","1.200179e+01","8.332091e-02" 80 | "m128","pcmpestrm","throughput","5.029309e+00","1.988345e-01" 81 | "m128","pcmpestrm","latency","9.003110e+00","1.110727e-01" 82 | "m256","movaps [mem]","latency","1.000074e+00","9.999260e-01" 83 | "m256","movaps [mem]","throughput","5.000869e-01","1.999652e+00" 84 | "m256","vmovdqu [mem+1]","latency","1.000069e+00","9.999313e-01" 85 | "m256","vmovdqu [mem+1]","throughput","5.000909e-01","1.999636e+00" 86 | "m256","vmovdqu [mem+63] (cross cache)","latency","1.000072e+00","9.999275e-01" 87 | "m256","vmovdqu [mem+63] (cross cache)","throughput","1.000073e+00","9.999271e-01" 
88 | "m256","vmovdqu [mem+2MB-1] (cross page)","latency","3.903431e+00","2.561849e-01" 89 | "m256","vmovdqu [mem+2MB-1] (cross page)","throughput","3.845849e+00","2.600206e-01" 90 | "m256","vxorps","latency","2.570284e-01","3.890621e+00" 91 | "m256","vxorps","throughput","2.570220e-01","3.890717e+00" 92 | "m256","vmulps","latency","4.001090e+00","2.499319e-01" 93 | "m256","vmulps","throughput","5.000680e-01","1.999728e+00" 94 | "m256","vaddps","latency","4.001816e+00","2.498865e-01" 95 | "m256","vaddps","throughput","5.000663e-01","1.999735e+00" 96 | "m256","vdivps","latency","1.100190e+01","9.089342e-02" 97 | "m256","vdivps","throughput","5.001701e+00","1.999320e-01" 98 | "m256","vdivpd","latency","1.300177e+01","7.691261e-02" 99 | "m256","vdivpd","throughput","8.001240e+00","1.249806e-01" 100 | "m256","vrsqrtps","latency","4.001621e+00","2.498988e-01" 101 | "m256","vrsqrtps","throughput","1.000062e+00","9.999381e-01" 102 | "m256","vrcpps","latency","4.000178e+00","2.499888e-01" 103 | "m256","vrcpps","throughput","1.000067e+00","9.999332e-01" 104 | "m256","vsqrtps","latency","1.200285e+01","8.331357e-02" 105 | "m256","vsqrtps","throughput","6.001575e+00","1.666229e-01" 106 | "m256","vperm2f128","latency","3.001290e+00","3.331900e-01" 107 | "m256","vperm2f128","throughput","1.000071e+00","9.999288e-01" 108 | "m256","vpxor","latency","2.570248e-01","3.890676e+00" 109 | "m256","vpxor","throughput","2.570250e-01","3.890672e+00" 110 | "m256","vpaddd","latency","1.000072e+00","9.999279e-01" 111 | "m256","vpaddd","throughput","3.334043e-01","2.999361e+00" 112 | "m256","vpermps","latency","3.002213e+00","3.330877e-01" 113 | "m256","vpermps","throughput","1.000402e+00","9.995983e-01" 114 | "m256","vpermpd","latency","3.000221e+00","3.333088e-01" 115 | "m256","vpermpd","throughput","1.000064e+00","9.999356e-01" 116 | "m256","vpblendvb","latency","2.000234e+00","4.999416e-01" 117 | "m256","vpblendvb","throughput","1.000075e+00","9.999246e-01" 118 | 
"m256","vpmovmskb","throughput","1.000065e+00","9.999352e-01" 119 | "m256","vpmovsxwd","latency","3.001592e+00","3.331565e-01" 120 | "m256","vpmovsxwd","throughput","1.000063e+00","9.999375e-01" 121 | "m256","vpgatherdd","latency","2.200167e+01","4.545109e-02" 122 | "m256","vpgatherdd","throughput","5.001285e+00","1.999486e-01" 123 | "m256","gather32(x8 + perm)","latency","1.800249e+01","5.554786e-02" 124 | "m256","gather32(x8 + perm)","throughput","7.001212e+00","1.428324e-01" 125 | "m256","vgatherdpd","latency","2.000172e+01","4.999569e-02" 126 | "m256","vgatherdpd","throughput","4.001633e+00","2.498980e-01" 127 | "m256","gather64(x4 + perm)","latency","1.300153e+01","7.691404e-02" 128 | "m256","gather64(x4 + perm)","throughput","3.000079e+00","3.333246e-01" 129 | "m256","vpshufb","latency","1.000061e+00","9.999385e-01" 130 | "m256","vpshufb","throughput","1.000555e+00","9.994453e-01" 131 | "m256","vfmaps","latency","4.001305e+00","2.499185e-01" 132 | "m256","vfmaps","throughput","5.002579e-01","1.998969e+00" 133 | "m256","vfmapd","latency","4.000172e+00","2.499892e-01" 134 | "m256","vfmapd","throughput","5.000678e-01","1.999729e+00" 135 | "m128","vfmaps","latency","4.001049e+00","2.499345e-01" 136 | "m128","vfmaps","throughput","5.000265e-01","1.999894e+00" 137 | "m128","vfmapd","latency","4.000136e+00","2.499915e-01" 138 | "m128","vfmapd","throughput","5.000242e-01","1.999903e+00" 139 | "reg64","popcnt","latency","3.000594e+00","3.332673e-01" 140 | "reg64","popcnt","throughput","1.000012e+00","9.999878e-01" 141 | "m128","aesenc","latency","4.001003e+00","2.499373e-01" 142 | "m128","aesenc","throughput","1.001865e+00","9.981389e-01" 143 | "m128","aesenclast","latency","4.000143e+00","2.499911e-01" 144 | "m128","aesenclast","throughput","1.001960e+00","9.980439e-01" 145 | "m128","aesdec","latency","4.001334e+00","2.499166e-01" 146 | "m128","aesdec","throughput","1.000026e+00","9.999741e-01" 147 | "m128","aesdeclast","latency","4.000175e+00","2.499890e-01" 148 | 
"m128","aesdeclast","throughput","1.000025e+00","9.999746e-01" 149 | "m128","pclmulqdq","latency","7.001219e+00","1.428323e-01" 150 | "m128","pclmulqdq","throughput","1.000031e+00","9.999691e-01" 151 | -------------------------------------------------------------------------------- /logs/w32/.tmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tanakamura/instruction-bench/f8a5205266cbddfc8361fdaaae050355fd981e13/logs/w32/.tmp -------------------------------------------------------------------------------- /skl-sub.log: -------------------------------------------------------------------------------- 1 | Intel(R) Celeron(R) CPU G3900 @ 2.80GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.00, IPC= 1.00 4 | reg64: add:throughput: CPI= 0.25, IPC= 3.94 5 | reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.50, IPC= 2.00 7 | reg64: xor dst,dst: latency: CPI= 0.25, IPC= 3.94 8 | reg64: xor dst,dst:throughput: CPI= 0.25, IPC= 3.94 9 | reg64: xor: latency: CPI= 0.25, IPC= 3.94 10 | reg64: xor:throughput: CPI= 0.25, IPC= 3.94 11 | reg64: load: latency: CPI= 5.00, IPC= 0.20 12 | reg64: load:throughput: CPI= 0.63, IPC= 1.60 13 | reg64: crc32: latency: CPI= 3.00, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.00, IPC= 1.00 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 7.75, IPC= 0.13 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 1.35, IPC= 0.74 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 19.00, IPC= 0.05 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 13.00, IPC= 0.08 19 | m128: pxor: latency: CPI= 0.26, IPC= 3.89 20 | m128: pxor:throughput: CPI= 0.26, IPC= 3.89 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.33, IPC= 3.00 23 | m128: pmuldq: latency: CPI= 5.00, IPC= 0.20 24 | m128: pmuldq:throughput: CPI= 0.50, IPC= 2.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 2.00 26 | m128: loadps->movq: 
latency: CPI= 8.00, IPC= 0.12 27 | m128: movq->movq: latency: CPI= 4.00, IPC= 0.25 28 | m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.26, IPC= 3.89 30 | m128: xorps:throughput: CPI= 0.26, IPC= 3.89 31 | m128: addps: latency: CPI= 4.00, IPC= 0.25 32 | m128: addps:throughput: CPI= 0.50, IPC= 2.00 33 | m128: mulps: latency: CPI= 4.00, IPC= 0.25 34 | m128: mulps:throughput: CPI= 0.50, IPC= 2.00 35 | m128: divps: latency: CPI= 11.00, IPC= 0.09 36 | m128: divps:throughput: CPI= 3.00, IPC= 0.33 37 | m128: divpd: latency: CPI= 13.00, IPC= 0.08 38 | m128: divpd:throughput: CPI= 4.00, IPC= 0.25 39 | m128: rsqrtps: latency: CPI= 4.00, IPC= 0.25 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 4.00, IPC= 0.25 42 | m128: rcpps:throughput: CPI= 1.00, IPC= 1.00 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.33, IPC= 3.00 45 | m128: blendvps: latency: CPI= 1.00, IPC= 1.00 46 | m128: blendvps:throughput: CPI= 0.51, IPC= 1.97 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 1.00, IPC= 1.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 1.00, IPC= 1.00 51 | m128: pmullw: latency: CPI= 5.00, IPC= 0.20 52 | m128: pmullw:throughput: CPI= 0.50, IPC= 2.00 53 | m128: phaddd: latency: CPI= 3.00, IPC= 0.33 54 | m128: phaddd:throughput: CPI= 2.00, IPC= 0.50 55 | m128: haddps: latency: CPI= 3.00, IPC= 0.33 56 | m128: haddps:throughput: CPI= 2.00, IPC= 0.50 57 | m128: pinsrd: latency: CPI= 2.00, IPC= 0.50 58 | m128: pinsrd:throughput: CPI= 2.00, IPC= 0.50 59 | m128: pinsrd->pextr: latency: CPI= 6.00, IPC= 0.17 60 | m128: dpps: latency: CPI= 13.00, IPC= 0.08 61 | m128: dpps:throughput: CPI= 1.60, IPC= 0.63 62 | m128: cvtps2dq: latency: CPI= 4.00, IPC= 0.25 63 | m128: cvtps2dq:throughput: CPI= 0.50, IPC= 2.00 64 | m128: pmovmskb:throughput: CPI= 1.00, IPC= 1.00 65 | m128: pmovmskb->movq: latency: CPI= 4.00, IPC= 
0.25 66 | m128: movq->movq: latency: CPI= 4.00, IPC= 0.25 67 | m128: movaps [mem]: latency: CPI= 8.00, IPC= 0.12 68 | m128: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 69 | m128: movdqu [mem+1]: latency: CPI= 8.00, IPC= 0.12 70 | m128: movdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 14.00, IPC= 0.07 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 15.00, IPC= 0.07 74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 3.77, IPC= 0.27 75 | m128: pcmpistri:throughput: CPI= 3.00, IPC= 0.33 76 | m128: pcmpistri->movq: latency: CPI= 12.00, IPC= 0.08 77 | m128: pcmpistrm:throughput: CPI= 3.00, IPC= 0.33 78 | m128: pcmpistrm: latency: CPI= 8.85, IPC= 0.11 79 | m128: pcmpestri:throughput: CPI= 4.03, IPC= 0.25 80 | m128: pcmpestri->movq: latency: CPI= 12.00, IPC= 0.08 81 | m128: pcmpestrm:throughput: CPI= 5.03, IPC= 0.20 82 | m128: pcmpestrm: latency: CPI= 9.00, IPC= 0.11 83 | reg64: popcnt: latency: CPI= 3.00, IPC= 0.33 84 | reg64: popcnt:throughput: CPI= 1.00, IPC= 1.00 85 | m128: aesenc: latency: CPI= 4.00, IPC= 0.25 86 | m128: aesenc:throughput: CPI= 1.00, IPC= 1.00 87 | m128: aesenclast: latency: CPI= 4.00, IPC= 0.25 88 | m128: aesenclast:throughput: CPI= 1.00, IPC= 1.00 89 | m128: aesdec: latency: CPI= 4.00, IPC= 0.25 90 | m128: aesdec:throughput: CPI= 1.00, IPC= 1.00 91 | m128: aesdeclast: latency: CPI= 4.00, IPC= 0.25 92 | m128: aesdeclast:throughput: CPI= 1.00, IPC= 1.00 93 | m128: pclmulqdq: latency: CPI= 7.00, IPC= 0.14 94 | m128: pclmulqdq:throughput: CPI= 1.00, IPC= 1.00 95 | -------------------------------------------------------------------------------- /skl.log: -------------------------------------------------------------------------------- 1 | Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.00, IPC= 1.00 4 | reg64: add:throughput: CPI= 0.25, IPC= 3.94 5 | 
reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.50, IPC= 2.00 7 | reg64: xor dst,dst: latency: CPI= 0.25, IPC= 3.93 8 | reg64: xor dst,dst:throughput: CPI= 0.25, IPC= 3.94 9 | reg64: xor: latency: CPI= 0.25, IPC= 3.94 10 | reg64: xor:throughput: CPI= 0.25, IPC= 3.93 11 | reg64: load: latency: CPI= 5.00, IPC= 0.20 12 | reg64: load:throughput: CPI= 0.63, IPC= 1.60 13 | reg64: crc32: latency: CPI= 3.00, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.00, IPC= 1.00 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 7.75, IPC= 0.13 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 1.35, IPC= 0.74 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 19.00, IPC= 0.05 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 13.00, IPC= 0.08 19 | m128: pxor: latency: CPI= 0.26, IPC= 3.89 20 | m128: pxor:throughput: CPI= 0.26, IPC= 3.89 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.33, IPC= 3.00 23 | m128: pmuldq: latency: CPI= 5.00, IPC= 0.20 24 | m128: pmuldq:throughput: CPI= 0.50, IPC= 2.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 2.00 26 | m128: loadps->movq: latency: CPI= 8.00, IPC= 0.12 27 | m128: movq->movq: latency: CPI= 4.00, IPC= 0.25 28 | m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.26, IPC= 3.89 30 | m128: xorps:throughput: CPI= 0.26, IPC= 3.89 31 | m128: addps: latency: CPI= 4.00, IPC= 0.25 32 | m128: addps:throughput: CPI= 0.50, IPC= 2.00 33 | m128: mulps: latency: CPI= 4.00, IPC= 0.25 34 | m128: mulps:throughput: CPI= 0.50, IPC= 2.00 35 | m128: divps: latency: CPI= 11.00, IPC= 0.09 36 | m128: divps:throughput: CPI= 3.00, IPC= 0.33 37 | m128: divpd: latency: CPI= 13.00, IPC= 0.08 38 | m128: divpd:throughput: CPI= 4.00, IPC= 0.25 39 | m128: rsqrtps: latency: CPI= 4.00, IPC= 0.25 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 4.00, IPC= 0.25 42 | m128: rcpps:throughput: CPI= 1.00, IPC= 1.00 43 | m128: blendps: 
latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.33, IPC= 3.00 45 | m128: blendvps: latency: CPI= 1.00, IPC= 1.00 46 | m128: blendvps:throughput: CPI= 0.51, IPC= 1.97 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 1.00, IPC= 1.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 1.00, IPC= 1.00 51 | m128: pmullw: latency: CPI= 5.00, IPC= 0.20 52 | m128: pmullw:throughput: CPI= 0.50, IPC= 2.00 53 | m128: phaddd: latency: CPI= 3.00, IPC= 0.33 54 | m128: phaddd:throughput: CPI= 2.00, IPC= 0.50 55 | m128: haddps: latency: CPI= 3.00, IPC= 0.33 56 | m128: haddps:throughput: CPI= 2.00, IPC= 0.50 57 | m128: pinsrd: latency: CPI= 2.00, IPC= 0.50 58 | m128: pinsrd:throughput: CPI= 2.00, IPC= 0.50 59 | m128: pinsrd->pextr: latency: CPI= 6.00, IPC= 0.17 60 | m128: dpps: latency: CPI= 13.00, IPC= 0.08 61 | m128: dpps:throughput: CPI= 1.60, IPC= 0.63 62 | m128: cvtps2dq: latency: CPI= 4.00, IPC= 0.25 63 | m128: cvtps2dq:throughput: CPI= 0.50, IPC= 2.00 64 | m128: pmovmskb:throughput: CPI= 1.00, IPC= 1.00 65 | m128: pmovmskb->movq: latency: CPI= 4.00, IPC= 0.25 66 | m128: movq->movq: latency: CPI= 4.00, IPC= 0.25 67 | m128: movaps [mem]: latency: CPI= 8.00, IPC= 0.12 68 | m128: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 69 | m128: movdqu [mem+1]: latency: CPI= 8.00, IPC= 0.12 70 | m128: movdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 14.00, IPC= 0.07 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 15.00, IPC= 0.07 74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 3.90, IPC= 0.26 75 | m128: pcmpistri:throughput: CPI= 3.00, IPC= 0.33 76 | m128: pcmpistri->movq: latency: CPI= 12.00, IPC= 0.08 77 | m128: pcmpistrm:throughput: CPI= 3.00, IPC= 0.33 78 | m128: pcmpistrm: latency: CPI= 8.85, IPC= 0.11 79 | m128: pcmpestri:throughput: CPI= 4.03, IPC= 
0.25 80 | m128: pcmpestri->movq: latency: CPI= 12.00, IPC= 0.08 81 | m128: pcmpestrm:throughput: CPI= 5.03, IPC= 0.20 82 | m128: pcmpestrm: latency: CPI= 9.00, IPC= 0.11 83 | m256: movaps [mem]: latency: CPI= 1.00, IPC= 1.00 84 | m256: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 85 | m256: vmovdqu [mem+1]: latency: CPI= 1.00, IPC= 1.00 86 | m256: vmovdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 87 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.00, IPC= 1.00 88 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 89 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 3.90, IPC= 0.26 90 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 3.85, IPC= 0.26 91 | m256: vxorps: latency: CPI= 0.26, IPC= 3.89 92 | m256: vxorps:throughput: CPI= 0.26, IPC= 3.89 93 | m256: vmulps: latency: CPI= 4.00, IPC= 0.25 94 | m256: vmulps:throughput: CPI= 0.50, IPC= 2.00 95 | m256: vaddps: latency: CPI= 4.00, IPC= 0.25 96 | m256: vaddps:throughput: CPI= 0.50, IPC= 2.00 97 | m256: vdivps: latency: CPI= 11.00, IPC= 0.09 98 | m256: vdivps:throughput: CPI= 5.00, IPC= 0.20 99 | m256: vdivpd: latency: CPI= 13.00, IPC= 0.08 100 | m256: vdivpd:throughput: CPI= 8.00, IPC= 0.12 101 | m256: vrsqrtps: latency: CPI= 4.00, IPC= 0.25 102 | m256: vrsqrtps:throughput: CPI= 1.00, IPC= 1.00 103 | m256: vrcpps: latency: CPI= 4.00, IPC= 0.25 104 | m256: vrcpps:throughput: CPI= 1.00, IPC= 1.00 105 | m256: vsqrtps: latency: CPI= 12.00, IPC= 0.08 106 | m256: vsqrtps:throughput: CPI= 6.00, IPC= 0.17 107 | m256: vperm2f128: latency: CPI= 3.00, IPC= 0.33 108 | m256: vperm2f128:throughput: CPI= 1.00, IPC= 1.00 109 | m256: vpxor: latency: CPI= 0.26, IPC= 3.89 110 | m256: vpxor:throughput: CPI= 0.26, IPC= 3.89 111 | m256: vpaddd: latency: CPI= 1.00, IPC= 1.00 112 | m256: vpaddd:throughput: CPI= 0.33, IPC= 3.00 113 | m256: vpermps: latency: CPI= 3.00, IPC= 0.33 114 | m256: vpermps:throughput: CPI= 1.00, IPC= 1.00 115 | m256: vpermpd: latency: CPI= 3.00, IPC= 0.33 116 | m256: 
vpermpd:throughput: CPI= 1.00, IPC= 1.00 117 | m256: vpblendvb: latency: CPI= 2.00, IPC= 0.50 118 | m256: vpblendvb:throughput: CPI= 1.00, IPC= 1.00 119 | m256: vpmovmskb:throughput: CPI= 1.00, IPC= 1.00 120 | m256: vpmovsxwd: latency: CPI= 3.00, IPC= 0.33 121 | m256: vpmovsxwd:throughput: CPI= 1.00, IPC= 1.00 122 | m256: vpgatherdd: latency: CPI= 22.00, IPC= 0.05 123 | m256: vpgatherdd:throughput: CPI= 5.00, IPC= 0.20 124 | m256: gather32(x8 + perm): latency: CPI= 18.00, IPC= 0.06 125 | m256: gather32(x8 + perm):throughput: CPI= 7.00, IPC= 0.14 126 | m256: vgatherdpd: latency: CPI= 20.00, IPC= 0.05 127 | m256: vgatherdpd:throughput: CPI= 4.00, IPC= 0.25 128 | m256: gather64(x4 + perm): latency: CPI= 13.00, IPC= 0.08 129 | m256: gather64(x4 + perm):throughput: CPI= 3.00, IPC= 0.33 130 | m256: vpshufb: latency: CPI= 1.00, IPC= 1.00 131 | m256: vpshufb:throughput: CPI= 1.00, IPC= 1.00 132 | m256: vfmaps: latency: CPI= 4.00, IPC= 0.25 133 | m256: vfmaps:throughput: CPI= 0.50, IPC= 2.00 134 | m256: vfmapd: latency: CPI= 4.00, IPC= 0.25 135 | m256: vfmapd:throughput: CPI= 0.50, IPC= 2.00 136 | m128: vfmaps: latency: CPI= 4.00, IPC= 0.25 137 | m128: vfmaps:throughput: CPI= 0.50, IPC= 2.00 138 | m128: vfmapd: latency: CPI= 4.00, IPC= 0.25 139 | m128: vfmapd:throughput: CPI= 0.50, IPC= 2.00 140 | reg64: popcnt: latency: CPI= 3.00, IPC= 0.33 141 | reg64: popcnt:throughput: CPI= 1.00, IPC= 1.00 142 | m128: aesenc: latency: CPI= 4.00, IPC= 0.25 143 | m128: aesenc:throughput: CPI= 1.00, IPC= 1.00 144 | m128: aesenclast: latency: CPI= 4.00, IPC= 0.25 145 | m128: aesenclast:throughput: CPI= 1.00, IPC= 1.00 146 | m128: aesdec: latency: CPI= 4.00, IPC= 0.25 147 | m128: aesdec:throughput: CPI= 1.00, IPC= 1.00 148 | m128: aesdeclast: latency: CPI= 4.00, IPC= 0.25 149 | m128: aesdeclast:throughput: CPI= 1.00, IPC= 1.00 150 | m128: pclmulqdq: latency: CPI= 7.00, IPC= 0.14 151 | m128: pclmulqdq:throughput: CPI= 1.00, IPC= 1.00 152 | 
-------------------------------------------------------------------------------- /slm.log: -------------------------------------------------------------------------------- 1 | Intel(R) Celeron(R) CPU N2807 @ 1.58GHz 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.03, IPC= 0.97 4 | reg64: add:throughput: CPI= 0.53, IPC= 1.88 5 | reg64: lea: latency: CPI= 1.01, IPC= 0.99 6 | reg64: lea:throughput: CPI= 0.53, IPC= 1.88 7 | reg64: xor dst,dst: latency: CPI= 1.03, IPC= 0.97 8 | reg64: xor dst,dst:throughput: CPI= 0.53, IPC= 1.88 9 | reg64: xor: latency: CPI= 1.03, IPC= 0.97 10 | reg64: xor:throughput: CPI= 0.53, IPC= 1.88 11 | reg64: load: latency: CPI= 3.00, IPC= 0.33 12 | reg64: load:throughput: CPI= 1.00, IPC= 1.00 13 | reg64: crc32: latency: CPI= 5.98, IPC= 0.17 14 | reg64: crc32:throughput: CPI= 5.99, IPC= 0.17 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 4.00, IPC= 0.25 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 2.00, IPC= 0.50 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 14.00, IPC= 0.07 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 11.60, IPC= 0.09 19 | m128: pxor: latency: CPI= 0.56, IPC= 1.80 20 | m128: pxor:throughput: CPI= 0.56, IPC= 1.80 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.56, IPC= 1.80 23 | m128: pmuldq: latency: CPI= 5.97, IPC= 0.17 24 | m128: pmuldq:throughput: CPI= 4.14, IPC= 0.24 25 | m128: loadps:throughput: CPI= 1.00, IPC= 1.00 26 | m128: loadps->movq: latency: CPI= 8.00, IPC= 0.12 27 | m128: movq->movq: latency: CPI= 7.00, IPC= 0.14 28 | m128: movq->movq:throughput: CPI= 7.00, IPC= 0.14 29 | m128: xorps: latency: CPI= 0.56, IPC= 1.80 30 | m128: xorps:throughput: CPI= 0.56, IPC= 1.80 31 | m128: addps: latency: CPI= 3.00, IPC= 0.33 32 | m128: addps:throughput: CPI= 1.00, IPC= 1.00 33 | m128: mulps: latency: CPI= 5.00, IPC= 0.20 34 | m128: mulps:throughput: CPI= 2.00, IPC= 0.50 35 | m128: divps: latency: CPI= 27.00, IPC= 0.04 36 | m128: divps:throughput: 
CPI= 27.00, IPC= 0.04 37 | m128: divpd: latency: CPI= 27.00, IPC= 0.04 38 | m128: divpd:throughput: CPI= 27.00, IPC= 0.04 39 | m128: rsqrtps: latency: CPI= 9.00, IPC= 0.11 40 | m128: rsqrtps:throughput: CPI= 8.00, IPC= 0.12 41 | m128: rcpps: latency: CPI= 9.00, IPC= 0.11 42 | m128: rcpps:throughput: CPI= 8.00, IPC= 0.12 43 | m128: blendps: latency: CPI= 5.97, IPC= 0.17 44 | m128: blendps:throughput: CPI= 4.14, IPC= 0.24 45 | m128: blendvps: latency: CPI= 7.03, IPC= 0.14 46 | m128: blendvps:throughput: CPI= 8.97, IPC= 0.11 47 | m128: pshufb: latency: CPI= 8.03, IPC= 0.12 48 | m128: pshufb:throughput: CPI= 7.03, IPC= 0.14 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 1.00, IPC= 1.00 51 | m128: pmullw: latency: CPI= 5.00, IPC= 0.20 52 | m128: pmullw:throughput: CPI= 2.00, IPC= 0.50 53 | m128: phaddd: latency: CPI= 9.03, IPC= 0.11 54 | m128: phaddd:throughput: CPI= 8.03, IPC= 0.12 55 | m128: haddps: latency: CPI= 9.03, IPC= 0.11 56 | m128: haddps:throughput: CPI= 8.03, IPC= 0.12 57 | m128: pinsrd: latency: CPI= 5.97, IPC= 0.17 58 | m128: pinsrd:throughput: CPI= 4.14, IPC= 0.24 59 | m128: pinsrd->pextr: latency: CPI= 13.03, IPC= 0.08 60 | m128: dpps: latency: CPI= 15.00, IPC= 0.07 61 | m128: dpps:throughput: CPI= 13.03, IPC= 0.08 62 | m128: cvtps2dq: latency: CPI= 5.00, IPC= 0.20 63 | m128: cvtps2dq:throughput: CPI= 2.00, IPC= 0.50 64 | m128: pmovmskb:throughput: CPI= 1.33, IPC= 0.75 65 | m128: pmovmskb->movq: latency: CPI= 7.00, IPC= 0.14 66 | m128: movq->movq: latency: CPI= 7.00, IPC= 0.14 67 | m128: movaps [mem]: latency: CPI= 8.00, IPC= 0.12 68 | m128: movaps [mem]:throughput: CPI= 1.00, IPC= 1.00 69 | m128: movdqu [mem+1]: latency: CPI= 8.00, IPC= 0.12 70 | m128: movdqu [mem+1]:throughput: CPI= 1.00, IPC= 1.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 14.00, IPC= 0.07 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 3.00, IPC= 0.33 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 17.00, IPC= 0.06 
74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 9.53, IPC= 0.10 75 | m128: pcmpistri:throughput: CPI= 17.00, IPC= 0.06 76 | m128: pcmpistri->movq: latency: CPI= 18.00, IPC= 0.06 77 | m128: pcmpistrm:throughput: CPI= 13.00, IPC= 0.08 78 | m128: pcmpistrm: latency: CPI= 13.00, IPC= 0.08 79 | m128: pcmpestri:throughput: CPI= 21.00, IPC= 0.05 80 | m128: pcmpestri->movq: latency: CPI= 23.00, IPC= 0.04 81 | m128: pcmpestrm:throughput: CPI= 17.00, IPC= 0.06 82 | m128: pcmpestrm: latency: CPI= 17.00, IPC= 0.06 83 | reg64: popcnt: latency: CPI= 3.03, IPC= 0.33 84 | reg64: popcnt:throughput: CPI= 1.14, IPC= 0.88 85 | m128: pclmulqdq: latency: CPI= 12.03, IPC= 0.08 86 | m128: pclmulqdq:throughput: CPI= 11.03, IPC= 0.09 87 | -------------------------------------------------------------------------------- /sse.cpp: -------------------------------------------------------------------------------- 1 | #include "common.hpp" 2 | 3 | void test_sse() 4 | { 5 | /* 128 */ 6 | GEN_throughput_only(Xmm, "loadps", 7 | (g->movaps(dst, g->ptr[g->rdx])), 8 | false, OT_INT); 9 | 10 | GEN_latency_only(Xmm, "loadps->movq", 11 | (g->movaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, dst));, 12 | false, OT_INT); 13 | 14 | GEN(Xmm, "movq->movq", 15 | (g->movq(g->rdi,src));(g->movq(dst,g->rdi));, 16 | false, OT_INT); 17 | 18 | GEN(Xmm, "xorps", (g->xorps(dst, src)), false, OT_FP32); 19 | GEN(Xmm, "addps", (g->addps(dst, src)), false, OT_FP32); 20 | GEN(Xmm, "mulps", (g->mulps(dst, src)), false, OT_FP32); 21 | GEN(Xmm, "divps", (g->divps(dst, src)), false, OT_FP32); 22 | GEN(Xmm, "divpd", (g->divpd(dst, src)), false, OT_FP64); 23 | GEN(Xmm, "rsqrtps", (g->rsqrtps(dst, dst)), false, OT_FP32); 24 | GEN(Xmm, "rcpps", (g->rcpps(dst, dst)), false, OT_FP32); 25 | GEN(Xmm, "blendps", (g->blendps(dst, src, 0)), false, OT_FP32); 26 | GEN_latency(Xmm, "blendvps", 27 | (g->blendvps(dst, src));(g->xorps(dst,dst)), 28 | (g->blendvps(dst, src)), 29 | false, OT_FP32); 30 | GEN(Xmm, "pshufb", 
(g->pshufb(dst, src)), false, OT_INT); 31 | GEN(Xmm, "shufps", (g->shufps(dst, src, 0)), false, OT_FP32); 32 | GEN(Xmm, "pmullw", (g->pmullw(dst, src)), false, OT_INT); 33 | GEN(Xmm, "phaddd", (g->phaddd(dst, src)), false, OT_INT); 34 | GEN(Xmm, "haddps", (g->phaddd(dst, src)), false, OT_FP32); 35 | 36 | GEN(Xmm, "pinsrd", 37 | (g->pinsrb(dst, g->edx, 0)), false, OT_INT); 38 | GEN_latency_only(Xmm, "pinsrd->pextr", (g->pinsrb(dst, g->edx, 0));(g->pextrd(g->edx,dst,0)), false, OT_INT); 39 | GEN(Xmm, "dpps", (g->dpps(dst, src, 0xff)), false, OT_FP32); 40 | GEN(Xmm, "cvtps2dq", (g->cvtps2dq(dst, src)), false, OT_FP32); 41 | GEN_throughput_only(Xmm, "pmovmskb", (g->pmovmskb(g->edx,src)), false, OT_INT); 42 | GEN_latency_only(Xmm, "pmovmskb->movq", 43 | (g->pmovmskb(g->edx,src));(g->movq(src,g->rdx)), 44 | false, OT_INT); 45 | GEN_latency_only(Xmm, "movq->movq", 46 | (g->movq(g->rdx,src));(g->movq(src,g->rdx)), 47 | false, OT_INT); 48 | 49 | 50 | GEN_latency(Xmm, "movaps [mem]", 51 | (g->movaps(dst, g->ptr[g->rdx])), 52 | (g->movaps(dst, g->ptr[g->rdx + g->rdi])); (g->movq(g->rdi, dst)); , 53 | false, OT_FP32); 54 | 55 | GEN_latency(Xmm, "movdqu [mem+1]", 56 | (g->movdqu(dst, g->ptr[g->rdx + 1])), 57 | (g->movdqu(dst, g->ptr[g->rdx + g->rdi + 1])); (g->movq(g->rdi, dst)); , 58 | false, OT_FP32); 59 | 60 | GEN_latency(Xmm, "movdqu [mem+63] (cross cache)", 61 | (g->movdqu(dst, g->ptr[g->rdx + 63])), 62 | (g->movdqu(dst, g->ptr[g->rdx + g->rdi + 63])); (g->movq(g->rdi, dst)); , 63 | false, OT_FP32); 64 | 65 | GEN_latency(Xmm, "movdqu [mem+2MB-1] (cross page)", 66 | (g->movdqu(dst, g->ptr[g->rdx + (2048*1024-1)])), 67 | (g->movdqu(dst, g->ptr[g->rdx + g->rdi + (2048*1024-1)])); (g->movq(g->rdi, dst)); , 68 | false, OT_FP32); 69 | 70 | if (info.have_sse42) { 71 | GEN_throughput_only_rcx_clobber(Xmm, "pcmpistri", 72 | (g->pcmpistri(src,src,0)), 73 | false,OT_INT); 74 | GEN_latency_only_rcx_clobber(Xmm, "pcmpistri->movq", 75 | (g->pcmpistri(src,src,0)); 76 | 
(g->movq(src,g->rcx)); 77 | , 78 | false,OT_INT); 79 | 80 | GEN_throughput_only_rcx_clobber(Xmm, "pcmpistrm", 81 | (g->pcmpistrm(g->xmm1,g->xmm1,0)), 82 | false,OT_INT); 83 | 84 | GEN_latency_only_rcx_clobber(Xmm, "pcmpistrm", 85 | (g->pcmpistrm(g->xmm0,g->xmm0,0)), 86 | false,OT_INT); 87 | 88 | GEN_throughput_only_rcx_clobber(Xmm, "pcmpestri", 89 | (g->pcmpestri(src,src,0)), 90 | false,OT_INT); 91 | GEN_latency_only_rcx_clobber(Xmm, "pcmpestri->movq", 92 | (g->pcmpestri(src,src,0)); 93 | (g->movq(src,g->rcx)); 94 | , 95 | false,OT_INT); 96 | 97 | GEN_throughput_only_rcx_clobber(Xmm, "pcmpestrm", 98 | (g->pcmpestrm(g->xmm1,g->xmm1,0)), 99 | false,OT_INT); 100 | 101 | GEN_latency_only_rcx_clobber(Xmm, "pcmpestrm", 102 | (g->pcmpestrm(g->xmm0,g->xmm0,0)), 103 | false,OT_INT); 104 | } 105 | } -------------------------------------------------------------------------------- /test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | char data[4096]; 5 | 6 | struct G 7 | :public Xbyak::CodeGenerator 8 | { 9 | G() { 10 | vfmadd132ps(zmm0, zmm1, zmm2); 11 | } 12 | }; 13 | 14 | int main() 15 | { 16 | G g; 17 | char *p = (char*)g.getCode(); 18 | int sz = g.getSize(); 19 | FILE *fp = fopen("out.bin", "wb"); 20 | 21 | for (int i=0; iload[mem+0]: latency: CPI= 26.18, IPC= 0.04 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 2.75, IPC= 0.36 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 31.32, IPC= 0.03 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 17.00, IPC= 0.06 19 | m128: pxor: latency: CPI= 0.25, IPC= 4.00 20 | m128: pxor:throughput: CPI= 0.25, IPC= 4.00 21 | m128: padd: latency: CPI= 1.00, IPC= 1.00 22 | m128: padd:throughput: CPI= 0.33, IPC= 3.00 23 | m128: pmuldq: latency: CPI= 3.00, IPC= 0.33 24 | m128: pmuldq:throughput: CPI= 1.00, IPC= 1.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 2.00 26 | m128: loadps->movq: latency: CPI= 9.00, IPC= 0.11 27 | m128: movq->movq: latency: 
CPI= 6.03, IPC= 0.17 28 | m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.25, IPC= 4.00 30 | m128: xorps:throughput: CPI= 0.25, IPC= 4.00 31 | m128: addps: latency: CPI= 3.00, IPC= 0.33 32 | m128: addps:throughput: CPI= 0.50, IPC= 2.00 33 | m128: mulps: latency: CPI= 3.01, IPC= 0.33 34 | m128: mulps:throughput: CPI= 0.50, IPC= 2.00 35 | m128: divps: latency: CPI= 10.00, IPC= 0.10 36 | m128: divps:throughput: CPI= 3.00, IPC= 0.33 37 | m128: divpd: latency: CPI= 8.00, IPC= 0.12 38 | m128: divpd:throughput: CPI= 4.00, IPC= 0.25 39 | m128: rsqrtps: latency: CPI= 5.00, IPC= 0.20 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 5.01, IPC= 0.20 42 | m128: rcpps:throughput: CPI= 1.01, IPC= 0.99 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.50, IPC= 2.00 45 | m128: blendvps: latency: CPI= 1.00, IPC= 1.00 46 | m128: blendvps:throughput: CPI= 0.50, IPC= 2.00 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 0.50, IPC= 2.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 0.50, IPC= 2.00 51 | m128: pmullw: latency: CPI= 3.00, IPC= 0.33 52 | m128: pmullw:throughput: CPI= 1.00, IPC= 1.00 53 | m128: phaddd: latency: CPI= 2.00, IPC= 0.50 54 | m128: phaddd:throughput: CPI= 2.00, IPC= 0.50 55 | m128: haddps: latency: CPI= 2.00, IPC= 0.50 56 | m128: haddps:throughput: CPI= 2.00, IPC= 0.50 57 | m128: pinsrd: latency: CPI= 1.70, IPC= 0.59 58 | m128: pinsrd:throughput: CPI= 1.31, IPC= 0.76 59 | m128: pinsrd->pextr: latency: CPI= 8.00, IPC= 0.12 60 | m128: dpps: latency: CPI= 15.00, IPC= 0.07 61 | m128: dpps:throughput: CPI= 4.00, IPC= 0.25 62 | m128: cvtps2dq: latency: CPI= 4.00, IPC= 0.25 63 | m128: cvtps2dq:throughput: CPI= 1.00, IPC= 1.00 64 | m128: pmovmskb:throughput: CPI= 1.00, IPC= 1.00 65 | m128: pmovmskb->movq: latency: CPI= 6.00, IPC= 0.17 66 | m128: movq->movq: latency: CPI= 6.01, IPC= 0.17 67 | 
m128: movaps [mem]: latency: CPI= 9.00, IPC= 0.11 68 | m128: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 69 | m128: movdqu [mem+1]: latency: CPI= 10.00, IPC= 0.10 70 | m128: movdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 11.01, IPC= 0.09 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 11.03, IPC= 0.09 74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 1.00, IPC= 1.00 75 | m128: pcmpistri:throughput: CPI= 2.00, IPC= 0.50 76 | m128: pcmpistri->movq: latency: CPI= 11.00, IPC= 0.09 77 | m128: pcmpistrm:throughput: CPI= 2.00, IPC= 0.50 78 | m128: pcmpistrm: latency: CPI= 7.00, IPC= 0.14 79 | m128: pcmpestri:throughput: CPI= 3.00, IPC= 0.33 80 | m128: pcmpestri->movq: latency: CPI= 11.00, IPC= 0.09 81 | m128: pcmpestrm:throughput: CPI= 3.01, IPC= 0.33 82 | m128: pcmpestrm: latency: CPI= 8.50, IPC= 0.12 83 | m256: movaps [mem]: latency: CPI= 1.00, IPC= 1.00 84 | m256: movaps [mem]:throughput: CPI= 1.01, IPC= 0.99 85 | m256: vmovdqu [mem+1]: latency: CPI= 1.50, IPC= 0.67 86 | m256: vmovdqu [mem+1]:throughput: CPI= 1.50, IPC= 0.67 87 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.50, IPC= 0.67 88 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.50, IPC= 0.67 89 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 1.50, IPC= 0.67 90 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 1.50, IPC= 0.67 91 | m256: vxorps: latency: CPI= 0.50, IPC= 2.00 92 | m256: vxorps:throughput: CPI= 0.50, IPC= 2.00 93 | m256: vmulps: latency: CPI= 3.00, IPC= 0.33 94 | m256: vmulps:throughput: CPI= 1.01, IPC= 0.99 95 | m256: vaddps: latency: CPI= 3.00, IPC= 0.33 96 | m256: vaddps:throughput: CPI= 1.00, IPC= 1.00 97 | m256: vdivps: latency: CPI= 10.00, IPC= 0.10 98 | m256: vdivps:throughput: CPI= 6.00, IPC= 0.17 99 | m256: vdivpd: latency: CPI= 8.00, IPC= 0.12 100 | m256: vdivpd:throughput: CPI= 8.01, IPC= 0.12 101 | 
m256: vrsqrtps: latency: CPI= 5.00, IPC= 0.20 102 | m256: vrsqrtps:throughput: CPI= 2.00, IPC= 0.50 103 | m256: vrcpps: latency: CPI= 5.00, IPC= 0.20 104 | m256: vrcpps:throughput: CPI= 2.00, IPC= 0.50 105 | m256: vsqrtps: latency: CPI= 8.00, IPC= 0.12 106 | m256: vsqrtps:throughput: CPI= 8.00, IPC= 0.12 107 | m256: vperm2f128: latency: CPI= 3.00, IPC= 0.33 108 | m256: vperm2f128:throughput: CPI= 3.21, IPC= 0.31 109 | m256: vpxor: latency: CPI= 0.50, IPC= 2.00 110 | m256: vpxor:throughput: CPI= 0.50, IPC= 2.00 111 | m256: vpaddd: latency: CPI= 1.00, IPC= 1.00 112 | m256: vpaddd:throughput: CPI= 0.67, IPC= 1.50 113 | m256: vpermps: latency: CPI= 5.00, IPC= 0.20 114 | m256: vpermps:throughput: CPI= 4.00, IPC= 0.25 115 | m256: vpermpd: latency: CPI= 2.01, IPC= 0.50 116 | m256: vpermpd:throughput: CPI= 2.00, IPC= 0.50 117 | m256: vpblendvb: latency: CPI= 2.00, IPC= 0.50 118 | m256: vpblendvb:throughput: CPI= 2.00, IPC= 0.50 119 | m256: vpmovmskb:throughput: CPI= 1.00, IPC= 1.00 120 | m256: vpmovsxwd: latency: CPI= 2.00, IPC= 0.50 121 | m256: vpmovsxwd:throughput: CPI= 2.00, IPC= 0.50 122 | m256: vpgatherdd: latency: CPI= 21.31, IPC= 0.05 123 | m256: vpgatherdd:throughput: CPI= 20.00, IPC= 0.05 124 | m256: gather32(x8 + perm): latency: CPI= 17.39, IPC= 0.06 125 | m256: gather32(x8 + perm):throughput: CPI= 5.03, IPC= 0.20 126 | m256: vgatherdpd: latency: CPI= 18.14, IPC= 0.06 127 | m256: vgatherdpd:throughput: CPI= 12.00, IPC= 0.08 128 | m256: gather64(x4 + perm): latency: CPI= 13.02, IPC= 0.08 129 | m256: gather64(x4 + perm):throughput: CPI= 3.03, IPC= 0.33 130 | m256: vpshufb: latency: CPI= 1.00, IPC= 1.00 131 | m256: vpshufb:throughput: CPI= 1.00, IPC= 1.00 132 | m256: vfmaps: latency: CPI= 5.00, IPC= 0.20 133 | m256: vfmaps:throughput: CPI= 1.03, IPC= 0.97 134 | m256: vfmapd: latency: CPI= 5.00, IPC= 0.20 135 | m256: vfmapd:throughput: CPI= 1.00, IPC= 1.00 136 | m128: vfmaps: latency: CPI= 5.00, IPC= 0.20 137 | m128: vfmaps:throughput: CPI= 0.50, IPC= 2.00 138 | 
m128: vfmapd: latency: CPI= 5.00, IPC= 0.20 139 | m128: vfmapd:throughput: CPI= 0.50, IPC= 2.00 140 | reg64: popcnt: latency: CPI= 1.00, IPC= 1.00 141 | reg64: popcnt:throughput: CPI= 0.26, IPC= 3.88 142 | m128: aesenc: latency: CPI= 4.00, IPC= 0.25 143 | m128: aesenc:throughput: CPI= 0.50, IPC= 2.00 144 | m128: aesenclast: latency: CPI= 4.00, IPC= 0.25 145 | m128: aesenclast:throughput: CPI= 0.50, IPC= 2.00 146 | m128: aesdec: latency: CPI= 4.00, IPC= 0.25 147 | m128: aesdec:throughput: CPI= 0.51, IPC= 1.95 148 | m128: aesdeclast: latency: CPI= 4.00, IPC= 0.25 149 | m128: aesdeclast:throughput: CPI= 0.50, IPC= 2.00 150 | m128: pclmulqdq: latency: CPI= 4.50, IPC= 0.22 151 | m128: pclmulqdq:throughput: CPI= 2.00, IPC= 0.50 152 | -------------------------------------------------------------------------------- /znver2.log: -------------------------------------------------------------------------------- 1 | AMD Ryzen 7 3700X 8-Core Processor 2 | == latency/throughput == 3 | reg64: add: latency: CPI= 1.00, IPC= 1.00 4 | reg64: add:throughput: CPI= 0.26, IPC= 3.86 5 | reg64: lea: latency: CPI= 1.00, IPC= 1.00 6 | reg64: lea:throughput: CPI= 0.26, IPC= 3.86 7 | reg64: xor dst,dst: latency: CPI= 0.26, IPC= 3.88 8 | reg64: xor dst,dst:throughput: CPI= 0.26, IPC= 3.88 9 | reg64: xor: latency: CPI= 0.26, IPC= 3.88 10 | reg64: xor:throughput: CPI= 0.26, IPC= 3.88 11 | reg64: load: latency: CPI= 4.00, IPC= 0.25 12 | reg64: load:throughput: CPI= 0.63, IPC= 1.60 13 | reg64: crc32: latency: CPI= 3.00, IPC= 0.33 14 | reg64: crc32:throughput: CPI= 1.00, IPC= 1.00 15 | reg64: store [mem+0]->load[mem+0]: latency: CPI= 37.56, IPC= 0.03 16 | reg64: store [mem+0]->load[mem+0]:throughput: CPI= 3.91, IPC= 0.26 17 | reg64: store [mem+0]->load[mem+1]: latency: CPI= 37.38, IPC= 0.03 18 | reg64: store [mem+0]->load[mem+1]:throughput: CPI= 14.00, IPC= 0.07 19 | m128: pxor: latency: CPI= 0.25, IPC= 4.00 20 | m128: pxor:throughput: CPI= 0.25, IPC= 4.00 21 | m128: padd: latency: CPI= 1.00, IPC= 
1.00 22 | m128: padd:throughput: CPI= 0.33, IPC= 3.00 23 | m128: pmuldq: latency: CPI= 3.00, IPC= 0.33 24 | m128: pmuldq:throughput: CPI= 1.00, IPC= 1.00 25 | m128: loadps:throughput: CPI= 0.50, IPC= 2.00 26 | m128: loadps->movq: latency: CPI= 9.00, IPC= 0.11 27 | m128: movq->movq: latency: CPI= 6.00, IPC= 0.17 28 | m128: movq->movq:throughput: CPI= 1.00, IPC= 1.00 29 | m128: xorps: latency: CPI= 0.25, IPC= 4.00 30 | m128: xorps:throughput: CPI= 0.25, IPC= 4.00 31 | m128: addps: latency: CPI= 3.00, IPC= 0.33 32 | m128: addps:throughput: CPI= 0.50, IPC= 2.00 33 | m128: mulps: latency: CPI= 3.00, IPC= 0.33 34 | m128: mulps:throughput: CPI= 0.50, IPC= 2.00 35 | m128: divps: latency: CPI= 10.00, IPC= 0.10 36 | m128: divps:throughput: CPI= 3.50, IPC= 0.29 37 | m128: divpd: latency: CPI= 13.00, IPC= 0.08 38 | m128: divpd:throughput: CPI= 5.00, IPC= 0.20 39 | m128: rsqrtps: latency: CPI= 5.00, IPC= 0.20 40 | m128: rsqrtps:throughput: CPI= 1.00, IPC= 1.00 41 | m128: rcpps: latency: CPI= 5.00, IPC= 0.20 42 | m128: rcpps:throughput: CPI= 1.00, IPC= 1.00 43 | m128: blendps: latency: CPI= 1.00, IPC= 1.00 44 | m128: blendps:throughput: CPI= 0.33, IPC= 3.00 45 | m128: blendvps: latency: CPI= 1.00, IPC= 1.00 46 | m128: blendvps:throughput: CPI= 0.50, IPC= 2.00 47 | m128: pshufb: latency: CPI= 1.00, IPC= 1.00 48 | m128: pshufb:throughput: CPI= 0.50, IPC= 2.00 49 | m128: shufps: latency: CPI= 1.00, IPC= 1.00 50 | m128: shufps:throughput: CPI= 0.50, IPC= 2.00 51 | m128: pmullw: latency: CPI= 3.00, IPC= 0.33 52 | m128: pmullw:throughput: CPI= 1.00, IPC= 1.00 53 | m128: phaddd: latency: CPI= 2.00, IPC= 0.50 54 | m128: phaddd:throughput: CPI= 2.00, IPC= 0.50 55 | m128: haddps: latency: CPI= 2.00, IPC= 0.50 56 | m128: haddps:throughput: CPI= 2.00, IPC= 0.50 57 | m128: pinsrd: latency: CPI= 1.79, IPC= 0.56 58 | m128: pinsrd:throughput: CPI= 1.28, IPC= 0.78 59 | m128: pinsrd->pextr: latency: CPI= 8.00, IPC= 0.12 60 | m128: dpps: latency: CPI= 15.00, IPC= 0.07 61 | m128: dpps:throughput: 
CPI= 4.00, IPC= 0.25 62 | m128: cvtps2dq: latency: CPI= 3.00, IPC= 0.33 63 | m128: cvtps2dq:throughput: CPI= 1.00, IPC= 1.00 64 | m128: pmovmskb:throughput: CPI= 1.00, IPC= 1.00 65 | m128: pmovmskb->movq: latency: CPI= 6.00, IPC= 0.17 66 | m128: movq->movq: latency: CPI= 6.00, IPC= 0.17 67 | m128: movaps [mem]: latency: CPI= 9.00, IPC= 0.11 68 | m128: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 69 | m128: movdqu [mem+1]: latency: CPI= 10.00, IPC= 0.10 70 | m128: movdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 71 | m128: movdqu [mem+63] (cross cache): latency: CPI= 11.00, IPC= 0.09 72 | m128: movdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 73 | m128: movdqu [mem+2MB-1] (cross page): latency: CPI= 11.00, IPC= 0.09 74 | m128: movdqu [mem+2MB-1] (cross page):throughput: CPI= 1.00, IPC= 1.00 75 | m128: pcmpistri:throughput: CPI= 2.00, IPC= 0.50 76 | m128: pcmpistri->movq: latency: CPI= 11.00, IPC= 0.09 77 | m128: pcmpistrm:throughput: CPI= 2.00, IPC= 0.50 78 | m128: pcmpistrm: latency: CPI= 7.00, IPC= 0.14 79 | m128: pcmpestri:throughput: CPI= 3.00, IPC= 0.33 80 | m128: pcmpestri->movq: latency: CPI= 11.00, IPC= 0.09 81 | m128: pcmpestrm:throughput: CPI= 3.00, IPC= 0.33 82 | m128: pcmpestrm: latency: CPI= 7.33, IPC= 0.14 83 | m256: movaps [mem]: latency: CPI= 1.00, IPC= 1.00 84 | m256: movaps [mem]:throughput: CPI= 0.50, IPC= 2.00 85 | m256: vmovdqu [mem+1]: latency: CPI= 1.00, IPC= 1.00 86 | m256: vmovdqu [mem+1]:throughput: CPI= 0.50, IPC= 2.00 87 | m256: vmovdqu [mem+63] (cross cache): latency: CPI= 1.00, IPC= 1.00 88 | m256: vmovdqu [mem+63] (cross cache):throughput: CPI= 1.00, IPC= 1.00 89 | m256: vmovdqu [mem+2MB-1] (cross page): latency: CPI= 1.00, IPC= 1.00 90 | m256: vmovdqu [mem+2MB-1] (cross page):throughput: CPI= 1.00, IPC= 1.00 91 | m256: vxorps: latency: CPI= 0.25, IPC= 4.00 92 | m256: vxorps:throughput: CPI= 0.25, IPC= 4.00 93 | m256: vmulps: latency: CPI= 3.00, IPC= 0.33 94 | m256: vmulps:throughput: CPI= 0.50, IPC= 2.00 95 | m256: 
vaddps: latency: CPI= 3.00, IPC= 0.33 96 | m256: vaddps:throughput: CPI= 0.50, IPC= 2.00 97 | m256: vdivps: latency: CPI= 10.00, IPC= 0.10 98 | m256: vdivps:throughput: CPI= 3.50, IPC= 0.29 99 | m256: vdivpd: latency: CPI= 13.00, IPC= 0.08 100 | m256: vdivpd:throughput: CPI= 5.00, IPC= 0.20 101 | m256: vrsqrtps: latency: CPI= 5.00, IPC= 0.20 102 | m256: vrsqrtps:throughput: CPI= 1.00, IPC= 1.00 103 | m256: vrcpps: latency: CPI= 5.00, IPC= 0.20 104 | m256: vrcpps:throughput: CPI= 1.00, IPC= 1.00 105 | m256: vsqrtps: latency: CPI= 14.00, IPC= 0.07 106 | m256: vsqrtps:throughput: CPI= 5.50, IPC= 0.18 107 | m256: vperm2f128: latency: CPI= 3.00, IPC= 0.33 108 | m256: vperm2f128:throughput: CPI= 1.00, IPC= 1.00 109 | m256: vpxor: latency: CPI= 0.25, IPC= 4.00 110 | m256: vpxor:throughput: CPI= 0.25, IPC= 4.00 111 | m256: vpaddd: latency: CPI= 1.00, IPC= 1.00 112 | m256: vpaddd:throughput: CPI= 0.33, IPC= 3.00 113 | m256: vpermps: latency: CPI= 8.00, IPC= 0.12 114 | m256: vpermps:throughput: CPI= 2.00, IPC= 0.50 115 | m256: vpermpd: latency: CPI= 6.00, IPC= 0.17 116 | m256: vpermpd:throughput: CPI= 1.28, IPC= 0.78 117 | m256: vpblendvb: latency: CPI= 1.00, IPC= 1.00 118 | m256: vpblendvb:throughput: CPI= 1.00, IPC= 1.00 119 | m256: vpmovmskb:throughput: CPI= 1.00, IPC= 1.00 120 | m256: vpmovsxwd: latency: CPI= 4.00, IPC= 0.25 121 | m256: vpmovsxwd:throughput: CPI= 1.14, IPC= 0.88 122 | m256: vpgatherdd: latency: CPI= 24.00, IPC= 0.04 123 | m256: vpgatherdd:throughput: CPI= 16.00, IPC= 0.06 124 | m256: gather32(x8 + perm): latency: CPI= 18.75, IPC= 0.05 125 | m256: gather32(x8 + perm):throughput: CPI= 4.00, IPC= 0.25 126 | m256: vgatherdpd: latency: CPI= 19.00, IPC= 0.05 127 | m256: vgatherdpd:throughput: CPI= 9.00, IPC= 0.11 128 | m256: gather64(x4 + perm): latency: CPI= 14.86, IPC= 0.07 129 | m256: gather64(x4 + perm):throughput: CPI= 2.00, IPC= 0.50 130 | m256: vpshufb: latency: CPI= 1.00, IPC= 1.00 131 | m256: vpshufb:throughput: CPI= 0.50, IPC= 2.00 132 | m256: 
vfmaps: latency: CPI= 5.00, IPC= 0.20 133 | m256: vfmaps:throughput: CPI= 0.50, IPC= 2.00 134 | m256: vfmapd: latency: CPI= 5.00, IPC= 0.20 135 | m256: vfmapd:throughput: CPI= 0.50, IPC= 2.00 136 | m128: vfmaps: latency: CPI= 5.00, IPC= 0.20 137 | m128: vfmaps:throughput: CPI= 0.50, IPC= 2.00 138 | m128: vfmapd: latency: CPI= 5.00, IPC= 0.20 139 | m128: vfmapd:throughput: CPI= 0.50, IPC= 2.00 140 | reg64: popcnt: latency: CPI= 1.00, IPC= 1.00 141 | reg64: popcnt:throughput: CPI= 0.26, IPC= 3.86 142 | m128: aesenc: latency: CPI= 4.00, IPC= 0.25 143 | m128: aesenc:throughput: CPI= 0.50, IPC= 2.00 144 | m128: aesenclast: latency: CPI= 4.00, IPC= 0.25 145 | m128: aesenclast:throughput: CPI= 0.50, IPC= 2.00 146 | m128: aesdec: latency: CPI= 4.00, IPC= 0.25 147 | m128: aesdec:throughput: CPI= 0.50, IPC= 2.00 148 | m128: aesdeclast: latency: CPI= 4.00, IPC= 0.25 149 | m128: aesdeclast:throughput: CPI= 0.50, IPC= 2.00 150 | m128: pclmulqdq: latency: CPI= 4.58, IPC= 0.22 151 | m128: pclmulqdq:throughput: CPI= 2.00, IPC= 0.50 152 | --------------------------------------------------------------------------------