├── Makefile ├── README.md ├── create_benchmarks.py ├── ibench.c ├── include_ARMGCC.mk ├── include_GCC.mk ├── include_ICC.mk ├── include_MIC.mk ├── include_POWER8.mk └── src ├── AVX-512 ├── vaddpd-avx512-TP.S ├── vaddpd-avx512.S ├── vaddps-avx512-TP.S ├── vaddps-avx512.S ├── vdivpd-avx512-TP.S ├── vdivpd-avx512.S ├── vdivps-avx512-TP.S ├── vdivps-avx512.S ├── vfmadd213pd-avx512-TP.S ├── vfmadd213pd-avx512.S ├── vfmadd213ps-avx512-TP.S ├── vfmadd213ps-avx512.S ├── vfmadd213sd-avx512-alt1.S ├── vfmadd213sd-avx512.S ├── vmulpd-avx512-TP.S ├── vmulpd-avx512.S ├── vmulps-avx512-TP.S ├── vmulps-avx512.S ├── vmulsd-avx512.S ├── vrcp14pd-avx512-TP.S ├── vrcp14pd-avx512.S ├── vrcp14ps-avx512-TP.S ├── vrcp14ps-avx512.S ├── vrsqrt14pd-avx512-TP.S ├── vrsqrt14pd-avx512.S ├── vrsqrt14ps-avx512-TP.S ├── vrsqrt14ps-avx512.S ├── vsqrtpd-avx512-TP.S ├── vsqrtpd-avx512.S ├── vsqrtps-avx512-TP.S └── vsqrtps-avx512.S ├── AVX ├── vaddpd-avx-TP.S ├── vaddpd-avx.S ├── vaddps-avx-TP.S ├── vaddps-avx.S ├── vdivpd-avx-TP.S ├── vdivpd-avx.S ├── vdivps-avx-TP.S ├── vdivps-avx.S ├── vmovapd-load-avx-TP.S ├── vmovapd-store-avx-TP.S ├── vmovupd-load-avx-TP.S ├── vmovupd-store-avx-TP.S ├── vmulpd-avx-TP.S ├── vmulpd-avx.S ├── vmulps-avx-TP.S ├── vmulps-avx.S ├── vrcpps-avx-TP.S ├── vrcpps-avx.S ├── vsqrtpd-avx-TP.S ├── vsqrtpd-avx.S ├── vsqrtps-avx-TP.S └── vsqrtps-avx.S ├── FMA ├── vfmadd213pd-avx-TP.S ├── vfmadd213pd-avx.S ├── vfmadd213pd-avx512.S ├── vfmadd213pd-sse-TP.S ├── vfmadd213pd-sse.S ├── vfmadd213ps-avx-TP.S ├── vfmadd213ps-avx.S ├── vfmadd213ps-avx512.S ├── vfmadd213ps-sse-TP.S ├── vfmadd213ps-sse.S ├── vfmadd213sd-TP.S ├── vfmadd213sd.S ├── vfmadd213ss-TP.S └── vfmadd213ss.S ├── NEON ├── adc-x_x_x-LAT.S ├── adc-x_x_x-TP.S ├── add-x_x_i-LAT.S ├── add-x_x_i-TP.S ├── add-x_x_x-LAT.S ├── add-x_x_x-TP.S ├── add-x_x_x-il_1_2-adds-x_x_x-TP.S ├── addp-vd_vd_vd-LAT.S ├── addp-vd_vd_vd-TP.S ├── adds-x_x_i-LAT.S ├── adds-x_x_i-TP.S ├── cmp-w_w-TP.S ├── cmp-x_x-TP.S ├── dup-d_vd-LAT.S ├── 
dup-d_vd-TP.S ├── dup-d_vd-il_1_2-fadd-vd_vd_vd-TP.S ├── dup-d_vd-il_1_2-fadd_vd_vd_vd-TP.S ├── dup-vd_x-TP.S ├── fadd-LAT_order1.S ├── fadd-LAT_order2.S ├── fadd-d_d_d-LAT.S ├── fadd-d_d_d-TP.S ├── fadd-vd_vd_vd-LAT.S ├── fadd-vd_vd_vd-TP.S ├── fadd-vd_vd_vd-il_1_2-add-x_x_x-TP.S ├── fadd-vd_vd_vd-il_1_2-sub-x_x_i-TP.S ├── fadd-vd_vd_vd-il_1_3-add-x_x_x-TP.S ├── fadd-vd_vd_vd-il_2_1-mul-x_x_x-TP.S ├── fadd-vs_vs_vs-LAT.S ├── fadd-vs_vs_vs-TP.S ├── fadd-vs_vs_vs-il_1_2-add-x_x_x-TP.S ├── fdiv-d_d_d-LAT.S ├── fdiv-d_d_d-TP.S ├── fdiv-vd_vd_vd-LAT.S ├── fdiv-vd_vd_vd-TP.S ├── fdiv-vd_vd_vd-il_1_1-dup-d_vd-TP.S ├── fdiv-vd_vd_vd-il_1_2-fadd-vd_vd_vd-TP.S ├── fdiv-vs_vs_vs-LAT.S ├── fdiv-vs_vs_vs-TP.S ├── fdiv-vs_vs_vs-il_1_32-dup-d_vd-TP.S ├── fdiv-vs_vs_vs-il_1_60-fadd-vs_vs_vs-TP.S ├── fmadd-d_d_d_d-LAT.S ├── fmadd-d_d_d_d-TP.S ├── fmadd-s_s_s_s-LAT.S ├── fmadd-s_s_s_s-TP.S ├── fmla-vd_vd_vd-LAT.S ├── fmla-vd_vd_vd-TP.S ├── fmla-vs_vs_vs-LAT.S ├── fmla-vs_vs_vs-TP.S ├── fmov-d_x-TP.S ├── fmov-s_i-TP.S ├── fmov-x_d-TP.S ├── fmul-d_d_d-LAT.S ├── fmul-d_d_d-TP.S ├── fmul-vd_vd_vd-LAT.S ├── fmul-vd_vd_vd-TP.S ├── fmul-vs_vs_vs-LAT.S ├── fmul-vs_vs_vs-TP.S ├── fneg-vd_vd_vd-LAT.S ├── fneg-vd_vd_vd-TP.S ├── fneg-vs_vs_vs-LAT.S ├── fneg-vs_vs_vs-TP.S ├── frecpe-vd_vd-LAT.S ├── frecpe-vd_vd-TP.S ├── frecpe-vs_vs-LAT.S ├── frecpe-vs_vs-TP.S ├── fsqrt-vd_vd-LAT.S ├── fsqrt-vd_vd-TP.S ├── fsqrt-vs_vs-LAT.S ├── fsqrt-vs_vs-TP.S ├── fsub-vd_vd_vd-LAT.S ├── fsub-vd_vd_vd-TP.S ├── fsub-vs_vs_vs-LAT.S ├── fsub-vs_vs_vs-TP.S ├── ldp-d_d_mbo-TP.S ├── ldp-d_d_mbo-il_1_1-ldr-x_mb-TP.S ├── ldp-q_q_mbo-TP.S ├── ldp-q_q_mbp-LAT.S ├── ldp-q_q_mbp-TP.S ├── ldp-x_x_mbo-TP.S ├── ldr-q_mb-TP.S ├── ldr-q_mb-il_1_1-str-q_mb-TP.S ├── ldr-q_mb-il_1_1_1-str-q_mb-add-x_x_x-TP.S ├── ldr-q_mb-il_2_1-str-q_mb-TP.S ├── ldr-q_mbi-TP.S ├── ldr-q_mbp-LAT.S ├── ldr-q_mbp-TP.S ├── ldr-x_mb-TP.S ├── ldr-x_mb-il_1_1-str-x_mb-TP.S ├── ldr-x_mb-il_2_1-str-x_mb-TP.S ├── mla-vd_vd_vd-LAT.S ├── mla-vd_vd_vd-TP.S 
├── mla-vs_vs_vs-LAT.S ├── mla-vs_vs_vs-TP.S ├── mov-vb_vb-LAT.S ├── mov-vb_vb-TP.S ├── mov-x_x-LAT.S ├── mov-x_x-TP.S ├── mul-v2s_v2s_v2s-LAT.S ├── mul-v2s_v2s_v2s-TP.S ├── mul-v4s_v4s_v4s-LAT.S ├── mul-v4s_v4s_v4s-TP.S ├── mul-v8h_v8h_v8h-LAT.S ├── mul-v8h_v8h_v8h-TP.S ├── mul-w_w_w-LAT.S ├── mul-w_w_w-TP.S ├── mul-x_x_x-LAT.S ├── mul-x_x_x-TP.S ├── neg-d_d-LAT.S ├── neg-d_d-TP.S ├── neg-vd_vd-LAT.S ├── neg-vd_vd-TP.S ├── sbc-x_x_x-LAT.S ├── sbc-x_x_x-TP.S ├── scvtf-d_x-TP.S ├── sdiv-w_w_w-LAT.S ├── sdiv-w_w_w-TP.S ├── sdiv-x_x_x-LAT.S ├── sdiv-x_x_x-TP.S ├── sdiv-x_x_x-il_1_1-add-x_x_x-TP.S ├── sdiv-x_x_x-il_1_1-adds-x_x_x-TP.S ├── sdiv-x_x_x-il_1_1-mul-x_x_x-TP.S ├── sdiv-x_x_x-il_1_1-scvtf-d_x-TP.S ├── sdiv-x_x_x-il_1_10-adds-x_x_x-TP.S ├── sdiv-x_x_x-il_1_11-add-x_x_x-TP.S ├── sdiv-x_x_x-il_1_11-adds-x_x_x-TP.S ├── sdiv-x_x_x-il_1_15-add-x_x_x-TP.S ├── sdiv-x_x_x-il_1_2-scvtf-d_x-TP.S ├── sdiv-x_x_x-il_1_3-mul-x_x_x-TP.S ├── sdiv-x_x_x-il_1_4-mul_x_x_x-TP.S ├── sdiv-x_x_x-il_1_5-mul_x_x_x-TP.S ├── sdiv-x_x_x-il_1_5-scvtf-d_x-TP.S ├── sdiv-x_x_x-il_1_6-scvtf-d_x-TP.S ├── smaddl-x_w_w_x-LAT.S ├── smaddl-x_w_w_x-TP.S ├── smull-x_w_w-LAT.S ├── smull-x_w_w-TP.S ├── sshll-vd_vs_i-LAT.S ├── sshll-vd_vs_i-TP.S ├── stp-d_d_mbo-TP.S ├── stp-d_d_mbo-il_1_1-ldr-d_mb-TP.S ├── stp-q_q_mb-TP.S ├── stp-q_q_mbo-TP.S ├── stp-q_q_mbo-il_1_1-ldp-q_q_mbo-TP.S ├── stp-q_q_mbo-il_1_1-ldr-q_mbo-TP.S ├── stp-q_q_mbo-il_1_2-ldp-q_q_mbo-TP.S ├── stp-q_q_mbo-il_1_2-ldr-q_mbo-TP.S ├── stp-x_x_mbo-TP.S ├── stp-x_x_mbo-il_1_1-ldr-x_mbo-TP.S ├── stp-x_x_mbo-il_1_2-ldp-q_q_mbo-TP.S ├── stp-x_x_mbo-il_1_2-ldp-x_x_mbo-TP.S ├── stp-x_x_mbo-il_1_2-ldr-x_mbo-TP.S ├── str-d_mb-TP.S ├── str-d_mbp-TP.S ├── str-q_mb-TP.S ├── str-q_mb-il_1_1-mul-x_x_x-TP.S ├── str-q_mbi-TP.S ├── str-q_mbi-il_1_1-mul-x_x_x-TP.S ├── str-q_mbi-il_1_1_1-mul-x_x_x-add-x_x_x-TP.S ├── str-q_mbp-TP.S ├── str-q_mbp-il_1_1-mul-x_x_x-TP.S ├── str-q_mbp-il_1_1_1-mul-x_x_x-add-x_x_x-TP.S ├── str-x_mb-TP.S ├── 
str-x_mb-il_1_1-mul-x_x_x-TP.S ├── str-x_mb-il_1_1_1-mul-x_x_x_x-add-x_x_x-TP.S ├── str-x_mb-il_2_1-ldr-x_mb-TP.S ├── str-x_mbi-TP.S ├── str-x_mbi-il_1_1-mul-x_x_x-TP.S ├── str-x_mbp-TP.S ├── str-x_mbp-il_1_1-mul-x_x_x-TP.S ├── str-x_mbp-il_1_1_1-mul-x_x_x_x-add-x_x_x-TP.S ├── stur-d_mb-TP.S ├── stur-q_mb-TP.S ├── stur-q_mb-il_1_1-ldr-x_mb-TP.S ├── sub-w_w_i-LAT.S ├── sub-w_w_i-TP.S ├── subs-x_x_i-LAT.S ├── subs-x_x_i-TP.S └── ubfiz-x_x_i_i-TP.S ├── SSE4.2 ├── vaddpd-sse-TP.S ├── vaddpd-sse.S ├── vaddps-sse-TP.S ├── vaddps-sse.S ├── vdivpd-sse-TP.S ├── vdivpd-sse.S ├── vdivps-sse-TP.S ├── vdivps-sse.S ├── vmovapd-load-sse-TP.S ├── vmovapd-store-sse-TP.S ├── vmovupd-load-sse-TP.S ├── vmovupd-store-sse-TP.S ├── vmulpd-sse-TP.S ├── vmulpd-sse.S ├── vmulps-sse-TP.S ├── vmulps-sse.S ├── vrcpps-sse-TP.S ├── vrcpps-sse.S ├── vsqrtpd-sse-TP.S ├── vsqrtpd-sse.S ├── vsqrtps-sse-TP.S └── vsqrtps-sse.S ├── SVE ├── addpl-x_x_i-LAT.S ├── addpl-x_x_i-TP.S ├── copy-u2-TP.S ├── copy-u2-TP2.S ├── fadd-zd_zd_zd-LAT.S ├── fadd-zd_zd_zd-TP.S ├── fadda-d_p_zd-LAT.S ├── fadda-d_p_zd-TP.S ├── faddv-d_p_zd-LAT.S ├── faddv-d_p_zd-TP.S ├── faddv-d_p_zd-il_1_1-dup-d_vd-TP.S ├── faddv-d_p_zd-il_1_1-fadd-zd_zd_zd-TP.S ├── fmad-zd_p_zd_zd-LAT.S ├── fmad-zd_p_zd_zd-TP.S ├── fmla-zd_p_zd_zd-LAT.S ├── fmla-zd_p_zd_zd-TP.S ├── fmul-zd_zd_zd-LAT.S ├── fmul-zd_zd_zd-TP.S ├── incd-x-TP.S ├── ld1d-z_p_mb-TP.S ├── ld1d-zd_p_mb-TP.S ├── ld1d-zd_p_mbi-TP.S ├── st1d-zd_p_mb-TP.S ├── st1d-zd_p_mb-il_1_1-dup-d_vd-TP.S ├── st1d-zd_p_mb-il_1_1-str-x_mb-TP.S ├── st1d-zd_p_mb-il_1_2-dup-d_vd-TP.S ├── st1d-zd_p_mb-il_1_3-ld1d-z_p_mb-TP.S ├── st1d-zd_p_mbi-TP.S ├── update-u2-TP.S ├── whilelo-pd_x_x-TP.S └── whilelo-pd_x_x-il_1_1-mul-x_x_x-TP.S ├── VSX ├── xvadddp.S ├── xvaddsp.S ├── xvdivdp.S ├── xvdivsp.S ├── xvmaddadp.S ├── xvmuldp.S └── xvmulsp.S └── scalar ├── addsd.S ├── addss.S ├── movsdx2-LAT.S ├── mulsd.S ├── mulss.S ├── rcpss-TP.S ├── rcpss.S ├── sqrtsd-TP.S ├── sqrtsd.S ├── sqrtss-TP.S ├── sqrtss.S ├── 
vaddsd-TP.S ├── vaddsd.S ├── vaddss-TP.S ├── vaddss.S ├── vdivsd-TP.S ├── vdivsd.S ├── vdivss-TP.S ├── vdivss.S ├── vmulsd-TP.S ├── vmulsd.S ├── vmulss-TP.S └── vmulss.S /Makefile: -------------------------------------------------------------------------------- 1 | # Possible values for COMPILER (selects include_<COMPILER>.mk): GCC, ICC, MIC, POWER8, ARMGCC 2 | COMPILER=ICC 3 | # KDIRS holds the names of the entries below $(SRC_DIR); directories of the same names receive the built .so kernels 4 | TARGET = ibench 5 | SRC_DIR = src 6 | KDIRS += $(patsubst $(SRC_DIR)/%, %, $(wildcard $(SRC_DIR)/*)) 7 | Q = @ 8 | 9 | include include_$(COMPILER).mk 10 | # Build the driver; the kernel directories and shared objects are prerequisites of it 11 | $(TARGET): ibench.c $(KDIRS) $(KERNELS) 12 | $(Q)echo "===> COMPILING $@" 13 | $(Q)$(CC) $(CFLAGS) $< -o $@ -ldl 14 | # Create only the directory being built; -p keeps the rule from failing when it already exists 15 | $(KDIRS): 16 | $(Q)mkdir -p $@ 17 | # Assemble $(SRC_DIR)/<dir>/<kernel>.S into the shared object <dir>/<kernel>.so 18 | %.so: 19 | $(Q)echo "===> ASSEMBLING $@" 20 | $(Q)$(AS) $(LFLAGS) $(patsubst %.so, $(SRC_DIR)/%.S, $@) -o $@ 21 | 22 | .PHONY: clean 23 | 24 | clean: 25 | $(Q)echo "===> CLEAN" 26 | $(Q)rm -rf $(KDIRS) 27 | $(Q)rm -f $(TARGET) 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ibench 2 | Measure instruction latency and throughput 3 | -------------------------------------------------------------------------------- /include_ARMGCC.mk: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | AS = gcc 3 | CFLAGS = -O3 4 | # -msve-vector-bits=512 -march=armv8.2-a+sve 5 | LFLAGS = -shared 6 | 7 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/NEON/*.S)) 8 | #KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SVE/*.S)) 9 | -------------------------------------------------------------------------------- /include_GCC.mk: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | AS = gcc 3 | CFLAGS = -O3 4 | LFLAGS = -shared -x assembler-with-cpp 5 | # -x assembler-with-cpp is passed to the kernel rule only: in CFLAGS it would make gcc treat ibench.c as assembly 6 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SSE4.2/*.S)) 7 | KERNELS += $(patsubst $(SRC_DIR)/%.S, 
%.so, $(wildcard $(SRC_DIR)/AVX/*.S)) 8 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX2/*.S)) 9 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX-512/*.S)) 10 | -------------------------------------------------------------------------------- /include_ICC.mk: -------------------------------------------------------------------------------- 1 | CC = icc 2 | AS = icc 3 | CFLAGS = -O3 4 | LFLAGS = -shared 5 | # Kernel groups built with icc: one wildcard per directory below $(SRC_DIR) 6 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/scalar/*.S)) 7 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/SSE4.2/*.S)) 8 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX/*.S)) 9 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/FMA/*.S)) 10 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/AVX-512/*.S)) 11 | -------------------------------------------------------------------------------- /include_MIC.mk: -------------------------------------------------------------------------------- 1 | CC = icc 2 | AS = icc 3 | CFLAGS = -O3 -mmic 4 | LFLAGS = -shared -mmic 5 | # NOTE(review): the source tree lists no $(SRC_DIR)/IMCI directory, so this wildcard matches nothing - confirm 6 | KERNELS += $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/IMCI/*.S)) 7 | -------------------------------------------------------------------------------- /include_POWER8.mk: -------------------------------------------------------------------------------- 1 | CC = xlc 2 | AS = xlc 3 | CFLAGS = -O3 4 | LFLAGS = -shared 5 | 6 | KERNELS = $(patsubst $(SRC_DIR)/%.S, %.so, $(wildcard $(SRC_DIR)/VSX/*.S)) 7 | -------------------------------------------------------------------------------- /src/AVX-512/vaddpd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | 
latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 31 | vmovapd zmm1, zmm0 32 | loop: # throughput body: destinations are distinct and never used as sources, so the copies do not depend on each other 33 | inc i 34 | INSTR zmm3, zmm0, zmm1 35 | INSTR zmm4, zmm1, zmm0 36 | INSTR zmm5, zmm0, zmm2 # NOTE(review): zmm2 is read here but never written in this file - confirm that is intended 37 | cmp i, N 38 | INSTR zmm6, zmm2, zmm0 39 | INSTR zmm7, zmm1, zmm2 40 | INSTR zmm8, zmm2, zmm1 41 | jl loop 42 | done: 43 | mov rsp, rbp 44 | pop rbp 45 | ret 46 | .size latency, .-latency 47 | -------------------------------------------------------------------------------- /src/AVX-512/vaddpd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 31 | vmovapd zmm1, zmm0 32 | 33 | # Mark registers as AVX-512 34 | vmovapd zmm0, zmm0 35 | vmovapd zmm1, zmm1 36 | 37 | loop: # latency body: each copy reads and writes zmm0, so the copies form one serial dependency chain 38 | inc i 39 | INSTR zmm0, zmm0, zmm1 40 | INSTR zmm0, zmm0, zmm1 41 | INSTR zmm0, zmm0, zmm1 42 | cmp i, N 43 | INSTR 
zmm0, zmm0, zmm1 44 | INSTR zmm0, zmm0, zmm1 45 | INSTR zmm0, zmm0, zmm1 46 | jl loop 47 | done: 48 | mov rsp, rbp 49 | pop rbp 50 | ret 51 | .size latency, .-latency 52 | -------------------------------------------------------------------------------- /src/AVX-512/vaddps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy SP 1.0 31 | vmovaps zmm1, zmm0 32 | loop: # throughput body: destinations are distinct and never used as sources, so the copies do not depend on each other 33 | inc i 34 | INSTR zmm3, zmm0, zmm1 35 | INSTR zmm4, zmm1, zmm0 36 | INSTR zmm5, zmm0, zmm2 # NOTE(review): zmm2 is read here but never written in this file - confirm that is intended 37 | cmp i, N 38 | INSTR zmm6, zmm2, zmm0 39 | INSTR zmm7, zmm1, zmm2 40 | INSTR zmm8, zmm2, zmm1 41 | jl loop 42 | done: 43 | mov rsp, rbp 44 | pop rbp 45 | ret 46 | .size latency, .-latency 47 | -------------------------------------------------------------------------------- /src/AVX-512/vaddps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | 
vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy SP 1.0 31 | vmovaps zmm1, zmm0 32 | 33 | # Mark registers as AVX-512 34 | vmovapd zmm0, zmm0 35 | vmovapd zmm1, zmm1 36 | 37 | loop: # latency body: each copy reads and writes zmm0, so the copies form one serial dependency chain 38 | inc i 39 | INSTR zmm0, zmm0, zmm1 40 | INSTR zmm0, zmm0, zmm1 41 | INSTR zmm0, zmm0, zmm1 42 | cmp i, N 43 | INSTR zmm0, zmm0, zmm1 44 | INSTR zmm0, zmm0, zmm1 45 | INSTR zmm0, zmm0, zmm1 46 | jl loop 47 | done: 48 | mov rsp, rbp 49 | pop rbp 50 | ret 51 | .size latency, .-latency 52 | -------------------------------------------------------------------------------- /src/AVX-512/vdivpd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | 31 | vaddpd zmm1, zmm0, zmm0 # create 2.0 32 | vaddpd zmm2, zmm0, zmm1 # create 3.0 33 | vaddpd zmm4, zmm1, zmm1 # create 4.0 34 | vaddpd zmm4, zmm4, zmm4 # create 8.0 35 | vaddpd zmm4, zmm4, zmm4 # create 16.0 36 | vaddpd zmm4, zmm4, zmm4 # create 32.0 37 | vaddpd zmm4, zmm4, zmm4 # 
create 64.0 38 | vaddpd zmm4, zmm4, zmm4 # create 128.0 39 | vaddpd zmm4, zmm4, zmm4 # create 256.0 40 | vaddpd zmm4, zmm4, zmm4 # create 512.0 41 | vaddpd zmm4, zmm4, zmm4 # create 1024.0 42 | vdivpd zmm1, zmm4, zmm2 # create 341.3333 43 | vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 44 | vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 45 | loop: # throughput body: destinations are distinct and never used as sources, so the copies do not depend on each other 46 | inc i 47 | INSTR zmm3, zmm0, zmm1 48 | INSTR zmm4, zmm1, zmm0 49 | INSTR zmm5, zmm0, zmm2 50 | cmp i, N 51 | INSTR zmm6, zmm2, zmm0 52 | INSTR zmm7, zmm1, zmm2 53 | INSTR zmm8, zmm2, zmm1 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/AVX-512/vdivpd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | 31 | vaddpd zmm1, zmm0, zmm0 # create 2.0 32 | vaddpd zmm2, zmm0, zmm1 # create 3.0 33 | vaddpd zmm4, zmm1, zmm1 # create 4.0 34 | vaddpd zmm4, zmm4, zmm4 # create 8.0 35 | vaddpd zmm4, zmm4, zmm4 # create 16.0 36 | vaddpd zmm4, zmm4, zmm4 # create 32.0 37 | vaddpd zmm4, zmm4, zmm4 # create 64.0 38 | vaddpd zmm4, zmm4, zmm4 # create 128.0 39 | vaddpd zmm4, zmm4, zmm4 # create 256.0 40 | vaddpd zmm4, zmm4, zmm4 # create 512.0 41 | vaddpd 
zmm4, zmm4, zmm4 # create 1024.0 42 | vdivpd zmm1, zmm4, zmm2 # create 341.3333 43 | vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 44 | vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 45 | loop: # latency body: serial chain through zmm0; the divisor alternates between zmm1 and zmm2, which the setup makes reciprocals of each other, so zmm0 keeps returning to its start value (up to rounding) 46 | inc i 47 | INSTR zmm0, zmm0, zmm1 48 | INSTR zmm0, zmm0, zmm2 49 | INSTR zmm0, zmm0, zmm1 50 | cmp i, N 51 | INSTR zmm0, zmm0, zmm2 52 | INSTR zmm0, zmm0, zmm1 53 | INSTR zmm0, zmm0, zmm2 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/AVX-512/vdivps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | 31 | vaddps zmm1, zmm0, zmm0 # create 2.0 32 | vaddps zmm2, zmm0, zmm1 # create 3.0 33 | vaddps zmm4, zmm1, zmm1 # create 4.0 34 | vaddps zmm4, zmm4, zmm4 # create 8.0 35 | vaddps zmm4, zmm4, zmm4 # create 16.0 36 | vaddps zmm4, zmm4, zmm4 # create 32.0 37 | vaddps zmm4, zmm4, zmm4 # create 64.0 38 | vaddps zmm4, zmm4, zmm4 # create 128.0 39 | vaddps zmm4, zmm4, zmm4 # create 256.0 40 | vaddps zmm4, zmm4, zmm4 # create 512.0 41 | vaddps zmm4, zmm4, zmm4 # create 1024.0 42 | vdivps zmm1, zmm4, zmm2 # create 341.3333 43 | vdivps zmm2, zmm0, zmm1 # create 1/341.3333 44 | vaddps zmm0, zmm1, 
zmm1 # create 2*341.3333 45 | loop: # throughput body: destinations are distinct and never used as sources, so the copies do not depend on each other 46 | inc i 47 | INSTR zmm3, zmm0, zmm1 48 | INSTR zmm4, zmm1, zmm2 49 | INSTR zmm5, zmm0, zmm2 50 | cmp i, N 51 | INSTR zmm6, zmm2, zmm0 52 | INSTR zmm7, zmm1, zmm2 53 | INSTR zmm8, zmm2, zmm1 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/AVX-512/vdivps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | 31 | vaddps zmm1, zmm0, zmm0 # create 2.0 32 | vaddps zmm2, zmm0, zmm1 # create 3.0 33 | vaddps zmm4, zmm1, zmm1 # create 4.0 34 | vaddps zmm4, zmm4, zmm4 # create 8.0 35 | vaddps zmm4, zmm4, zmm4 # create 16.0 36 | vaddps zmm4, zmm4, zmm4 # create 32.0 37 | vaddps zmm4, zmm4, zmm4 # create 64.0 38 | vaddps zmm4, zmm4, zmm4 # create 128.0 39 | vaddps zmm4, zmm4, zmm4 # create 256.0 40 | vaddps zmm4, zmm4, zmm4 # create 512.0 41 | vaddps zmm4, zmm4, zmm4 # create 1024.0 42 | vdivps zmm1, zmm4, zmm2 # create 341.3333 43 | vdivps zmm2, zmm0, zmm1 # create 1/341.3333 44 | vaddps zmm0, zmm1, zmm1 # create 2*341.3333 45 | loop: # latency body: serial chain through zmm0; the divisor alternates between zmm1 and zmm2, which the setup makes reciprocals of each other, so zmm0 keeps returning to its start value (up to rounding) 46 | inc i 47 | INSTR zmm0, zmm0, zmm1 48 | INSTR zmm0, zmm0, zmm2 49 | INSTR zmm0, zmm0, zmm1 50 | cmp i, N 51 | INSTR 
zmm0, zmm0, zmm2 52 | INSTR zmm0, zmm0, zmm1 53 | INSTR zmm0, zmm0, zmm2 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213pd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 31 | vmovapd zmm1, zmm0 32 | loop: # throughput body: 13 copies with distinct destinations; the 213 form also reads its destination, so each register carries its own chain across passes 33 | inc i 34 | INSTR zmm3, zmm0, zmm1 35 | INSTR zmm4, zmm1, zmm0 36 | INSTR zmm5, zmm0, zmm2 # NOTE(review): zmm2 and the destinations zmm3-zmm15 are read without being written in this file first - confirm that is intended 37 | INSTR zmm6, zmm2, zmm0 38 | INSTR zmm7, zmm1, zmm2 39 | INSTR zmm8, zmm2, zmm1 40 | INSTR zmm9, zmm2, zmm1 41 | cmp i, N 42 | INSTR zmm10, zmm2, zmm1 43 | INSTR zmm11, zmm2, zmm1 44 | INSTR zmm12, zmm2, zmm1 45 | INSTR zmm13, zmm2, zmm1 46 | INSTR zmm14, zmm2, zmm1 47 | INSTR zmm15, zmm2, zmm1 48 | jl loop 49 | done: 50 | mov rsp, rbp 51 | pop rbp 52 | ret 53 | .size latency, .-latency 54 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213pd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | 
ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 31 | vmovapd zmm1, zmm0 32 | loop: # latency body: zmm0 = zmm1*zmm0 + zmm1 on every copy, one serial dependency chain through zmm0 33 | inc i 34 | INSTR zmm0, zmm1, zmm1 35 | INSTR zmm0, zmm1, zmm1 36 | INSTR zmm0, zmm1, zmm1 37 | cmp i, N 38 | INSTR zmm0, zmm1, zmm1 39 | INSTR zmm0, zmm1, zmm1 40 | INSTR zmm0, zmm1, zmm1 41 | jl loop 42 | done: 43 | mov rsp, rbp 44 | pop rbp 45 | ret 46 | .size latency, .-latency 47 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213ps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy SP 1.0 31 | vmovaps zmm1, zmm0 32 | loop: # throughput body: 13 copies with distinct destinations; the 213 form also reads its destination, so each register carries its own chain across passes 33 | inc i 34 | INSTR zmm3, zmm0, zmm1 35 | INSTR zmm4, zmm1, zmm0 36 | INSTR zmm5, zmm0, zmm2 # NOTE(review): zmm2 and the destinations zmm3-zmm15 are read without being written in this file first - confirm that is intended 
37 | INSTR zmm6, zmm2, zmm0 38 | INSTR zmm7, zmm1, zmm2 39 | INSTR zmm8, zmm2, zmm1 40 | INSTR zmm9, zmm2, zmm1 41 | cmp i, N 42 | INSTR zmm10, zmm2, zmm1 43 | INSTR zmm11, zmm2, zmm1 44 | INSTR zmm12, zmm2, zmm1 45 | INSTR zmm13, zmm2, zmm1 46 | INSTR zmm14, zmm2, zmm1 47 | INSTR zmm15, zmm2, zmm1 48 | jl loop 49 | done: 50 | mov rsp, rbp 51 | pop rbp 52 | ret 53 | .size latency, .-latency 54 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213ps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy SP 1.0 31 | vmovaps zmm1, zmm0 32 | loop: # latency body: zmm0 = zmm1*zmm0 + zmm1 on every copy, one serial dependency chain through zmm0 33 | inc i 34 | INSTR zmm0, zmm1, zmm1 35 | INSTR zmm0, zmm1, zmm1 36 | INSTR zmm0, zmm1, zmm1 37 | cmp i, N 38 | INSTR zmm0, zmm1, zmm1 39 | INSTR zmm0, zmm1, zmm1 40 | INSTR zmm0, zmm1, zmm1 41 | jl loop 42 | done: 43 | mov rsp, rbp 44 | pop rbp 45 | ret 46 | .size latency, .-latency 47 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213sd-avx512-alt1.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213sd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | 
.globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 -- NOTE(review): the source below is zmm2, which is never written in this file, not zmm0; the following line then overwrites xmm1 with all ones - confirm this is intended for the alt1 variant 31 | vmovapd zmm1, zmm2 32 | vpcmpeqw xmm1, xmm1, xmm1 33 | 34 | loop: # latency body: xmm0 is the destination and also an input of every copy, one serial dependency chain through xmm0 35 | inc i 36 | INSTR xmm0, xmm1, xmm1 37 | INSTR xmm0, xmm1, xmm1 38 | INSTR xmm0, xmm1, xmm1 39 | cmp i, N 40 | INSTR xmm0, xmm1, xmm1 41 | INSTR xmm0, xmm1, xmm1 42 | INSTR xmm0, xmm1, xmm1 43 | jl loop 44 | done: 45 | mov rsp, rbp 46 | pop rbp 47 | ret 48 | .size latency, .-latency 49 | -------------------------------------------------------------------------------- /src/AVX-512/vfmadd213sd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213sd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | loop: # latency body: xmm0 is the destination and also an input of every copy, one serial dependency chain through xmm0 29 | inc i 30 | INSTR xmm0, xmm1, xmm1 31 | INSTR xmm0, xmm1, xmm1 32 | INSTR xmm0, xmm1, xmm1 33 | cmp i, N 34 | INSTR xmm0, xmm1, xmm1 35 | INSTR xmm0, xmm1, xmm1 36 | INSTR 
xmm0, xmm1, xmm1 37 | jl loop 38 | done: 39 | mov rsp, rbp 40 | pop rbp 41 | ret 42 | .size latency, .-latency 43 | -------------------------------------------------------------------------------- /src/AVX-512/vmulpd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # create AVX-512 DP 2.0 31 | vaddpd zmm1, zmm0, zmm0 32 | # create AVX-512 DP 0.5 33 | vdivpd zmm2, zmm0, zmm1 34 | loop: # throughput body: destinations are distinct and never used as sources, so the copies do not depend on each other 35 | inc i 36 | INSTR zmm3, zmm0, zmm1 37 | INSTR zmm4, zmm1, zmm0 38 | INSTR zmm5, zmm0, zmm2 39 | cmp i, N 40 | INSTR zmm6, zmm2, zmm0 41 | INSTR zmm7, zmm1, zmm2 42 | INSTR zmm8, zmm2, zmm1 43 | jl loop 44 | done: 45 | mov rsp, rbp 46 | pop rbp 47 | ret 48 | .size latency, .-latency 49 | -------------------------------------------------------------------------------- /src/AVX-512/vmulpd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # INSTR = instruction under test, NINST = copies of it per loop pass (exported as ninst), N = pass count in edi, i = pass counter in r8d 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE 
DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # create AVX-512 DP 2.0 31 | vaddpd zmm1, zmm0, zmm0 32 | # create AVX-512 DP 0.5 33 | vdivpd zmm2, zmm0, zmm1 34 | 35 | # Mark registers as AVX-512 36 | vmovapd zmm0, zmm0 37 | vmovapd zmm1, zmm1 38 | vmovapd zmm2, zmm2 39 | 40 | loop: 41 | inc i 42 | INSTR zmm0, zmm0, zmm1 43 | INSTR zmm0, zmm0, zmm2 44 | INSTR zmm0, zmm0, zmm1 45 | cmp i, N 46 | INSTR zmm0, zmm0, zmm2 47 | INSTR zmm0, zmm0, zmm1 48 | INSTR zmm0, zmm0, zmm2 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/AVX-512/vmulps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # create AVX-512 SP 2.0 31 | vaddps zmm1, zmm0, zmm0 32 | # create AVX-512 SP 0.5 33 | vdivps zmm2, zmm0, zmm1 34 | loop: 35 | inc i 36 | INSTR zmm3, zmm0, zmm1 37 | INSTR zmm4,
zmm1, zmm0 38 | INSTR zmm5, zmm0, zmm2 39 | cmp i, N 40 | INSTR zmm6, zmm2, zmm0 41 | INSTR zmm7, zmm1, zmm2 42 | INSTR zmm8, zmm2, zmm1 43 | jl loop 44 | done: 45 | mov rsp, rbp 46 | pop rbp 47 | ret 48 | .size latency, .-latency 49 | -------------------------------------------------------------------------------- /src/AVX-512/vmulps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # create AVX-512 SP 2.0 31 | vaddps zmm1, zmm0, zmm0 32 | # create AVX-512 SP 0.5 33 | vdivps zmm2, zmm0, zmm1 34 | 35 | # Mark registers as AVX-512 36 | vmovapd zmm0, zmm0 37 | vmovapd zmm1, zmm1 38 | vmovapd zmm2, zmm2 39 | 40 | loop: 41 | inc i 42 | INSTR zmm0, zmm0, zmm1 43 | INSTR zmm0, zmm0, zmm2 44 | INSTR zmm0, zmm0, zmm1 45 | cmp i, N 46 | INSTR zmm0, zmm0, zmm2 47 | INSTR zmm0, zmm0, zmm1 48 | INSTR zmm0, zmm0, zmm2 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/AVX-512/vmulsd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl
ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # create AVX-512 DP 2.0 31 | vaddpd zmm1, zmm0, zmm0 32 | # create AVX-512 DP 0.5 33 | vdivpd zmm2, zmm0, zmm1 34 | loop: 35 | inc i 36 | INSTR xmm0, xmm0, xmm1 37 | INSTR xmm0, xmm0, xmm2 38 | INSTR xmm0, xmm0, xmm1 39 | cmp i, N 40 | INSTR xmm0, xmm0, xmm2 41 | INSTR xmm0, xmm0, xmm1 42 | INSTR xmm0, xmm0, xmm2 43 | jl loop 44 | done: 45 | mov rsp, rbp 46 | pop rbp 47 | ret 48 | .size latency, .-latency 49 | -------------------------------------------------------------------------------- /src/AVX-512/vrcp14pd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcp14pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SSE DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | # expand from SSE to AVX 26 | vinsertf128 ymm0, ymm0, xmm0, 0x1 27 | # expand from AVX to AVX-512 28 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 29 | 30 | vaddpd zmm1, zmm0, zmm0 # create 2.0 31 | vaddpd zmm2, zmm0,
zmm1 # create 3.0 32 | vaddpd zmm4, zmm1, zmm1 # create 4.0 33 | vaddpd zmm4, zmm4, zmm4 # create 8.0 34 | vaddpd zmm4, zmm4, zmm4 # create 16.0 35 | vaddpd zmm4, zmm4, zmm4 # create 32.0 36 | vaddpd zmm4, zmm4, zmm4 # create 64.0 37 | vaddpd zmm4, zmm4, zmm4 # create 128.0 38 | vaddpd zmm4, zmm4, zmm4 # create 256.0 39 | vaddpd zmm4, zmm4, zmm4 # create 512.0 40 | vaddpd zmm4, zmm4, zmm4 # create 1024.0 41 | vdivpd zmm1, zmm4, zmm2 # create 341.3333 42 | vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 43 | vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 44 | loop: 45 | inc i 46 | INSTR zmm1, zmm0 47 | INSTR zmm2, zmm1 48 | INSTR zmm3, zmm2 49 | cmp i, N 50 | INSTR zmm4, zmm3 51 | INSTR zmm5, zmm4 52 | INSTR zmm0, zmm5 53 | jl loop 54 | done: 55 | mov rsp, rbp 56 | pop rbp 57 | ret 58 | .size latency, .-latency 59 | -------------------------------------------------------------------------------- /src/AVX-512/vrcp14ps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcp14ps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SSE SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | # expand from SSE to AVX 26 | vinsertf128 ymm0, ymm0, xmm0, 0x1 27 | # expand from AVX to AVX-512 28 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 29 | 30 | vaddps zmm1, zmm0, zmm0 # create 2.0 31 | vaddps zmm2, zmm0, zmm1 # create 3.0 32 | vaddps zmm4, zmm1, zmm1 # create 4.0 33 | vaddps zmm4, zmm4, zmm4 # create 8.0 34 | vaddps zmm4, zmm4, zmm4 # create 16.0 35 | vaddps zmm4, zmm4, zmm4 # create 32.0 36 | 
vaddps zmm4, zmm4, zmm4 # create 64.0 37 | vaddps zmm4, zmm4, zmm4 # create 128.0 38 | vaddps zmm4, zmm4, zmm4 # create 256.0 39 | vaddps zmm4, zmm4, zmm4 # create 512.0 40 | vaddps zmm4, zmm4, zmm4 # create 1024.0 41 | vdivps zmm1, zmm4, zmm2 # create 341.3333 42 | vdivps zmm2, zmm0, zmm1 # create 1/341.3333 43 | vaddps zmm0, zmm1, zmm1 # create 2*341.3333 44 | loop: 45 | inc i 46 | INSTR zmm1, zmm0 47 | INSTR zmm2, zmm1 48 | INSTR zmm3, zmm2 49 | cmp i, N 50 | INSTR zmm4, zmm3 51 | INSTR zmm5, zmm4 52 | INSTR zmm0, zmm5 53 | jl loop 54 | done: 55 | mov rsp, rbp 56 | pop rbp 57 | ret 58 | .size latency, .-latency 59 | -------------------------------------------------------------------------------- /src/AVX-512/vrsqrt14pd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrsqrt14pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 27 | 28 | vaddpd zmm1, zmm0, zmm0 # create 2.0 29 | vaddpd zmm2, zmm0, zmm1 # create 3.0 30 | vaddpd zmm4, zmm1, zmm1 # create 4.0 31 | vaddpd zmm4, zmm4, zmm4 # create 8.0 32 | vaddpd zmm4, zmm4, zmm4 # create 16.0 33 | vaddpd zmm4, zmm4, zmm4 # create 32.0 34 | vaddpd zmm4, zmm4, zmm4 # create 64.0 35 | vaddpd zmm4, zmm4, zmm4 # create 128.0 36 | vaddpd zmm4, zmm4, zmm4 # create 256.0 37 | vaddpd zmm4, zmm4, zmm4 # create 512.0 38 | vaddpd zmm4, zmm4, zmm4 # create 1024.0 39 | vdivpd zmm1, zmm4, zmm2 # create
341.3333 40 | vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 41 | vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 42 | vmovapd zmm1, zmm0 43 | vmovapd zmm2, zmm0 44 | vmovapd zmm3, zmm0 45 | vmovapd zmm4, zmm0 46 | vmovapd zmm5, zmm0 47 | loop: 48 | inc i 49 | INSTR zmm10, zmm0 50 | INSTR zmm11, zmm1 51 | INSTR zmm12, zmm2 52 | cmp i, N 53 | INSTR zmm13, zmm3 54 | INSTR zmm14, zmm4 55 | INSTR zmm15, zmm5 56 | jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/AVX-512/vrsqrt14ps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrsqrt14ps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 27 | 28 | vaddps zmm1, zmm0, zmm0 # create 2.0 29 | vaddps zmm2, zmm0, zmm1 # create 3.0 30 | vaddps zmm4, zmm1, zmm1 # create 4.0 31 | vaddps zmm4, zmm4, zmm4 # create 8.0 32 | vaddps zmm4, zmm4, zmm4 # create 16.0 33 | vaddps zmm4, zmm4, zmm4 # create 32.0 34 | vaddps zmm4, zmm4, zmm4 # create 64.0 35 | vaddps zmm4, zmm4, zmm4 # create 128.0 36 | vaddps zmm4, zmm4, zmm4 # create 256.0 37 | vaddps zmm4, zmm4, zmm4 # create 512.0 38 | vaddps zmm4, zmm4, zmm4 # create 1024.0 39 | vdivps zmm1, zmm4, zmm2 # create 341.3333 40 | vdivps zmm2, zmm0, zmm1 # create 1/341.3333 41 | vaddps zmm0, zmm1, zmm1 # create 2*341.3333 42 | vmovaps zmm1, zmm0 
43 | vmovaps zmm2, zmm0 44 | vmovaps zmm3, zmm0 45 | vmovaps zmm4, zmm0 46 | vmovaps zmm5, zmm0 47 | loop: 48 | inc i 49 | INSTR zmm10, zmm0 50 | INSTR zmm11, zmm1 51 | INSTR zmm12, zmm2 52 | cmp i, N 53 | INSTR zmm13, zmm3 54 | INSTR zmm14, zmm4 55 | INSTR zmm15, zmm5 56 | jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/AVX-512/vsqrtpd-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 27 | 28 | vaddpd zmm1, zmm0, zmm0 # create 2.0 29 | vaddpd zmm2, zmm0, zmm1 # create 3.0 30 | vaddpd zmm4, zmm1, zmm1 # create 4.0 31 | vaddpd zmm4, zmm4, zmm4 # create 8.0 32 | vaddpd zmm4, zmm4, zmm4 # create 16.0 33 | vaddpd zmm4, zmm4, zmm4 # create 32.0 34 | vaddpd zmm4, zmm4, zmm4 # create 64.0 35 | vaddpd zmm4, zmm4, zmm4 # create 128.0 36 | vaddpd zmm4, zmm4, zmm4 # create 256.0 37 | vaddpd zmm4, zmm4, zmm4 # create 512.0 38 | vaddpd zmm4, zmm4, zmm4 # create 1024.0 39 | vdivpd zmm1, zmm4, zmm2 # create 341.3333 40 | vdivpd zmm2, zmm0, zmm1 # create 1/341.3333 41 | vaddpd zmm0, zmm1, zmm1 # create 2*341.3333 42 | vmovapd zmm1, zmm0 43 | vmovapd zmm2, zmm0 44 | vmovapd zmm3, zmm0 45 | vmovapd zmm4, zmm0 46 | vmovapd zmm5, zmm0 47 | loop: 48 | inc i 49 | INSTR zmm10,
zmm0 50 | INSTR zmm11, zmm1 51 | INSTR zmm12, zmm2 52 | cmp i, N 53 | INSTR zmm13, zmm3 54 | INSTR zmm14, zmm4 55 | INSTR zmm15, zmm5 56 | jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/AVX-512/vsqrtps-avx512-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 27 | 28 | vaddps zmm1, zmm0, zmm0 # create 2.0 29 | vaddps zmm2, zmm0, zmm1 # create 3.0 30 | vaddps zmm4, zmm1, zmm1 # create 4.0 31 | vaddps zmm4, zmm4, zmm4 # create 8.0 32 | vaddps zmm4, zmm4, zmm4 # create 16.0 33 | vaddps zmm4, zmm4, zmm4 # create 32.0 34 | vaddps zmm4, zmm4, zmm4 # create 64.0 35 | vaddps zmm4, zmm4, zmm4 # create 128.0 36 | vaddps zmm4, zmm4, zmm4 # create 256.0 37 | vaddps zmm4, zmm4, zmm4 # create 512.0 38 | vaddps zmm4, zmm4, zmm4 # create 1024.0 39 | vdivps zmm1, zmm4, zmm2 # create 341.3333 40 | vdivps zmm2, zmm0, zmm1 # create 1/341.3333 41 | vaddps zmm0, zmm1, zmm1 # create 2*341.3333 42 | vmovaps zmm1, zmm0 43 | vmovaps zmm2, zmm0 44 | vmovaps zmm3, zmm0 45 | vmovaps zmm4, zmm0 46 | vmovaps zmm5, zmm0 47 | loop: 48 | inc i 49 | INSTR zmm10, zmm0 50 | INSTR zmm11, zmm1 51 | INSTR zmm12, zmm2 52 | cmp i, N 53 | INSTR zmm13, zmm3 54 | INSTR zmm14, zmm4 55 | INSTR zmm15, zmm5 56 
| jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/AVX/vaddpd-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy DP 1.0 29 | vmovapd ymm1, ymm0 30 | vmovapd ymm2, ymm0 31 | 32 | # Mark registers as AVX 33 | vmovapd ymm0, ymm0 34 | vmovapd ymm1, ymm1 35 | vmovapd ymm2, ymm2 36 | vmovapd ymm3, ymm3 37 | vmovapd ymm4, ymm4 38 | vmovapd ymm5, ymm5 39 | vmovapd ymm6, ymm6 40 | vmovapd ymm7, ymm7 41 | vmovapd ymm8, ymm8 42 | loop: 43 | inc i 44 | INSTR ymm3, ymm0, ymm1 45 | INSTR ymm4, ymm1, ymm0 46 | INSTR ymm5, ymm0, ymm2 47 | cmp i, N 48 | INSTR ymm6, ymm2, ymm0 49 | INSTR ymm7, ymm1, ymm2 50 | INSTR ymm8, ymm2, ymm1 51 | jl loop 52 | done: 53 | mov rsp, rbp 54 | pop rbp 55 | ret 56 | .size latency, .-latency 57 | -------------------------------------------------------------------------------- /src/AVX/vaddpd-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17
| push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy DP 1.0 29 | vmovapd ymm1, ymm0 30 | 31 | # Mark registers as AVX 32 | vmovapd ymm0, ymm0 33 | vmovapd ymm1, ymm1 34 | 35 | loop: 36 | inc i 37 | INSTR ymm0, ymm0, ymm1 38 | INSTR ymm0, ymm0, ymm1 39 | INSTR ymm0, ymm0, ymm1 40 | cmp i, N 41 | INSTR ymm0, ymm0, ymm1 42 | INSTR ymm0, ymm0, ymm1 43 | INSTR ymm0, ymm0, ymm1 44 | jl loop 45 | done: 46 | mov rsp, rbp 47 | pop rbp 48 | ret 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/AVX/vaddps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy SP 1.0 29 | vmovaps ymm1, ymm0 30 | vmovaps ymm2, ymm0 # ymm2 is a loop source operand too; give it a defined value 31 | loop: 32 | inc i 33 | INSTR ymm3, ymm0, ymm1 34 | INSTR ymm4, ymm1, ymm0 35 | INSTR ymm5, ymm0, ymm2 36 | cmp i, N 37 | INSTR ymm6, ymm2, ymm0 38 | INSTR ymm7, ymm1, ymm2 39 | INSTR ymm8, ymm2, ymm1 40 | jl loop 41 | done: 42 | mov rsp, rbp 43 | pop rbp 44 | ret 45 | .size latency, .-latency 46 |
-------------------------------------------------------------------------------- /src/AVX/vaddps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy SP 1.0 29 | vmovaps ymm1, ymm0 30 | 31 | # Mark registers as AVX 32 | vmovaps ymm0, ymm0 33 | vmovaps ymm1, ymm1 34 | 35 | loop: 36 | inc i 37 | INSTR ymm0, ymm0, ymm1 38 | INSTR ymm0, ymm0, ymm1 39 | INSTR ymm0, ymm0, ymm1 40 | cmp i, N 41 | INSTR ymm0, ymm0, ymm1 42 | INSTR ymm0, ymm0, ymm1 43 | INSTR ymm0, ymm0, ymm1 44 | jl loop 45 | done: 46 | mov rsp, rbp 47 | pop rbp 48 | ret 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/AVX/vdivpd-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading
mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddpd ymm1, ymm0, ymm0 # create 2.0 28 | vaddpd ymm2, ymm0, ymm1 # create 3.0 29 | vaddpd ymm4, ymm1, ymm1 # create 4.0 30 | vaddpd ymm4, ymm4, ymm4 # create 8.0 31 | vaddpd ymm4, ymm4, ymm4 # create 16.0 32 | vaddpd ymm4, ymm4, ymm4 # create 32.0 33 | vaddpd ymm4, ymm4, ymm4 # create 64.0 34 | vaddpd ymm4, ymm4, ymm4 # create 128.0 35 | vaddpd ymm4, ymm4, ymm4 # create 256.0 36 | vaddpd ymm4, ymm4, ymm4 # create 512.0 37 | vaddpd ymm4, ymm4, ymm4 # create 1024.0 38 | vdivpd ymm1, ymm4, ymm2 # create 341.3333 39 | vdivpd ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddpd ymm0, ymm1, ymm1 # create 2*341.3333 41 | loop: 42 | inc i 43 | INSTR ymm3, ymm0, ymm1 44 | INSTR ymm4, ymm1, ymm0 45 | INSTR ymm5, ymm0, ymm2 46 | cmp i, N 47 | INSTR ymm6, ymm2, ymm0 48 | INSTR ymm7, ymm1, ymm2 49 | INSTR ymm8, ymm2, ymm1 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/AVX/vdivpd-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddpd ymm1, ymm0, ymm0 # create 2.0 28 | vaddpd ymm2, ymm0, ymm1 # create 3.0 29 | vaddpd ymm4, ymm1, ymm1 # create 4.0 30 | vaddpd ymm4, ymm4, ymm4 # create 8.0 31 | vaddpd ymm4, ymm4, ymm4 # create 16.0
32 | vaddpd ymm4, ymm4, ymm4 # create 32.0 33 | vaddpd ymm4, ymm4, ymm4 # create 64.0 34 | vaddpd ymm4, ymm4, ymm4 # create 128.0 35 | vaddpd ymm4, ymm4, ymm4 # create 256.0 36 | vaddpd ymm4, ymm4, ymm4 # create 512.0 37 | vaddpd ymm4, ymm4, ymm4 # create 1024.0 38 | vdivpd ymm1, ymm4, ymm2 # create 341.3333 39 | vdivpd ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddpd ymm0, ymm1, ymm1 # create 2*341.3333 41 | loop: 42 | inc i 43 | INSTR ymm0, ymm0, ymm1 44 | INSTR ymm0, ymm0, ymm2 45 | INSTR ymm0, ymm0, ymm1 46 | cmp i, N 47 | INSTR ymm0, ymm0, ymm2 48 | INSTR ymm0, ymm0, ymm1 49 | INSTR ymm0, ymm0, ymm2 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/AVX/vdivps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, ymm4, ymm4 # create 1024.0 38 | vdivps 
ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | loop: 42 | inc i 43 | INSTR ymm3, ymm0, ymm1 44 | INSTR ymm4, ymm1, ymm2 45 | INSTR ymm5, ymm0, ymm2 46 | cmp i, N 47 | INSTR ymm6, ymm2, ymm0 48 | INSTR ymm7, ymm1, ymm2 49 | INSTR ymm8, ymm2, ymm1 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/AVX/vdivps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, ymm4, ymm4 # create 1024.0 38 | vdivps ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | loop: 42 | inc i 43 | INSTR ymm0, ymm0, ymm1 44 | INSTR ymm0, ymm0, ymm2 45 | INSTR ymm0, ymm0, ymm1 46 | cmp i, N 47 | INSTR ymm0, ymm0, 
ymm2 48 | INSTR ymm0, ymm0, ymm1 49 | INSTR ymm0, ymm0, ymm2 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/AVX/vmovapd-load-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovapd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR ymm0, [rip+PI] 28 | INSTR ymm1, [rip+PI] 29 | INSTR ymm2, [rip+PI] 30 | cmp i, N 31 | INSTR ymm3, [rip+PI] 32 | INSTR ymm4, [rip+PI] 33 | INSTR ymm5, [rip+PI] 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/AVX/vmovapd-store-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovapd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR [rip+PI], ymm0 28 | INSTR [rip+PI], ymm1 29 | INSTR [rip+PI], ymm2 30 | cmp i, N 31 | INSTR [rip+PI], ymm3 32 | INSTR [rip+PI], ymm4 33 | INSTR [rip+PI], ymm5 34 | jl loop 35 | done: 36 | mov rsp, 
rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/AVX/vmovupd-load-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovupd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR ymm0, [rip+PI] 28 | INSTR ymm1, [rip+PI] 29 | INSTR ymm2, [rip+PI] 30 | cmp i, N 31 | INSTR ymm3, [rip+PI] 32 | INSTR ymm4, [rip+PI] 33 | INSTR ymm5, [rip+PI] 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/AVX/vmovupd-store-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovupd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR [rip+PI], ymm0 28 | INSTR [rip+PI], ymm1 29 | INSTR [rip+PI], ymm2 30 | cmp i, N 31 | INSTR [rip+PI], ymm3 32 | INSTR [rip+PI], ymm4 33 | INSTR [rip+PI], ymm5 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | 
-------------------------------------------------------------------------------- /src/AVX/vmulpd-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # create DP 2.0 29 | vaddpd ymm1, ymm0, ymm0 30 | # create DP 0.5 31 | vdivpd ymm2, ymm0, ymm1 32 | 33 | # Mark registers as AVX 34 | vmovapd ymm0, ymm0 35 | vmovapd ymm1, ymm1 36 | vmovapd ymm2, ymm2 37 | vmovapd ymm3, ymm3 38 | vmovapd ymm4, ymm4 39 | vmovapd ymm5, ymm5 40 | vmovapd ymm6, ymm6 41 | vmovapd ymm7, ymm7 42 | vmovapd ymm8, ymm8 43 | 44 | loop: 45 | inc i 46 | INSTR ymm3, ymm0, ymm1 47 | INSTR ymm4, ymm1, ymm0 48 | INSTR ymm5, ymm0, ymm2 49 | cmp i, N 50 | INSTR ymm6, ymm2, ymm0 51 | INSTR ymm7, ymm1, ymm2 52 | INSTR ymm8, ymm2, ymm1 53 | jl loop 54 | done: 55 | mov rsp, rbp 56 | pop rbp 57 | ret 58 | .size latency, .-latency 59 | -------------------------------------------------------------------------------- /src/AVX/vmulpd-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N
21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # create DP 2.0 29 | vaddpd ymm1, ymm0, ymm0 30 | # create DP 0.5 31 | vdivpd ymm2, ymm0, ymm1 32 | 33 | # Mark registers as AVX 34 | vmovapd ymm0, ymm0 35 | vmovapd ymm1, ymm1 36 | vmovapd ymm2, ymm2 37 | 38 | loop: 39 | inc i 40 | INSTR ymm0, ymm0, ymm1 # multiply by 2.0 41 | INSTR ymm0, ymm0, ymm2 # multiply by 0.5, so ymm0 is back at its previous value 42 | INSTR ymm0, ymm0, ymm1 43 | cmp i, N 44 | INSTR ymm0, ymm0, ymm2 45 | INSTR ymm0, ymm0, ymm1 46 | INSTR ymm0, ymm0, ymm2 47 | jl loop # signed compare: consumes the flags set by the cmp above 48 | done: 49 | mov rsp, rbp 50 | pop rbp 51 | ret 52 | .size latency, .-latency 53 | -------------------------------------------------------------------------------- /src/AVX/vmulps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: every loop pass issues six vmulps with distinct destinations (ymm3-ymm8) and loop-invariant sources (ymm0-ymm2) 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # create SP 2.0 29 | vaddps ymm1, ymm0, ymm0 30 | # create SP 0.5 31 | vdivps ymm2, ymm0, ymm1 32 | loop: 33 | inc i 34 | INSTR ymm3, ymm0, ymm1 35 | INSTR ymm4, ymm1, ymm0 36 | INSTR ymm5, ymm0, ymm2 37 | cmp i, N 38 | INSTR ymm6, ymm2, ymm0 39 | INSTR ymm7, ymm1, ymm2 40 | INSTR ymm8, ymm2, ymm1 41 | jl loop # signed compare: consumes the flags set by the cmp above 42 | done: 43 
| mov rsp, rbp 44 | pop rbp 45 | ret 46 | .size latency, .-latency 47 | -------------------------------------------------------------------------------- /src/AVX/vmulps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vmulps of a loop pass read and write ymm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # create SP 2.0 29 | vaddps ymm1, ymm0, ymm0 30 | # create SP 0.5 31 | vdivps ymm2, ymm0, ymm1 32 | 33 | # Mark registers as AVX 34 | vmovaps ymm0, ymm0 35 | vmovaps ymm1, ymm1 36 | vmovaps ymm2, ymm2 37 | 38 | loop: 39 | inc i 40 | INSTR ymm0, ymm0, ymm1 # multiply by 2.0 41 | INSTR ymm0, ymm0, ymm2 # multiply by 0.5, so ymm0 is back at its previous value 42 | INSTR ymm0, ymm0, ymm1 43 | cmp i, N 44 | INSTR ymm0, ymm0, ymm2 45 | INSTR ymm0, ymm0, ymm1 46 | INSTR ymm0, ymm0, ymm2 47 | jl loop # signed compare: consumes the flags set by the cmp above 48 | done: 49 | mov rsp, rbp 50 | pop rbp 51 | ret 52 | .size latency, .-latency 53 | -------------------------------------------------------------------------------- /src/AVX/vrcpps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcpps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: six vrcpps per loop pass, sources ymm0-ymm5 and separate destinations ymm10-ymm15 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 
23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, ymm4, ymm4 # create 1024.0 38 | vdivps ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | vmovaps ymm1, ymm0 # ymm1-ymm5 all receive 2*341.3333; the intermediate values above are overwritten 42 | vmovaps ymm2, ymm0 43 | vmovaps ymm3, ymm0 44 | vmovaps ymm4, ymm0 45 | vmovaps ymm5, ymm0 46 | loop: 47 | inc i 48 | INSTR ymm10, ymm0 49 | INSTR ymm11, ymm1 50 | INSTR ymm12, ymm2 51 | cmp i, N 52 | INSTR ymm13, ymm3 53 | INSTR ymm14, ymm4 54 | INSTR ymm15, ymm5 55 | jl loop # signed compare: consumes the flags set by the cmp above 56 | done: 57 | mov rsp, rbp 58 | pop rbp 59 | ret 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/AVX/vrcpps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcpps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: six vrcpps per loop pass chained ymm0 -> ymm1 -> ymm2 -> ymm3 -> ymm4 -> ymm5 -> ymm0 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | 
vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, ymm4, ymm4 # create 1024.0 38 | vdivps ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | loop: 42 | inc i 43 | INSTR ymm1, ymm0 # each instruction below reads the register written by the one before it 44 | INSTR ymm2, ymm1 45 | INSTR ymm3, ymm2 46 | cmp i, N 47 | INSTR ymm4, ymm3 48 | INSTR ymm5, ymm4 49 | INSTR ymm0, ymm5 # closes the chain: the next pass starts from this result 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/AVX/vsqrtpd-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: six vsqrtpd per loop pass, sources ymm0-ymm5 and separate destinations ymm10-ymm15 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddpd ymm1, ymm0, ymm0 # create 2.0 28 | vaddpd ymm2, ymm0, ymm1 # create 3.0 29 | vaddpd ymm4, ymm1, ymm1 # create 4.0 30 | vaddpd ymm4, ymm4, ymm4 # create 8.0 31 | vaddpd ymm4, ymm4, ymm4 # create 16.0 32 | vaddpd ymm4, ymm4, ymm4 # create 32.0 33 | vaddpd 
ymm4, ymm4, ymm4 # create 64.0 34 | vaddpd ymm4, ymm4, ymm4 # create 128.0 35 | vaddpd ymm4, ymm4, ymm4 # create 256.0 36 | vaddpd ymm4, ymm4, ymm4 # create 512.0 37 | vaddpd ymm4, ymm4, ymm4 # create 1024.0 38 | vdivpd ymm1, ymm4, ymm2 # create 341.3333 39 | vdivpd ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddpd ymm0, ymm1, ymm1 # create 2*341.3333 41 | vmovapd ymm1, ymm0 # ymm1-ymm5 all receive 2*341.3333; the intermediate values above are overwritten 42 | vmovapd ymm2, ymm0 43 | vmovapd ymm3, ymm0 44 | vmovapd ymm4, ymm0 45 | vmovapd ymm5, ymm0 46 | loop: 47 | inc i 48 | INSTR ymm10, ymm0 49 | INSTR ymm11, ymm1 50 | INSTR ymm12, ymm2 51 | cmp i, N 52 | INSTR ymm13, ymm3 53 | INSTR ymm14, ymm4 54 | INSTR ymm15, ymm5 55 | jl loop # signed compare: consumes the flags set by the cmp above 56 | done: 57 | mov rsp, rbp 58 | pop rbp 59 | ret 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/AVX/vsqrtps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: six vsqrtps per loop pass, sources ymm0-ymm5 and separate destinations ymm10-ymm15 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, 
ymm4, ymm4 # create 1024.0 38 | vdivps ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | vmovaps ymm1, ymm0 # ymm1-ymm5 all receive 2*341.3333; the intermediate values above are overwritten 42 | vmovaps ymm2, ymm0 43 | vmovaps ymm3, ymm0 44 | vmovaps ymm4, ymm0 45 | vmovaps ymm5, ymm0 46 | loop: 47 | inc i 48 | INSTR ymm10, ymm0 49 | INSTR ymm11, ymm1 50 | INSTR ymm12, ymm2 51 | cmp i, N 52 | INSTR ymm13, ymm3 53 | INSTR ymm14, ymm4 54 | INSTR ymm15, ymm5 55 | jl loop # signed compare: consumes the flags set by the cmp above 56 | done: 57 | mov rsp, rbp 58 | pop rbp 59 | ret 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/AVX/vsqrtps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: six chained vsqrtps per loop pass, each followed by a dependent vaddps of the saved start value (ymm10), so the serial chain contains both instructions 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | vinsertf128 ymm0, ymm0, xmm0, 0x1 26 | 27 | vaddps ymm1, ymm0, ymm0 # create 2.0 28 | vaddps ymm2, ymm0, ymm1 # create 3.0 29 | vaddps ymm4, ymm1, ymm1 # create 4.0 30 | vaddps ymm4, ymm4, ymm4 # create 8.0 31 | vaddps ymm4, ymm4, ymm4 # create 16.0 32 | vaddps ymm4, ymm4, ymm4 # create 32.0 33 | vaddps ymm4, ymm4, ymm4 # create 64.0 34 | vaddps ymm4, ymm4, ymm4 # create 128.0 35 | vaddps ymm4, ymm4, ymm4 # create 256.0 36 | vaddps ymm4, ymm4, ymm4 # create 512.0 37 | vaddps ymm4, ymm4, ymm4 # create 1024.0 38 | vdivps ymm1, ymm4, ymm2 # create 341.3333 39 | vdivps ymm2, ymm0, ymm1 # create 1/341.3333 40 | vaddps ymm0, ymm1, ymm1 # create 2*341.3333 41 | vmovaps 
ymm10, ymm0 # save value 42 | loop: 43 | inc i 44 | INSTR ymm1, ymm0 45 | vaddps ymm1, ymm1, ymm10 46 | INSTR ymm2, ymm1 47 | vaddps ymm2, ymm2, ymm10 48 | INSTR ymm3, ymm2 49 | vaddps ymm3, ymm3, ymm10 50 | cmp i, N 51 | INSTR ymm4, ymm3 52 | vaddps ymm4, ymm4, ymm10 53 | INSTR ymm5, ymm4 54 | vaddps ymm5, ymm5, ymm10 55 | INSTR ymm0, ymm5 56 | vaddps ymm0, ymm0, ymm10 57 | jl loop 58 | done: 59 | mov rsp, rbp 60 | pop rbp 61 | ret 62 | .size latency, .-latency 63 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213pd-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213pd per loop pass, each accumulating into its own register (ymm3-ymm15) 6 | # Every register the loop reads (ymm0-ymm15) is preset to DP 1.0 so the run does not depend on stale register contents 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy DP 1.0 29 | vmovaps ymm1, ymm0 30 | # give the third source (ymm2) and every accumulator (ymm3-ymm15) a defined value; vfmadd213 reads its destination 31 | vmovapd ymm2, ymm0 32 | vmovapd ymm3, ymm0 33 | vmovapd ymm4, ymm0 34 | vmovapd ymm5, ymm0 35 | vmovapd ymm6, ymm0 36 | vmovapd ymm7, ymm0 37 | vmovapd ymm8, ymm0 38 | vmovapd ymm9, ymm0 39 | vmovapd ymm10, ymm0 40 | vmovapd ymm11, ymm0 41 | vmovapd ymm12, ymm0 42 | vmovapd ymm13, ymm0 43 | vmovapd ymm14, ymm0 44 | vmovapd ymm15, ymm0 45 | loop: 46 | inc i 47 | INSTR ymm3, ymm0, ymm1 48 | INSTR ymm4, ymm1, ymm0 49 | INSTR ymm5, ymm0, ymm2 50 | INSTR ymm6, ymm2, ymm0 51 | INSTR ymm7, ymm1, ymm2 52 | INSTR ymm8, ymm2, ymm1 53 | INSTR ymm9, ymm2, ymm1 54 | cmp i, N 55 | INSTR ymm10, ymm2, ymm1 56 | INSTR ymm11, ymm2, ymm1 57 | INSTR ymm12, ymm2, ymm1 58 | INSTR ymm13, ymm2, ymm1 59 | INSTR ymm14, ymm2, ymm1 60 | INSTR ymm15, ymm2, ymm1 61 | jl loop # signed compare: consumes the flags set by the cmp above 62 | done: 63 | mov rsp, rbp 64 | pop rbp 65 | ret 66 | .size latency, .-latency 67 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213pd-avx.S: 
-------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213pd of a loop pass accumulate into ymm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy DP 1.0 29 | vmovaps ymm1, ymm0 30 | 31 | # Mark registers AVX 32 | vmovapd ymm0, ymm0 33 | vmovapd ymm1, ymm1 34 | 35 | loop: 36 | inc i 37 | INSTR ymm0, ymm1, ymm1 # ymm0 = ymm1 * ymm0 + ymm1, i.e. ymm0 + 1.0 38 | INSTR ymm0, ymm1, ymm1 39 | INSTR ymm0, ymm1, ymm1 40 | cmp i, N 41 | INSTR ymm0, ymm1, ymm1 42 | INSTR ymm0, ymm1, ymm1 43 | INSTR ymm0, ymm1, ymm1 44 | jl loop # signed compare: consumes the flags set by the cmp above 45 | done: 46 | mov rsp, rbp 47 | pop rbp 48 | ret 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213pd-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213pd of a loop pass accumulate into zmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, 
xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy DP 1.0 31 | vmovapd zmm1, zmm0 32 | 33 | # Mark registers AVX-512 34 | vmovapd zmm0, zmm0 35 | vmovapd zmm1, zmm1 36 | 37 | loop: 38 | inc i 39 | INSTR zmm0, zmm1, zmm1 40 | INSTR zmm0, zmm1, zmm1 41 | INSTR zmm0, zmm1, zmm1 42 | cmp i, N 43 | INSTR zmm0, zmm1, zmm1 44 | INSTR zmm0, zmm1, zmm1 45 | INSTR zmm0, zmm1, zmm1 46 | jl loop 47 | done: 48 | mov rsp, rbp 49 | pop rbp 50 | ret 51 | .size latency, .-latency 52 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213pd-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213pd per loop pass, each accumulating into its own register (xmm3-xmm15) 6 | # Every register the loop reads (xmm0-xmm15) is preset to DP 1.0 so the run does not depend on stale register contents 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | # give the third source (xmm2) and every accumulator (xmm3-xmm15) a defined value; vfmadd213 reads its destination 29 | vmovapd xmm2, xmm0 30 | vmovapd xmm3, xmm0 31 | vmovapd xmm4, xmm0 32 | vmovapd xmm5, xmm0 33 | vmovapd xmm6, xmm0 34 | vmovapd xmm7, xmm0 35 | vmovapd xmm8, xmm0 36 | vmovapd xmm9, xmm0 37 | vmovapd xmm10, xmm0 38 | vmovapd xmm11, xmm0 39 | vmovapd xmm12, xmm0 40 | vmovapd xmm13, xmm0 41 | vmovapd xmm14, xmm0 42 | vmovapd xmm15, xmm0 43 | loop: 44 | inc i 45 | INSTR xmm3, xmm0, xmm1 46 | INSTR xmm4, xmm1, xmm0 47 | INSTR xmm5, xmm0, xmm2 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | INSTR xmm9, xmm2, xmm1 52 | cmp i, N 53 | INSTR xmm10, xmm2, xmm1 54 | INSTR xmm11, xmm2, xmm1 55 | INSTR xmm12, xmm2, xmm1 56 | INSTR xmm13, xmm2, xmm1 57 | INSTR xmm14, xmm2, xmm1 58 | INSTR xmm15, xmm2, xmm1 59 | jl loop # signed compare: consumes the flags set by the cmp above 60 | done: 61 | mov rsp, rbp 62 | pop rbp 63 | ret 64 | .size latency, .-latency 65 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213pd-sse.S: 
-------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213pd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213pd of a loop pass accumulate into xmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers SSE 30 | movapd xmm0, xmm0 31 | movapd xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1, xmm1 36 | INSTR xmm0, xmm1, xmm1 37 | INSTR xmm0, xmm1, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1, xmm1 40 | INSTR xmm0, xmm1, xmm1 41 | INSTR xmm0, xmm1, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ps-avx-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213ps per loop pass, each accumulating into its own register (ymm3-ymm15) 6 | # Every register the loop reads (ymm0-ymm15) is preset to SP 1.0 so the run does not depend on stale register contents 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy SP 1.0 29 | vmovaps ymm1, ymm0 30 | # give the third source (ymm2) and every accumulator (ymm3-ymm15) a defined value; vfmadd213 reads its destination 31 | vmovaps ymm2, ymm0 32 | vmovaps ymm3, ymm0 33 | vmovaps ymm4, ymm0 34 | vmovaps ymm5, ymm0 35 | vmovaps ymm6, ymm0 36 | vmovaps ymm7, ymm0 37 | vmovaps ymm8, ymm0 38 | vmovaps ymm9, ymm0 39 | vmovaps ymm10, ymm0 40 | vmovaps ymm11, ymm0 41 | vmovaps ymm12, ymm0 42 | vmovaps ymm13, ymm0 43 | vmovaps ymm14, ymm0 44 | vmovaps ymm15, ymm0 45 | loop: 46 | inc i 
47 | INSTR ymm3, ymm0, ymm1 48 | INSTR ymm4, ymm1, ymm0 49 | INSTR ymm5, ymm0, ymm2 50 | INSTR ymm6, ymm2, ymm0 51 | INSTR ymm7, ymm1, ymm2 52 | INSTR ymm8, ymm2, ymm1 53 | INSTR ymm9, ymm2, ymm1 54 | cmp i, N 55 | INSTR ymm10, ymm2, ymm1 56 | INSTR ymm11, ymm2, ymm1 57 | INSTR ymm12, ymm2, ymm1 58 | INSTR ymm13, ymm2, ymm1 59 | INSTR ymm14, ymm2, ymm1 60 | INSTR ymm15, ymm2, ymm1 61 | jl loop # signed compare: consumes the flags set by the cmp above 62 | done: 63 | mov rsp, rbp 64 | pop rbp 65 | ret 66 | .size latency, .-latency 67 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ps-avx.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213ps of a loop pass accumulate into ymm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # copy SP 1.0 29 | vmovaps ymm1, ymm0 30 | 31 | # Mark registers AVX 32 | vmovaps ymm0, ymm0 33 | vmovaps ymm1, ymm1 34 | 35 | loop: 36 | inc i 37 | INSTR ymm0, ymm1, ymm1 38 | INSTR ymm0, ymm1, ymm1 39 | INSTR ymm0, ymm1, ymm1 40 | cmp i, N 41 | INSTR ymm0, ymm1, ymm1 42 | INSTR ymm0, ymm1, ymm1 43 | INSTR ymm0, ymm1, ymm1 44 | jl loop 45 | done: 46 | mov rsp, rbp 47 | pop rbp 48 | ret 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ps-avx512.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 6 3 | 
#define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213ps of a loop pass accumulate into zmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SSE SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # expand from SSE to AVX 27 | vinsertf128 ymm0, ymm0, xmm0, 0x1 28 | # expand from AVX to AVX-512 29 | vinsertf64x4 zmm0, zmm0, ymm0, 0x1 30 | # copy SP 1.0 31 | vmovaps zmm1, zmm0 32 | 33 | # Mark registers AVX-512 34 | vmovaps zmm0, zmm0 35 | vmovaps zmm1, zmm1 36 | 37 | loop: 38 | inc i 39 | INSTR zmm0, zmm1, zmm1 40 | INSTR zmm0, zmm1, zmm1 41 | INSTR zmm0, zmm1, zmm1 42 | cmp i, N 43 | INSTR zmm0, zmm1, zmm1 44 | INSTR zmm0, zmm1, zmm1 45 | INSTR zmm0, zmm1, zmm1 46 | jl loop 47 | done: 48 | mov rsp, rbp 49 | pop rbp 50 | ret 51 | .size latency, .-latency 52 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213ps per loop pass, each accumulating into its own register (xmm3-xmm15) 6 | # Every register the loop reads (xmm0-xmm15) is preset to SP 1.0 so the run does not depend on stale register contents 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | # give the third source (xmm2) and every accumulator (xmm3-xmm15) a defined value; vfmadd213 reads its destination 29 | vmovaps xmm2, xmm0 30 | vmovaps xmm3, xmm0 31 | vmovaps xmm4, xmm0 32 | vmovaps xmm5, xmm0 33 | vmovaps xmm6, xmm0 34 | vmovaps xmm7, xmm0 35 | vmovaps xmm8, xmm0 36 | vmovaps xmm9, xmm0 37 | vmovaps xmm10, xmm0 38 | vmovaps xmm11, xmm0 39 | vmovaps xmm12, xmm0 40 | vmovaps xmm13, xmm0 41 | vmovaps xmm14, xmm0 42 | vmovaps xmm15, xmm0 43 | loop: 44 | inc i 45 | INSTR xmm3, xmm0, xmm1 46 | INSTR xmm4, xmm1, 
xmm0 47 | INSTR xmm5, xmm0, xmm2 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | INSTR xmm9, xmm2, xmm1 52 | cmp i, N 53 | INSTR xmm10, xmm2, xmm1 54 | INSTR xmm11, xmm2, xmm1 55 | INSTR xmm12, xmm2, xmm1 56 | INSTR xmm13, xmm2, xmm1 57 | INSTR xmm14, xmm2, xmm1 58 | INSTR xmm15, xmm2, xmm1 59 | jl loop # signed compare: consumes the flags set by the cmp above 60 | done: 61 | mov rsp, rbp 62 | pop rbp 63 | ret 64 | .size latency, .-latency 65 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213ps of a loop pass accumulate into xmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers SSE 30 | movaps xmm0, xmm0 31 | movaps xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1, xmm1 36 | INSTR xmm0, xmm1, xmm1 37 | INSTR xmm0, xmm1, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1, xmm1 40 | INSTR xmm0, xmm1, xmm1 41 | INSTR xmm0, xmm1, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213sd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213sd 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213sd per loop pass, each accumulating into its own register (xmm3-xmm15) 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 
| .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | # give the third source (xmm2) and every accumulator (xmm3-xmm15) a defined value; vfmadd213 reads its destination 29 | vmovaps xmm2, xmm0 30 | vmovaps xmm3, xmm0 31 | vmovaps xmm4, xmm0 32 | vmovaps xmm5, xmm0 33 | vmovaps xmm6, xmm0 34 | vmovaps xmm7, xmm0 35 | vmovaps xmm8, xmm0 36 | vmovaps xmm9, xmm0 37 | vmovaps xmm10, xmm0 38 | vmovaps xmm11, xmm0 39 | vmovaps xmm12, xmm0 40 | vmovaps xmm13, xmm0 41 | vmovaps xmm14, xmm0 42 | vmovaps xmm15, xmm0 43 | loop: 44 | inc i 45 | INSTR xmm3, xmm0, xmm1 46 | INSTR xmm4, xmm1, xmm0 47 | INSTR xmm5, xmm0, xmm2 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | INSTR xmm9, xmm2, xmm1 52 | cmp i, N 53 | INSTR xmm10, xmm2, xmm1 54 | INSTR xmm11, xmm2, xmm1 55 | INSTR xmm12, xmm2, xmm1 56 | INSTR xmm13, xmm2, xmm1 57 | INSTR xmm14, xmm2, xmm1 58 | INSTR xmm15, xmm2, xmm1 59 | jl loop # signed compare: consumes the flags set by the cmp above 60 | done: 61 | mov rsp, rbp 62 | pop rbp 63 | ret 64 | .size latency, .-latency 65 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213sd.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213sd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213sd of a loop pass accumulate into xmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers scalar 30 | movsd xmm0, xmm0 31 | movsd xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1, xmm1 36 | INSTR xmm0, xmm1, xmm1 37 | INSTR xmm0, xmm1, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1, xmm1 40 | 
INSTR xmm0, xmm1, xmm1 41 | INSTR xmm0, xmm1, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ss-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ss 2 | #define NINST 13 3 | #define N edi 4 | #define i r8d 5 | # Throughput kernel: thirteen vfmadd213ss per loop pass, each accumulating into its own register (xmm3-xmm15) 6 | # Every register the loop reads (xmm0-xmm15) is preset to SP 1.0 so the run does not depend on stale register contents 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | # give the third source (xmm2) and every accumulator (xmm3-xmm15) a defined value; vfmadd213 reads its destination 29 | vmovaps xmm2, xmm0 30 | vmovaps xmm3, xmm0 31 | vmovaps xmm4, xmm0 32 | vmovaps xmm5, xmm0 33 | vmovaps xmm6, xmm0 34 | vmovaps xmm7, xmm0 35 | vmovaps xmm8, xmm0 36 | vmovaps xmm9, xmm0 37 | vmovaps xmm10, xmm0 38 | vmovaps xmm11, xmm0 39 | vmovaps xmm12, xmm0 40 | vmovaps xmm13, xmm0 41 | vmovaps xmm14, xmm0 42 | vmovaps xmm15, xmm0 43 | loop: 44 | inc i 45 | INSTR xmm3, xmm0, xmm1 46 | INSTR xmm4, xmm1, xmm0 47 | INSTR xmm5, xmm0, xmm2 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | INSTR xmm9, xmm2, xmm1 52 | cmp i, N 53 | INSTR xmm10, xmm2, xmm1 54 | INSTR xmm11, xmm2, xmm1 55 | INSTR xmm12, xmm2, xmm1 56 | INSTR xmm13, xmm2, xmm1 57 | INSTR xmm14, xmm2, xmm1 58 | INSTR xmm15, xmm2, xmm1 59 | jl loop # signed compare: consumes the flags set by the cmp above 60 | done: 61 | mov rsp, rbp 62 | pop rbp 63 | ret 64 | .size latency, .-latency 65 | -------------------------------------------------------------------------------- /src/FMA/vfmadd213ss.S: -------------------------------------------------------------------------------- 1 | #define INSTR vfmadd213ss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | # Latency kernel: all six vfmadd213ss of a loop pass accumulate into xmm0, forming one serial dependency chain 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 
| # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers scalar 30 | movss xmm0, xmm0 31 | movss xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1, xmm1 # xmm0 = xmm1 * xmm0 + xmm1 in the low element, i.e. xmm0 + 1.0 36 | INSTR xmm0, xmm1, xmm1 37 | INSTR xmm0, xmm1, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1, xmm1 40 | INSTR xmm0, xmm1, xmm1 41 | INSTR xmm0, xmm1, xmm1 42 | jl loop # signed compare: consumes the flags set by the cmp above 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/NEON/mov-x_x-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR mov 2 | #define NINST 6 3 | #define N x0 4 | # Latency kernel: six mov per loop pass alternating x0 <- x1 and x1 <- x0, so each one reads the register written by the previous one 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .text 10 | .globl latency 11 | .type latency, @function 12 | .align 2 13 | latency: 14 | 15 | # push callee-save registers onto stack 16 | sub sp, sp, #64 17 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 18 | sub sp, sp, #64 19 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 20 | sub sp, sp, #64 21 | st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp] 22 | sub sp, sp, #64 23 | st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp] 24 | sub sp, sp, #64 25 | st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp] 26 | sub sp, sp, #64 27 | st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp] 28 | stp x19, x20, [sp, -96]! 
29 | stp x21, x22, [sp, 16] 30 | stp x23, x24, [sp, 32] 31 | stp x25, x26, [sp, 48] 32 | stp x27, x28, [sp, 64] 33 | stp x29, x30, [sp, 80] 34 | 35 | mov x4, N 36 | cbz x4, done // zero count: skip the loop, otherwise subs would wrap below zero and bne would keep looping 37 | mov x0, #1 38 | mov x1, #2 39 | loop: 40 | subs x4, x4, #1 41 | INSTR x0, x1 42 | INSTR x1, x0 43 | INSTR x0, x1 44 | INSTR x1, x0 45 | INSTR x0, x1 46 | INSTR x1, x0 47 | bne loop 48 | done: 49 | 50 | # pop callee-save registers from stack 51 | ldp x19, x20, [sp] 52 | ldp x21, x22, [sp, 16] 53 | ldp x23, x24, [sp, 32] 54 | ldp x25, x26, [sp, 48] 55 | ldp x27, x28, [sp, 64] 56 | ldp x29, x30, [sp, 80] 57 | add sp, sp, #96 58 | ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64 59 | ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64 60 | ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64 61 | ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64 62 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64 63 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64 64 | 65 | ret 66 | 67 | .size latency, .-latency 68 | -------------------------------------------------------------------------------- /src/NEON/neg-d_d-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR neg 2 | #define NINST 6 3 | #define N x0 4 | # Latency kernel: six neg per loop pass alternating d0 <- d1 and d1 <- d0, so each one reads the register written by the previous one 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .text 10 | .globl latency 11 | .type latency, @function 12 | .align 2 13 | latency: 14 | 15 | # push callee-save registers onto stack 16 | sub sp, sp, #64 17 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 18 | sub sp, sp, #64 19 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 20 | sub sp, sp, #64 21 | st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp] 22 | sub sp, sp, #64 23 | st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp] 24 | sub sp, sp, #64 25 | st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp] 26 | sub sp, sp, #64 27 | st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp] 28 | stp x19, x20, [sp, -96]! 
29 | stp x21, x22, [sp, 16] 30 | stp x23, x24, [sp, 32] 31 | stp x25, x26, [sp, 48] 32 | stp x27, x28, [sp, 64] 33 | stp x29, x30, [sp, 80] 34 | 35 | mov x4, N 36 | 37 | fmov d0, #1.00000000 38 | fmov d1, #1.00000000 39 | loop: 40 | subs x4, x4, #1 41 | INSTR d0, d1 42 | INSTR d1, d0 43 | INSTR d0, d1 44 | INSTR d1, d0 45 | INSTR d0, d1 46 | INSTR d1, d0 47 | bne loop 48 | done: 49 | 50 | # pop callee-save registers from stack 51 | ldp x19, x20, [sp] 52 | ldp x21, x22, [sp, 16] 53 | ldp x23, x24, [sp, 32] 54 | ldp x25, x26, [sp, 48] 55 | ldp x27, x28, [sp, 64] 56 | ldp x29, x30, [sp, 80] 57 | add sp, sp, #96 58 | ld1 {v28.2d, v29.2d, v30.2d, v31.2d}, [sp], #64 59 | ld1 {v24.2d, v25.2d, v26.2d, v27.2d}, [sp], #64 60 | ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [sp], #64 61 | ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [sp], #64 62 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64 63 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64 64 | 65 | ret 66 | 67 | .size latency, .-latency 68 | -------------------------------------------------------------------------------- /src/SSE4.2/vaddpd-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (10 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # copy SP 1.0 27 | vmovapd xmm1, xmm0 28 | vmovapd xmm2, xmm0 29 | 30 | # Mark registers as SSE 31 | movapd xmm0, xmm0 32 | movapd xmm1, xmm1 33 | movapd xmm2, xmm2 34 | movapd xmm3, xmm3 35 | movapd xmm4, xmm4 36 | movapd xmm5, xmm5 37 | 
movapd xmm6, xmm6 38 | movapd xmm7, xmm7 39 | movapd xmm8, xmm8 40 | 41 | loop: 42 | inc i 43 | INSTR xmm3, xmm0, xmm1 44 | INSTR xmm4, xmm1, xmm0 45 | INSTR xmm5, xmm0, xmm2 46 | cmp i, N 47 | INSTR xmm6, xmm2, xmm0 48 | INSTR xmm7, xmm1, xmm2 49 | INSTR xmm8, xmm2, xmm1 50 | jl loop 51 | done: 52 | mov rsp, rbp 53 | pop rbp 54 | ret 55 | .size latency, .-latency 56 | -------------------------------------------------------------------------------- /src/SSE4.2/vaddpd-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: # latency(N): loops until i reaches N; each pass issues NINST INSTR ops that all read and write xmm0 (one dependency chain) 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 (0x3FF0000000000000 in each 64-bit lane) 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovapd xmm1, xmm0 28 | 29 | # Mark registers as SSE 30 | movapd xmm0, xmm0 31 | movapd xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm0, xmm1 36 | INSTR xmm0, xmm0, xmm1 37 | INSTR xmm0, xmm0, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm0, xmm1 40 | INSTR xmm0, xmm0, xmm1 41 | INSTR xmm0, xmm0, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/SSE4.2/vaddps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | 
.align 32 16 | latency: # latency(N): loops until i reaches N; each pass issues NINST independent INSTR ops (distinct destinations xmm3..xmm8) 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 (0x3F800000 in each 32-bit lane) 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 into every register the loop reads as a source 27 | vmovaps xmm1, xmm0 28 | vmovaps xmm2, xmm0 # xmm2 is a loop source operand; seed it so it never holds caller-left NaN/denormal data 29 | loop: 30 | inc i 31 | INSTR xmm3, xmm0, xmm1 32 | INSTR xmm4, xmm1, xmm0 33 | INSTR xmm5, xmm0, xmm2 34 | cmp i, N 35 | INSTR xmm6, xmm2, xmm0 36 | INSTR xmm7, xmm1, xmm2 37 | INSTR xmm8, xmm2, xmm1 38 | jl loop 39 | done: 40 | mov rsp, rbp 41 | pop rbp 42 | ret 43 | .size latency, .-latency 44 | -------------------------------------------------------------------------------- /src/SSE4.2/vaddps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | loop: 29 | inc i 30 | INSTR xmm0, xmm0, xmm1 31 | INSTR xmm0, xmm0, xmm1 32 | INSTR xmm0, xmm0, xmm1 33 | cmp i, N 34 | INSTR xmm0, xmm0, xmm1 35 | INSTR xmm0, xmm0, xmm1 36 | INSTR xmm0, xmm0, xmm1 37 | jl loop 38 | done: 39 | mov rsp, rbp 40 | pop rbp 41 | ret 42 | .size latency, .-latency 43 | -------------------------------------------------------------------------------- /src/SSE4.2/vdivpd-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define
INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddpd xmm1, xmm0, xmm0 # create 2.0 27 | vaddpd xmm2, xmm0, xmm1 # create 3.0 28 | vaddpd xmm4, xmm1, xmm1 # create 4.0 29 | vaddpd xmm4, xmm4, xmm4 # create 8.0 30 | vaddpd xmm4, xmm4, xmm4 # create 16.0 31 | vaddpd xmm4, xmm4, xmm4 # create 32.0 32 | vaddpd xmm4, xmm4, xmm4 # create 64.0 33 | vaddpd xmm4, xmm4, xmm4 # create 128.0 34 | vaddpd xmm4, xmm4, xmm4 # create 256.0 35 | vaddpd xmm4, xmm4, xmm4 # create 512.0 36 | vaddpd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivpd xmm1, xmm4, xmm2 # create 341.3333 38 | vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 40 | loop: 41 | inc i 42 | INSTR xmm3, xmm0, xmm1 43 | INSTR xmm4, xmm1, xmm0 44 | INSTR xmm5, xmm0, xmm2 45 | cmp i, N 46 | INSTR xmm6, xmm2, xmm0 47 | INSTR xmm7, xmm1, xmm2 48 | INSTR xmm8, xmm2, xmm1 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SSE4.2/vdivpd-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | 
jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddpd xmm1, xmm0, xmm0 # create 2.0 27 | vaddpd xmm2, xmm0, xmm1 # create 3.0 28 | vaddpd xmm4, xmm1, xmm1 # create 4.0 29 | vaddpd xmm4, xmm4, xmm4 # create 8.0 30 | vaddpd xmm4, xmm4, xmm4 # create 16.0 31 | vaddpd xmm4, xmm4, xmm4 # create 32.0 32 | vaddpd xmm4, xmm4, xmm4 # create 64.0 33 | vaddpd xmm4, xmm4, xmm4 # create 128.0 34 | vaddpd xmm4, xmm4, xmm4 # create 256.0 35 | vaddpd xmm4, xmm4, xmm4 # create 512.0 36 | vaddpd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivpd xmm1, xmm4, xmm2 # create 341.3333 38 | vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 40 | loop: 41 | inc i 42 | INSTR xmm0, xmm0, xmm1 43 | INSTR xmm0, xmm0, xmm2 44 | INSTR xmm0, xmm0, xmm1 45 | cmp i, N 46 | INSTR xmm0, xmm0, xmm2 47 | INSTR xmm0, xmm0, xmm1 48 | INSTR xmm0, xmm0, xmm2 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SSE4.2/vdivps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps 
xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | loop: 41 | inc i 42 | INSTR xmm3, xmm0, xmm1 43 | INSTR xmm4, xmm1, xmm0 44 | INSTR xmm5, xmm0, xmm2 45 | cmp i, N 46 | INSTR xmm6, xmm2, xmm0 47 | INSTR xmm7, xmm1, xmm2 48 | INSTR xmm8, xmm2, xmm1 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SSE4.2/vdivps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, 
xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | loop: 41 | inc i 42 | INSTR xmm0, xmm0, xmm1 43 | INSTR xmm0, xmm0, xmm2 44 | INSTR xmm0, xmm0, xmm1 45 | cmp i, N 46 | INSTR xmm0, xmm0, xmm2 47 | INSTR xmm0, xmm0, xmm1 48 | INSTR xmm0, xmm0, xmm2 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SSE4.2/vmovapd-load-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovapd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR xmm0, [rip+PI] 28 | INSTR xmm1, [rip+PI] 29 | INSTR xmm2, [rip+PI] 30 | cmp i, N 31 | INSTR xmm3, [rip+PI] 32 | INSTR xmm4, [rip+PI] 33 | INSTR xmm5, [rip+PI] 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/SSE4.2/vmovapd-store-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovapd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 
0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR [rip+PI], xmm0 28 | INSTR [rip+PI], xmm1 29 | INSTR [rip+PI], xmm2 30 | cmp i, N 31 | INSTR [rip+PI], xmm3 32 | INSTR [rip+PI], xmm4 33 | INSTR [rip+PI], xmm5 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/SSE4.2/vmovupd-load-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovupd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR xmm0, [rip+PI] 28 | INSTR xmm1, [rip+PI] 29 | INSTR xmm2, [rip+PI] 30 | cmp i, N 31 | INSTR xmm3, [rip+PI] 32 | INSTR xmm4, [rip+PI] 33 | INSTR xmm5, [rip+PI] 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/SSE4.2/vmovupd-store-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmovupd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .align 32 13 | PI: 14 | .long 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9, 0xf01b866e,0x400921f9 15 | .text 16 | .globl latency 17 | .type latency, @function 18 | .align 
32 19 | latency: 20 | push rbp 21 | mov rbp, rsp 22 | xor i, i 23 | test N, N 24 | jle done 25 | loop: 26 | inc i 27 | INSTR [rip+PI], xmm0 28 | INSTR [rip+PI], xmm1 29 | INSTR [rip+PI], xmm2 30 | cmp i, N 31 | INSTR [rip+PI], xmm3 32 | INSTR [rip+PI], xmm4 33 | INSTR [rip+PI], xmm5 34 | jl loop 35 | done: 36 | mov rsp, rbp 37 | pop rbp 38 | ret 39 | .size latency, .-latency 40 | -------------------------------------------------------------------------------- /src/SSE4.2/vmulpd-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # create SP 2.0 27 | vaddpd xmm1, xmm0, xmm0 28 | # create SP 0.5 29 | vdivpd xmm2, xmm0, xmm1 30 | 31 | # Mark registers as SSE 32 | movapd xmm0, xmm0 33 | movapd xmm1, xmm1 34 | movapd xmm2, xmm2 35 | movapd xmm3, xmm3 36 | movapd xmm4, xmm4 37 | movapd xmm5, xmm5 38 | movapd xmm6, xmm6 39 | movapd xmm7, xmm7 40 | movapd xmm8, xmm8 41 | 42 | loop: 43 | inc i 44 | INSTR xmm3, xmm0, xmm1 45 | INSTR xmm4, xmm1, xmm0 46 | INSTR xmm5, xmm0, xmm2 47 | cmp i, N 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | jl loop 52 | done: 53 | mov rsp, rbp 54 | pop rbp 55 | ret 56 | .size latency, .-latency 57 | -------------------------------------------------------------------------------- /src/SSE4.2/vmulpd-sse.S: -------------------------------------------------------------------------------- 1 | #define 
INSTR vmulpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: # latency(N): loops until i reaches N; each pass issues NINST INSTR ops chained through xmm0, alternating the factors 2.0 and 0.5 so the value stays bounded 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 (0x3FF0000000000000 in each 64-bit lane) 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # create DP 2.0 27 | vaddpd xmm1, xmm0, xmm0 28 | # create DP 0.5 29 | vdivpd xmm2, xmm0, xmm1 30 | 31 | # Mark registers as SSE 32 | movapd xmm0, xmm0 33 | movapd xmm1, xmm1 34 | movapd xmm2, xmm2 35 | 36 | loop: 37 | inc i 38 | INSTR xmm0, xmm0, xmm1 39 | INSTR xmm0, xmm0, xmm2 40 | INSTR xmm0, xmm0, xmm1 41 | cmp i, N 42 | INSTR xmm0, xmm0, xmm2 43 | INSTR xmm0, xmm0, xmm1 44 | INSTR xmm0, xmm0, xmm2 45 | jl loop 46 | done: 47 | mov rsp, rbp 48 | pop rbp 49 | ret 50 | .size latency, .-latency 51 | -------------------------------------------------------------------------------- /src/SSE4.2/vmulps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # create SP 2.0 27 | vaddps xmm1, xmm0, xmm0 28 | # create SP 0.5 29 | vdivps xmm2, xmm0, xmm1 30 | loop: 31 | inc i 32 | INSTR xmm3, xmm0, xmm1 33 | 
INSTR xmm4, xmm1, xmm0 34 | INSTR xmm5, xmm0, xmm2 35 | cmp i, N 36 | INSTR xmm6, xmm2, xmm0 37 | INSTR xmm7, xmm1, xmm2 38 | INSTR xmm8, xmm2, xmm1 39 | jl loop 40 | done: 41 | mov rsp, rbp 42 | pop rbp 43 | ret 44 | .size latency, .-latency 45 | -------------------------------------------------------------------------------- /src/SSE4.2/vmulps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # create SP 2.0 27 | vaddps xmm1, xmm0, xmm0 28 | # create SP 0.5 29 | vdivps xmm2, xmm0, xmm1 30 | 31 | # Mark registers as SSE 32 | movaps xmm0, xmm0 33 | movaps xmm1, xmm1 34 | movaps xmm2, xmm2 35 | 36 | loop: 37 | inc i 38 | INSTR xmm0, xmm0, xmm1 39 | INSTR xmm0, xmm0, xmm2 40 | INSTR xmm0, xmm0, xmm1 41 | cmp i, N 42 | INSTR xmm0, xmm0, xmm2 43 | INSTR xmm0, xmm0, xmm1 44 | INSTR xmm0, xmm0, xmm2 45 | jl loop 46 | done: 47 | mov rsp, rbp 48 | pop rbp 49 | ret 50 | .size latency, .-latency 51 | -------------------------------------------------------------------------------- /src/SSE4.2/vrcpps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcpps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, 
rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | vmovaps xmm1, xmm0 41 | vmovaps xmm2, xmm0 42 | vmovaps xmm3, xmm0 43 | vmovaps xmm4, xmm0 44 | vmovaps xmm5, xmm0 45 | loop: 46 | inc i 47 | INSTR xmm10, xmm0 48 | INSTR xmm11, xmm1 49 | INSTR xmm12, xmm2 50 | cmp i, N 51 | INSTR xmm13, xmm3 52 | INSTR xmm14, xmm4 53 | INSTR xmm15, xmm5 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/SSE4.2/vrcpps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vrcpps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # 
logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | loop: 41 | inc i 42 | INSTR xmm1, xmm0 43 | INSTR xmm2, xmm1 44 | INSTR xmm3, xmm2 45 | cmp i, N 46 | INSTR xmm4, xmm3 47 | INSTR xmm5, xmm4 48 | INSTR xmm0, xmm5 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SSE4.2/vsqrtpd-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddpd xmm1, xmm0, xmm0 # create 2.0 27 | vaddpd xmm2, xmm0, xmm1 # create 3.0 28 | vaddpd xmm4, xmm1, xmm1 # create 4.0 29 | vaddpd xmm4, xmm4, xmm4 # create 8.0 30 | vaddpd xmm4, xmm4, xmm4 # create 16.0 31 | vaddpd xmm4, xmm4, xmm4 # create 32.0 32 | vaddpd xmm4, 
xmm4, xmm4 # create 64.0 33 | vaddpd xmm4, xmm4, xmm4 # create 128.0 34 | vaddpd xmm4, xmm4, xmm4 # create 256.0 35 | vaddpd xmm4, xmm4, xmm4 # create 512.0 36 | vaddpd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivpd xmm1, xmm4, xmm2 # create 341.3333 38 | vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 40 | vmovapd xmm1, xmm0 41 | vmovapd xmm2, xmm0 42 | vmovapd xmm3, xmm0 43 | vmovapd xmm4, xmm0 44 | vmovapd xmm5, xmm0 45 | loop: 46 | inc i 47 | INSTR xmm10, xmm0 48 | INSTR xmm11, xmm1 49 | INSTR xmm12, xmm2 50 | cmp i, N 51 | INSTR xmm13, xmm3 52 | INSTR xmm14, xmm4 53 | INSTR xmm15, xmm5 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/SSE4.2/vsqrtpd-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtpd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddpd xmm1, xmm0, xmm0 # create 2.0 27 | vaddpd xmm2, xmm0, xmm1 # create 3.0 28 | vaddpd xmm4, xmm1, xmm1 # create 4.0 29 | vaddpd xmm4, xmm4, xmm4 # create 8.0 30 | vaddpd xmm4, xmm4, xmm4 # create 16.0 31 | vaddpd xmm4, xmm4, xmm4 # create 32.0 32 | vaddpd xmm4, xmm4, xmm4 # create 64.0 33 | vaddpd xmm4, xmm4, xmm4 # create 128.0 34 | vaddpd xmm4, xmm4, xmm4 # create 256.0 35 | vaddpd xmm4, xmm4, xmm4 # create 512.0 36 | vaddpd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivpd xmm1, 
xmm4, xmm2 # create 341.3333 38 | vdivpd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddpd xmm0, xmm1, xmm1 # create 2*341.3333 40 | vmovapd xmm10, xmm0 # save value 41 | loop: 42 | inc i 43 | INSTR xmm1, xmm0 44 | vaddpd xmm1, xmm1, xmm10 45 | INSTR xmm2, xmm1 46 | vaddpd xmm2, xmm2, xmm10 47 | INSTR xmm3, xmm2 48 | vaddpd xmm3, xmm3, xmm10 49 | cmp i, N 50 | INSTR xmm4, xmm3 51 | vaddpd xmm4, xmm4, xmm10 52 | INSTR xmm5, xmm4 53 | vaddpd xmm5, xmm5, xmm10 54 | INSTR xmm0, xmm5 55 | vaddpd xmm0, xmm0, xmm10 56 | jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/SSE4.2/vsqrtps-sse-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 
40 | vmovaps xmm1, xmm0 41 | vmovaps xmm2, xmm0 42 | vmovaps xmm3, xmm0 43 | vmovaps xmm4, xmm0 44 | vmovaps xmm5, xmm0 45 | loop: 46 | inc i 47 | INSTR xmm10, xmm0 48 | INSTR xmm11, xmm1 49 | INSTR xmm12, xmm2 50 | cmp i, N 51 | INSTR xmm13, xmm3 52 | INSTR xmm14, xmm4 53 | INSTR xmm15, xmm5 54 | jl loop 55 | done: 56 | mov rsp, rbp 57 | pop rbp 58 | ret 59 | .size latency, .-latency 60 | -------------------------------------------------------------------------------- /src/SSE4.2/vsqrtps-sse.S: -------------------------------------------------------------------------------- 1 | #define INSTR vsqrtps 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | vmovaps xmm10, xmm0 # save value 41 | loop: 42 | inc i 43 | INSTR xmm1, xmm0 44 | vaddps xmm1, xmm1, xmm10 45 | INSTR xmm2, xmm1 46 | vaddps xmm2, xmm2, xmm10 47 | INSTR xmm3, xmm2 48 | vaddps xmm3, xmm3, xmm10 49 | 
cmp i, N 50 | INSTR xmm4, xmm3 51 | vaddps xmm4, xmm4, xmm10 52 | INSTR xmm5, xmm4 53 | vaddps xmm5, xmm5, xmm10 54 | INSTR xmm0, xmm5 55 | vaddps xmm0, xmm0, xmm10 56 | jl loop 57 | done: 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/SVE/addpl-x_x_i-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR addpl 2 | #define NINST 8 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | 27 | mov x1, #1 28 | mov x2, #2 29 | mov x3, #3 30 | mov x5, #4 31 | loop: 32 | subs x4, x4, #1 33 | INSTR x1, x1, #1 34 | INSTR x1, x1, #1 35 | INSTR x1, x1, #1 36 | INSTR x1, x1, #1 37 | INSTR x1, x1, #1 38 | INSTR x1, x1, #1 39 | INSTR x1, x1, #1 40 | INSTR x1, x1, #1 41 | bne loop 42 | done: 43 | 44 | # pop callee-save registers from stack 45 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 46 | add sp, sp, #64 47 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 48 | add sp, sp, #64 49 | 50 | ret 51 | 52 | .size latency, .-latency 53 | -------------------------------------------------------------------------------- /src/SVE/addpl-x_x_i-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR addpl 2 | #define NINST 16 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, 
v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # x1, x2, x3 and x5 are the loop-invariant sources; p0 and z0 are set but not read by the loop 27 | mov x1, #1 28 | mov x2, #2 29 | mov x3, #3 30 | mov x5, #4 31 | loop: 32 | subs x4, x4, #1 33 | INSTR x6, x1, #1 34 | INSTR x7, x2, #1 35 | INSTR x8, x3, #1 36 | INSTR x9, x5, #1 37 | INSTR x10, x1, #1 38 | INSTR x11, x2, #1 39 | INSTR x12, x3, #1 40 | INSTR x13, x5, #1 41 | INSTR x6, x1, #1 42 | INSTR x7, x2, #1 43 | INSTR x8, x3, #1 44 | INSTR x9, x5, #1 45 | INSTR x10, x1, #1 46 | INSTR x11, x2, #1 47 | INSTR x12, x3, #1 48 | INSTR x13, x5, #1 49 | bne loop 50 | done: 51 | 52 | # pop callee-save registers from stack 53 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 54 | add sp, sp, #64 55 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 56 | add sp, sp, #64 57 | 58 | ret 59 | 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/SVE/fadd-zd_zd_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR fadd 2 | #define NINST 8 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times and every INSTR reads the z0 written by the previous one; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | fcpy z1.d, p0/m, #2.00000000 27 | fcpy z2.d, p0/m, #3.00000000 28 | fcpy z3.d, p0/m, #1.00000000 29 | fcpy z4.d, p0/m, #2.00000000 30 | fcpy z5.d, p0/m, #3.00000000 31 | fcpy z6.d, p0/m, #1.00000000 32 | fcpy z7.d, p0/m, #2.00000000 33 | # only z0 is used by the loop; z1-z7 are initialised but not read there 34 | loop: 35 | subs x4, x4, #1 36 | INSTR z0.d, z0.d, z0.d 37 | INSTR z0.d, z0.d, z0.d 38 | INSTR z0.d, z0.d, z0.d 39 | INSTR z0.d, z0.d, z0.d 40 | INSTR z0.d, z0.d, z0.d 41
| INSTR z0.d, z0.d, z0.d 42 | INSTR z0.d, z0.d, z0.d 43 | INSTR z0.d, z0.d, z0.d 44 | bne loop 45 | done: 46 | # pop callee-save registers from stack 47 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 48 | add sp, sp, #64 49 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 50 | add sp, sp, #64 51 | 52 | ret 53 | 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SVE/fadd-zd_zd_zd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR fadd 2 | #define NINST 13 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # z0 is the only source; the destinations z1-z13 include z8-z13, whose v8-v13 parts were saved above 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z1.d, z0.d, z0.d 30 | INSTR z2.d, z0.d, z0.d 31 | INSTR z3.d, z0.d, z0.d 32 | INSTR z4.d, z0.d, z0.d 33 | INSTR z5.d, z0.d, z0.d 34 | INSTR z6.d, z0.d, z0.d 35 | INSTR z7.d, z0.d, z0.d 36 | INSTR z8.d, z0.d, z0.d 37 | INSTR z9.d, z0.d, z0.d 38 | INSTR z10.d, z0.d, z0.d 39 | INSTR z11.d, z0.d, z0.d 40 | INSTR z12.d, z0.d, z0.d 41 | INSTR z13.d, z0.d, z0.d 42 | bne loop 43 | done: 44 | # pop callee-save registers from stack 45 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 46 | add sp, sp, #64 47 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 48 | add sp, sp, #64 49 | 50 | ret 51 | 52 | .size latency, .-latency 53 | -------------------------------------------------------------------------------- /src/SVE/fadda-d_p_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR fadda 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 |
ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times and every INSTR reads the d0 written by the previous one; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # d0 is both the accumulator input and the result of every INSTR; z0.d is the vector operand 27 | loop: 28 | subs x4, x4, #1 29 | INSTR d0, p0, d0, z0.d 30 | INSTR d0, p0, d0, z0.d 31 | INSTR d0, p0, d0, z0.d 32 | INSTR d0, p0, d0, z0.d 33 | INSTR d0, p0, d0, z0.d 34 | INSTR d0, p0, d0, z0.d 35 | bne loop 36 | done: 37 | # pop callee-save registers from stack 38 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 39 | add sp, sp, #64 40 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 41 | add sp, sp, #64 42 | 43 | ret 44 | 45 | .size latency, .-latency 46 | -------------------------------------------------------------------------------- /src/SVE/faddv-d_p_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR faddv 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # every INSTR reduces z0 into d0; the chain relies on d0 naming the low 64 bits of z0 27 | loop: 28 | subs x4, x4, #1 29 | INSTR d0, p0, z0.d 30 | INSTR d0, p0, z0.d 31 | INSTR d0, p0, z0.d 32 | INSTR d0, p0, z0.d 33 | INSTR d0, p0, z0.d 34 | INSTR d0, p0, z0.d 35 | bne loop 36 | done: 37 | # pop callee-save registers from stack 38 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 39 | add sp, sp, #64 40 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 41 | add sp, sp, #64 42 | 43 | ret 44 | 45 | .size latency, .-latency 46 |
-------------------------------------------------------------------------------- /src/SVE/fmad-zd_p_zd_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmad 2 | #define NINST 8 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times and every INSTR reads the z0 written by the previous one; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # z0 is destination and both sources of every INSTR, under the all-true predicate p0 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z0.d, p0/m, z0.d, z0.d 30 | INSTR z0.d, p0/m, z0.d, z0.d 31 | INSTR z0.d, p0/m, z0.d, z0.d 32 | INSTR z0.d, p0/m, z0.d, z0.d 33 | INSTR z0.d, p0/m, z0.d, z0.d 34 | INSTR z0.d, p0/m, z0.d, z0.d 35 | INSTR z0.d, p0/m, z0.d, z0.d 36 | INSTR z0.d, p0/m, z0.d, z0.d 37 | bne loop 38 | done: 39 | # pop callee-save registers from stack 40 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 41 | add sp, sp, #64 42 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 43 | add sp, sp, #64 44 | 45 | ret 46 | 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/SVE/fmad-zd_p_zd_zd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmad 2 | #define NINST 24 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # NOTE(review): z1-z24 are never initialised in this file although fmad also reads its destination register - confirm this is intended 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z1.d, p0/m,
z0.d, z0.d 30 | INSTR z2.d, p0/m, z0.d, z0.d 31 | INSTR z3.d, p0/m, z0.d, z0.d 32 | INSTR z4.d, p0/m, z0.d, z0.d 33 | INSTR z5.d, p0/m, z0.d, z0.d 34 | INSTR z6.d, p0/m, z0.d, z0.d 35 | INSTR z7.d, p0/m, z0.d, z0.d 36 | INSTR z8.d, p0/m, z0.d, z0.d 37 | INSTR z9.d, p0/m, z0.d, z0.d 38 | INSTR z10.d, p0/m, z0.d, z0.d 39 | INSTR z11.d, p0/m, z0.d, z0.d 40 | INSTR z12.d, p0/m, z0.d, z0.d 41 | INSTR z13.d, p0/m, z0.d, z0.d 42 | INSTR z14.d, p0/m, z0.d, z0.d 43 | INSTR z15.d, p0/m, z0.d, z0.d 44 | INSTR z16.d, p0/m, z0.d, z0.d 45 | INSTR z17.d, p0/m, z0.d, z0.d 46 | INSTR z18.d, p0/m, z0.d, z0.d 47 | INSTR z19.d, p0/m, z0.d, z0.d 48 | INSTR z20.d, p0/m, z0.d, z0.d 49 | INSTR z21.d, p0/m, z0.d, z0.d 50 | INSTR z22.d, p0/m, z0.d, z0.d 51 | INSTR z23.d, p0/m, z0.d, z0.d 52 | INSTR z24.d, p0/m, z0.d, z0.d 53 | bne loop 54 | done: 55 | # pop callee-save registers from stack (the loop above wrote z8-z15) 56 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 57 | add sp, sp, #64 58 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 59 | add sp, sp, #64 60 | 61 | ret 62 | 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/SVE/fmla-zd_p_zd_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmla 2 | #define NINST 8 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times and every INSTR reads the z0 written by the previous one; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # z0 is accumulator and both multiplicands of every INSTR, under the all-true predicate p0 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z0.d, p0/m, z0.d, z0.d 30 | INSTR z0.d, p0/m, z0.d, z0.d 31 | INSTR z0.d, p0/m, z0.d, z0.d 32 | INSTR z0.d, p0/m, z0.d, z0.d 33 | INSTR z0.d, p0/m, z0.d, z0.d 34 | INSTR z0.d, p0/m, z0.d, z0.d 35 |
INSTR z0.d, p0/m, z0.d, z0.d 36 | INSTR z0.d, p0/m, z0.d, z0.d 37 | bne loop 38 | done: 39 | # pop callee-save registers from stack 40 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 41 | add sp, sp, #64 42 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 43 | add sp, sp, #64 44 | 45 | ret 46 | 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/SVE/fmla-zd_p_zd_zd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmla 2 | #define NINST 24 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # NOTE(review): z1-z24 are never initialised in this file although fmla accumulates into its destination register - confirm this is intended 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z1.d, p0/m, z0.d, z0.d 30 | INSTR z2.d, p0/m, z0.d, z0.d 31 | INSTR z3.d, p0/m, z0.d, z0.d 32 | INSTR z4.d, p0/m, z0.d, z0.d 33 | INSTR z5.d, p0/m, z0.d, z0.d 34 | INSTR z6.d, p0/m, z0.d, z0.d 35 | INSTR z7.d, p0/m, z0.d, z0.d 36 | INSTR z8.d, p0/m, z0.d, z0.d 37 | INSTR z9.d, p0/m, z0.d, z0.d 38 | INSTR z10.d, p0/m, z0.d, z0.d 39 | INSTR z11.d, p0/m, z0.d, z0.d 40 | INSTR z12.d, p0/m, z0.d, z0.d 41 | INSTR z13.d, p0/m, z0.d, z0.d 42 | INSTR z14.d, p0/m, z0.d, z0.d 43 | INSTR z15.d, p0/m, z0.d, z0.d 44 | INSTR z16.d, p0/m, z0.d, z0.d 45 | INSTR z17.d, p0/m, z0.d, z0.d 46 | INSTR z18.d, p0/m, z0.d, z0.d 47 | INSTR z19.d, p0/m, z0.d, z0.d 48 | INSTR z20.d, p0/m, z0.d, z0.d 49 | INSTR z21.d, p0/m, z0.d, z0.d 50 | INSTR z22.d, p0/m, z0.d, z0.d 51 | INSTR z23.d, p0/m, z0.d, z0.d 52 | INSTR z24.d, p0/m, z0.d, z0.d 53 | bne loop 54 | done: 55 | # pop callee-save registers from stack (the loop above wrote z8-z15) 56 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 57 |
add sp, sp, #64 58 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 59 | add sp, sp, #64 60 | 61 | ret 62 | 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/SVE/fmul-zd_zd_zd-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmul 2 | #define NINST 8 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # latency test: the loop body runs N times and every INSTR reads the z0 written by the previous one; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | fcpy z1.d, p0/m, #2.00000000 27 | fcpy z2.d, p0/m, #3.00000000 28 | fcpy z3.d, p0/m, #1.00000000 29 | fcpy z4.d, p0/m, #2.00000000 30 | fcpy z5.d, p0/m, #3.00000000 31 | fcpy z6.d, p0/m, #1.00000000 32 | fcpy z7.d, p0/m, #2.00000000 33 | # only z0 is used by the loop; z1-z7 are initialised but not read there 34 | loop: 35 | subs x4, x4, #1 36 | INSTR z0.d, z0.d, z0.d 37 | INSTR z0.d, z0.d, z0.d 38 | INSTR z0.d, z0.d, z0.d 39 | INSTR z0.d, z0.d, z0.d 40 | INSTR z0.d, z0.d, z0.d 41 | INSTR z0.d, z0.d, z0.d 42 | INSTR z0.d, z0.d, z0.d 43 | INSTR z0.d, z0.d, z0.d 44 | bne loop 45 | done: 46 | # pop callee-save registers from stack 47 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 48 | add sp, sp, #64 49 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 50 | add sp, sp, #64 51 | 52 | ret 53 | 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/SVE/fmul-zd_zd_zd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR fmul 2 | #define NINST 13 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 |
latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # z0 is the only source; the destinations z1-z13 include z8-z13, whose v8-v13 parts were saved above 27 | loop: 28 | subs x4, x4, #1 29 | INSTR z1.d, z0.d, z0.d 30 | INSTR z2.d, z0.d, z0.d 31 | INSTR z3.d, z0.d, z0.d 32 | INSTR z4.d, z0.d, z0.d 33 | INSTR z5.d, z0.d, z0.d 34 | INSTR z6.d, z0.d, z0.d 35 | INSTR z7.d, z0.d, z0.d 36 | INSTR z8.d, z0.d, z0.d 37 | INSTR z9.d, z0.d, z0.d 38 | INSTR z10.d, z0.d, z0.d 39 | INSTR z11.d, z0.d, z0.d 40 | INSTR z12.d, z0.d, z0.d 41 | INSTR z13.d, z0.d, z0.d 42 | bne loop 43 | done: 44 | # pop callee-save registers from stack 45 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 46 | add sp, sp, #64 47 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 48 | add sp, sp, #64 49 | 50 | ret 51 | 52 | .size latency, .-latency 53 | -------------------------------------------------------------------------------- /src/SVE/incd-x-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR incd 2 | #define NINST 16 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times; INSTR updates each of x6-x13 in place twice per iteration; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | fcpy z0.d, p0/m, #1.00000000 26 | # x1, x2, x3, x5, p0 and z0 are set but not read by the loop; x6-x13 are not initialised before INSTR updates them 27 | mov x1, #1 28 | mov x2, #2 29 | mov x3, #3 30 | mov x5, #4 31 | loop: 32 | subs x4, x4, #1 33 | INSTR x6 34 | INSTR x7 35 | INSTR x8 36 | INSTR x9 37 | INSTR x10 38 | INSTR x11 39 | INSTR x12 40 | INSTR x13 41 | INSTR x6 42 | INSTR x7 43 | INSTR x8 44 | INSTR x9 45 | INSTR x10 46 | INSTR x11 47 | INSTR x12 48 | INSTR x13 49 | bne loop 50 | done: 51 | 52 | # pop callee-save
registers from stack 53 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 54 | add sp, sp, #64 55 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 56 | add sp, sp, #64 57 | 58 | ret 59 | 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/SVE/ld1d-z_p_mb-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR ld1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | mov x9, sp 28 | # sp is kept in x9, a caller-saved temporary; the callee-saved x24 used before was overwritten without being preserved 29 | ptrue p0.d 30 | fcpy z0.d, p0/m, #1.00000000 31 | fcpy z1.d, p0/m, #1.00000000 32 | fcpy z2.d, p0/m, #1.00000000 33 | # load addresses after the arithmetic below: x1 = sp+512, x2 = sp+1024, x3 = sp+512, x5 = sp, x6 = sp-512 34 | mov x1, sp 35 | mov x2, sp 36 | mov x3, sp 37 | mov x5, sp 38 | mov x6, sp 39 | add x1, x1, #512 40 | add x2, x1, #512 41 | sub x3, x2, #512 42 | sub x5, x3, #512 43 | sub x6, x5, #512 44 | loop: 45 | subs x4, x4, #1 46 | INSTR {z0.d}, p0/z, [sp] 47 | INSTR {z1.d}, p0/z, [x1] 48 | INSTR {z2.d}, p0/z, [x2] 49 | INSTR {z3.d}, p0/z, [x3] 50 | INSTR {z4.d}, p0/z, [x5] 51 | INSTR {z5.d}, p0/z, [x6] 52 | bne loop 53 | done: 54 | mov sp, x9 55 | # pop callee-save registers from stack 56 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 57 | add sp, sp, #64 58 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 59 | add sp, sp, #64 60 | 61 | ret 62 | 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/SVE/ld1d-zd_p_mb-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR ld1d 2 | #define NINST 6 3 | #define N
x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # load addresses after the arithmetic below: x1 = sp+512, x2 = sp+1024, x3 = sp+512, x5 = sp, x6 = sp-512 33 | mov x1, sp 34 | mov x2, sp 35 | mov x3, sp 36 | mov x5, sp 37 | mov x6, sp 38 | add x1, x1, #512 39 | add x2, x1, #512 40 | sub x3, x2, #512 41 | sub x5, x3, #512 42 | sub x6, x5, #512 43 | loop: 44 | subs x4, x4, #1 45 | INSTR {z0.d}, p0/z, [sp] 46 | INSTR {z1.d}, p0/z, [x1] 47 | INSTR {z2.d}, p0/z, [x2] 48 | INSTR {z3.d}, p0/z, [x3] 49 | INSTR {z4.d}, p0/z, [x5] 50 | INSTR {z5.d}, p0/z, [x6] 51 | bne loop 52 | done: 53 | # pop callee-save registers from stack 54 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 55 | add sp, sp, #64 56 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 57 | add sp, sp, #64 58 | 59 | ret 60 | 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/SVE/ld1d-zd_p_mbi-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR ld1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue
p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # index registers are scaled by 8 (lsl 3): x1 = -64 addresses sp-512 bytes, down to x7 = -384 at sp-3072 bytes 33 | mov x1, #-64 34 | mov x2, #-128 35 | mov x3, #-192 36 | mov x5, #-256 37 | mov x6, #-320 38 | mov x7, #-384 39 | loop: 40 | subs x4, x4, #1 41 | INSTR {z0.d}, p0/z, [sp, x1, lsl 3] 42 | INSTR {z1.d}, p0/z, [sp, x2, lsl 3] 43 | INSTR {z2.d}, p0/z, [sp, x3, lsl 3] 44 | INSTR {z3.d}, p0/z, [sp, x5, lsl 3] 45 | INSTR {z4.d}, p0/z, [sp, x6, lsl 3] 46 | INSTR {z5.d}, p0/z, [sp, x7, lsl 3] 47 | bne loop 48 | done: 49 | # pop callee-save registers from stack 50 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 51 | add sp, sp, #64 52 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 53 | add sp, sp, #64 54 | 55 | ret 56 | 57 | .size latency, .-latency 58 | -------------------------------------------------------------------------------- /src/SVE/st1d-zd_p_mb-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR st1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times and no INSTR reads the result of another; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # scratch slots, one vector length apart and all below sp, so that no store can reach the v8-v15 save area at [sp] 33 | addvl x1, sp, #-1 34 | addvl x2, sp, #-2 35 | addvl x3, sp, #-3 36 | addvl x5, sp, #-4 37 | addvl x6, sp, #-5 38 | addvl x7, sp, #-6 39 | # x7 takes the place of the former [sp] target: storing there 40 | # overwrote the v12-v15 values that the epilogue reloads from [sp]; 41 | # addvl also keeps the slots disjoint for every vector length, 42 | # which the fixed 64-byte spacing used before did not 43 | loop: 44 | subs x4, x4, #1 45 | INSTR {z0.d}, p0, [x7] 46 | INSTR {z1.d}, p0, [x1] 47 | INSTR {z2.d}, p0, [x2] 48 | INSTR {z3.d}, p0, [x3] 49 | INSTR {z4.d}, p0, [x5] 50 | INSTR {z5.d}, p0, [x6]
51 | bne loop 52 | done: 53 | # pop callee-save registers from stack 54 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 55 | add sp, sp, #64 56 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 57 | add sp, sp, #64 58 | 59 | ret 60 | 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/SVE/st1d-zd_p_mb-il_1_1-dup-d_vd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR st1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times, alternating one INSTR with one dup; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # scratch slots, one vector length apart and all below sp, so that no store can reach the v8-v15 save area at [sp] 33 | addvl x1, sp, #-1 34 | addvl x2, sp, #-2 35 | addvl x3, sp, #-3 36 | addvl x5, sp, #-4 37 | addvl x6, sp, #-5 38 | addvl x7, sp, #-6 39 | # x7 takes the place of the former [sp] target: storing there 40 | # overwrote the v12-v15 values that the epilogue reloads from [sp]; 41 | # addvl also keeps the slots disjoint for every vector length, 42 | # which the fixed 64-byte spacing used before did not 43 | loop: 44 | subs x4, x4, #1 45 | INSTR {z0.d}, p0, [x7] 46 | dup d6, v1.d[0] 47 | INSTR {z1.d}, p0, [x1] 48 | dup d7, v1.d[0] 49 | INSTR {z2.d}, p0, [x2] 50 | dup d8, v1.d[0] 51 | INSTR {z3.d}, p0, [x3] 52 | dup d9, v1.d[0] 53 | INSTR {z4.d}, p0, [x5] 54 | dup d10, v1.d[0] 55 | INSTR {z5.d}, p0, [x6] 56 | dup d11, v1.d[0] 57 | bne loop 58 | done: 59 | # pop callee-save registers from stack 60 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 61 | add sp, sp, #64 62 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 63 | add sp, sp, #64 64 | 65 | ret 66 | 67 | .size latency, .-latency 68 | --------------------------------------------------------------------------------
/src/SVE/st1d-zd_p_mb-il_1_1-str-x_mb-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR st1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | # throughput test: the loop body runs N times, alternating one INSTR with one str; N == 0 is not guarded because the counter is decremented before it is tested 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # scratch slots, one vector length apart and all below sp, so that no store can reach the v8-v15 save area at [sp] 33 | addvl x1, sp, #-1 34 | addvl x2, sp, #-2 35 | addvl x3, sp, #-3 36 | addvl x5, sp, #-4 37 | addvl x6, sp, #-5 38 | addvl x13, sp, #-6 39 | # x13 takes the place of the former [sp] target of both st1d and str: 40 | # storing there overwrote the v12-v15 values that the epilogue reloads; 41 | # x7-x12 stay reserved as the str data registers, and addvl keeps the 42 | # slots disjoint for every vector length 43 | loop: 44 | subs x4, x4, #1 45 | INSTR {z0.d}, p0, [x13] 46 | str x7, [x13] 47 | INSTR {z1.d}, p0, [x1] 48 | str x8, [x13] 49 | INSTR {z2.d}, p0, [x2] 50 | str x9, [x13] 51 | INSTR {z3.d}, p0, [x3] 52 | str x10, [x13] 53 | INSTR {z4.d}, p0, [x5] 54 | str x11, [x13] 55 | INSTR {z5.d}, p0, [x6] 56 | str x12, [x13] 57 | bne loop 58 | done: 59 | # pop callee-save registers from stack 60 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 61 | add sp, sp, #64 62 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 63 | add sp, sp, #64 64 | 65 | ret 66 | 67 | .size latency, .-latency 68 | -------------------------------------------------------------------------------- /src/SVE/st1d-zd_p_mb-il_1_2-dup-d_vd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR st1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push
callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d, #1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | # scratch slots, one vector length apart and all below sp, so that no store can reach the v8-v15 save area at [sp] 33 | addvl x1, sp, #-1 34 | addvl x2, sp, #-2 35 | addvl x3, sp, #-3 36 | addvl x5, sp, #-4 37 | addvl x6, sp, #-5 38 | addvl x7, sp, #-6 39 | # x7 takes the place of the former [sp] target: storing there 40 | # overwrote the v12-v15 values that the epilogue reloads from [sp]; 41 | # addvl also keeps the slots disjoint for every vector length, 42 | # which the fixed 64-byte spacing used before did not 43 | loop: 44 | subs x4, x4, #1 45 | INSTR {z0.d}, p0, [x7] 46 | dup d6, v1.d[0] 47 | dup d7, v1.d[0] 48 | INSTR {z1.d}, p0, [x1] 49 | dup d8, v1.d[0] 50 | dup d9, v1.d[0] 51 | INSTR {z2.d}, p0, [x2] 52 | dup d10, v1.d[0] 53 | dup d11, v1.d[0] 54 | INSTR {z3.d}, p0, [x3] 55 | dup d12, v1.d[0] 56 | dup d13, v1.d[0] 57 | INSTR {z4.d}, p0, [x5] 58 | dup d14, v1.d[0] 59 | dup d15, v1.d[0] 60 | INSTR {z5.d}, p0, [x6] 61 | dup d16, v1.d[0] 62 | dup d17, v1.d[0] 63 | bne loop 64 | done: 65 | # pop callee-save registers from stack 66 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 67 | add sp, sp, #64 68 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 69 | add sp, sp, #64 70 | 71 | ret 72 | 73 | .size latency, .-latency 74 | -------------------------------------------------------------------------------- /src/SVE/st1d-zd_p_mbi-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR st1d 2 | #define NINST 6 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push callee-save registers onto stack 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | fmov v0.2d, #1.00000000 25 | fmov v1.2d,
#1.00000000 26 | fmov v2.2d, #1.00000000 27 | 28 | ptrue p0.d 29 | fcpy z0.d, p0/m, #1.00000000 30 | fcpy z1.d, p0/m, #1.00000000 31 | fcpy z2.d, p0/m, #1.00000000 32 | 33 | mov x1, #-64 34 | mov x2, #-128 35 | mov x3, #-192 36 | mov x5, #-256 37 | mov x6, #-320 38 | mov x7, #-384 39 | loop: 40 | subs x4, x4, #1 41 | INSTR {z0.d}, p0, [sp, x1, lsl 3] 42 | INSTR {z1.d}, p0, [sp, x2, lsl 3] 43 | INSTR {z2.d}, p0, [sp, x3, lsl 3] 44 | INSTR {z3.d}, p0, [sp, x5, lsl 3] 45 | INSTR {z4.d}, p0, [sp, x6, lsl 3] 46 | INSTR {z5.d}, p0, [sp, x7, lsl 3] 47 | bne loop 48 | done: 49 | # pop callee-save registers from stack in reverse push order (v12-v15 first, then v8-v11) 50 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 51 | add sp, sp, #64 52 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 53 | add sp, sp, #64 54 | 55 | ret 56 | 57 | .size latency, .-latency 58 | -------------------------------------------------------------------------------- /src/SVE/whilelo-pd_x_x-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR whilelo 2 | #define NINST 12 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push callee-save registers v8-v15 onto stack (two 64-byte blocks) 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | ptrue p1.d 26 | ptrue p2.d 27 | ptrue p3.d 28 | ptrue p4.d 29 | ptrue p5.d 30 | fcpy z0.d, p0/m, #1.00000000 31 | 32 | mov x1, #1 33 | mov x2, #2 34 | mov x3, #3 35 | mov x5, #4 36 | loop: 37 | INSTR p0.d, x1, x2 38 | INSTR p1.d, x1, x2 39 | INSTR p2.d, x1, x2 40 | INSTR p3.d, x1, x2 41 | INSTR p4.d, x1, x2 42 | INSTR p5.d, x1, x2 43 | INSTR p0.d, x1, x2 44 | INSTR p1.d, x1, x2 45 | INSTR p2.d, x1, x2 46 | INSTR p3.d, x1, x2 47 | INSTR p4.d, x1, x2 48 | INSTR p5.d, x1, x2 49 | subs x4, x4, #1 50 | bne loop 51 | done: 52 | 53 |
# pop callee-save registers from stack in reverse push order (v12-v15 first, then v8-v11) 54 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 55 | add sp, sp, #64 56 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 57 | add sp, sp, #64 58 | 59 | ret 60 | 61 | .size latency, .-latency 62 | -------------------------------------------------------------------------------- /src/SVE/whilelo-pd_x_x-il_1_1-mul-x_x_x-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR whilelo 2 | #define NINST 12 3 | #define N x0 4 | 5 | .arch armv8.2-a+sve 6 | .globl ninst 7 | .data 8 | ninst: 9 | .long NINST 10 | .text 11 | .globl latency 12 | .type latency, @function 13 | .align 2 14 | latency: 15 | 16 | # push callee-save registers v8-v15 onto stack (two 64-byte blocks) 17 | sub sp, sp, #64 18 | st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 19 | sub sp, sp, #64 20 | st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 21 | 22 | mov x4, N 23 | 24 | ptrue p0.d 25 | ptrue p1.d 26 | ptrue p2.d 27 | ptrue p3.d 28 | ptrue p4.d 29 | ptrue p5.d 30 | fcpy z0.d, p0/m, #1.00000000 31 | 32 | mov x1, #1 33 | mov x2, #2 34 | mov x3, #3 35 | mov x5, #4 36 | loop: 37 | INSTR p0.d, x1, x1 38 | mul x3, x2, x2 39 | INSTR p1.d, x1, x1 40 | mul x5, x2, x2 41 | INSTR p2.d, x1, x1 42 | mul x6, x2, x2 43 | INSTR p3.d, x1, x1 44 | mul x7, x2, x2 45 | INSTR p4.d, x1, x1 46 | mul x8, x2, x2 47 | INSTR p5.d, x1, x1 48 | mul x9, x2, x2 49 | INSTR p0.d, x1, x1 50 | mul x10, x2, x2 51 | INSTR p1.d, x1, x1 52 | mul x11, x2, x2 53 | INSTR p2.d, x1, x1 54 | mul x12, x2, x2 55 | INSTR p3.d, x1, x1 56 | mul x13, x2, x2 57 | INSTR p4.d, x1, x1 58 | mul x14, x2, x2 59 | INSTR p5.d, x1, x1 60 | mul x15, x2, x2 61 | subs x4, x4, #1 62 | bne loop 63 | done: 64 | 65 | # pop callee-save registers from stack in reverse push order (v12-v15 first, then v8-v11) 66 | ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp] 67 | add sp, sp, #64 68 | ld1 {v8.2d, v9.2d, v10.2d, v11.2d}, [sp] 69 | add sp, sp, #64 70 | 71 | ret 72 | 73 | .size latency, .-latency 74 | --------------------------------------------------------------------------------
/src/VSX/xvadddp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvadddp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | zero: 11 | .double 0.0, 0.0 12 | one: 13 | .double 1.0, 1.0 14 | .text 15 | .abiversion 2 16 | .section ".toc","aw" 17 | .section ".text" 18 | .align 2 19 | .globl latency 20 | .type latency, @function 21 | latency : 22 | 0: addis 2,12,.TOC.-0b@ha 23 | addi 2,2,.TOC.-0b@l 24 | .localentry latency, .-latency 25 | 26 | mtctr N # move to count register 27 | # load DP FP zero 28 | li 10, 0 29 | 30 | addis 9,2,zero@toc@ha 31 | addi 9,9,zero@toc@l 32 | lxvd2x 0, 0, 9 33 | 34 | addis 9,2,one@toc@ha 35 | addi 9,9,one@toc@l 36 | lxvd2x 1, 0, 9 37 | loop: 38 | INSTR 0, 0, 1 39 | INSTR 0, 0, 1 40 | INSTR 0, 0, 1 41 | INSTR 0, 0, 1 42 | INSTR 0, 0, 1 43 | INSTR 0, 0, 1 44 | bdnz loop 45 | xvmovdp 1, 0 46 | blr 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/VSX/xvaddsp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvaddsp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | zero: 11 | .single 0.0, 0.0, 0.0, 0.0 # four SP lanes = the 16 bytes that lxvd2x loads 12 | one: 13 | .single 1.0, 1.0, 1.0, 1.0 # four SP lanes = the 16 bytes that lxvd2x loads 14 | .text 15 | .abiversion 2 16 | .section ".toc","aw" 17 | .section ".text" 18 | .align 2 19 | .globl latency 20 | .type latency, @function 21 | latency : 22 | 0: addis 2,12,.TOC.-0b@ha 23 | addi 2,2,.TOC.-0b@l 24 | .localentry latency, .-latency 25 | 26 | mtctr N # move to count register 27 | # load SP FP zero and one vectors into VSR 0 and VSR 1 28 | li 10, 0 29 | 30 | addis 9,2,zero@toc@ha 31 | addi 9,9,zero@toc@l 32 | lxvd2x 0, 0, 9 33 | 34 | addis 9,2,one@toc@ha 35 | addi 9,9,one@toc@l 36 | lxvd2x 1, 0, 9 37 | loop: 38 | INSTR 0, 0, 1 39 | INSTR 0, 0, 1 40 | INSTR 0, 0, 1 41 | INSTR 0, 0, 1 42 | INSTR 0, 0, 1 43 | INSTR 0, 0, 1 44 | bdnz loop 45 | xvmovdp
1, 0 46 | blr 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/VSX/xvdivdp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvdivdp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | half: 11 | .double 0.5, 0.5 12 | one: 13 | .double 1.0, 1.0 14 | two: 15 | .double 2.0, 2.0 16 | .text 17 | .abiversion 2 18 | .section ".toc","aw" 19 | .section ".text" 20 | .align 2 21 | .globl latency 22 | .type latency, @function 23 | latency : 24 | 0: addis 2,12,.TOC.-0b@ha 25 | addi 2,2,.TOC.-0b@l 26 | .localentry latency, .-latency 27 | 28 | mtctr N # move to count register 29 | li 10, 0 # offset zero 30 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 31 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 32 | lxvd2x 0, 0, 9 33 | addis 9,2,half@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 34 | addi 9,9,half@toc@l # add low 16 bits of the TOC offset 35 | lxvd2x 1, 0, 9 36 | addis 9,2,two@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 37 | addi 9,9,two@toc@l # add low 16 bits of the TOC offset 38 | lxvd2x 2, 0, 9 39 | loop: 40 | INSTR 0, 0, 1 41 | INSTR 0, 0, 2 42 | INSTR 0, 0, 1 43 | INSTR 0, 0, 2 44 | INSTR 0, 0, 1 45 | INSTR 0, 0, 2 46 | bdnz loop 47 | xvmovdp 1, 0 48 | blr 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/VSX/xvdivsp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvdivsp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | half: 11 | .single 0.5, 0.5, 0.5, 0.5 # four SP lanes = the 16 bytes that lxvd2x loads 12 | one: 13 | .single 1.0, 1.0, 1.0, 1.0 # four SP lanes = the 16 bytes that lxvd2x loads 14 | two: 15 | .single 2.0, 2.0, 2.0, 2.0 # four SP lanes = the 16 bytes that lxvd2x loads 16 | .text 17 | .abiversion 2 18 | .section ".toc","aw" 19 | .section ".text" 20 | .align 2 21 | .globl latency 22 | .type latency, @function 23 | latency : 24 | 0: addis 2,12,.TOC.-0b@ha 25 | addi 2,2,.TOC.-0b@l 26 | .localentry
latency, .-latency 27 | 28 | mtctr N # move to count register 29 | li 10, 0 # offset zero 30 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 31 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 32 | lxvd2x 0, 0, 9 33 | addis 9,2,half@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 34 | addi 9,9,half@toc@l # add low 16 bits of the TOC offset 35 | lxvd2x 1, 0, 9 36 | addis 9,2,two@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 37 | addi 9,9,two@toc@l # add low 16 bits of the TOC offset 38 | lxvd2x 2, 0, 9 39 | loop: 40 | INSTR 0, 0, 1 41 | INSTR 0, 0, 2 42 | INSTR 0, 0, 1 43 | INSTR 0, 0, 2 44 | INSTR 0, 0, 1 45 | INSTR 0, 0, 2 46 | bdnz loop 47 | xvmovdp 1, 0 48 | blr 49 | .size latency, .-latency 50 | -------------------------------------------------------------------------------- /src/VSX/xvmaddadp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvmaddadp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | zero: 11 | .double 0.0, 0.0 12 | two: 13 | .double 2.0, 2.0 14 | three: 15 | .double 3.0, 3.0 16 | .text 17 | .abiversion 2 18 | .section ".toc","aw" 19 | .section ".text" 20 | .align 2 21 | .globl latency 22 | .type latency, @function 23 | latency : 24 | 0: addis 2,12,.TOC.-0b@ha 25 | addi 2,2,.TOC.-0b@l 26 | .localentry latency, .-latency 27 | 28 | mtctr N # move to count register 29 | # load DP FP constants: zero into VSR 0, two into VSR 1, three into VSR 2 30 | li 10, 0 31 | 32 | addis 9,2,zero@toc@ha 33 | addi 9,9,zero@toc@l 34 | lxvd2x 0, 0, 9 35 | 36 | addis 9,2,two@toc@ha 37 | addi 9,9,two@toc@l 38 | lxvd2x 1, 0, 9 39 | 40 | addis 9,2,three@toc@ha 41 | addi 9,9,three@toc@l 42 | lxvd2x 2, 0, 9 43 | loop: 44 | INSTR 0, 1, 2 45 | INSTR 0, 1, 2 46 | INSTR 0, 1, 2 47 | INSTR 0, 1, 2 48 | INSTR 0, 1, 2 49 | INSTR 0, 1, 2 50 | bdnz loop 51 | xvmovdp 1, 0 52 | blr 53 | .size latency, .-latency 54 | -------------------------------------------------------------------------------- /src/VSX/xvmuldp.S:
-------------------------------------------------------------------------------- 1 | #define INSTR xvmuldp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | zero: 11 | .double 0.0, 0.0 12 | one: 13 | .double 1.0, 1.0 14 | .text 15 | .abiversion 2 16 | .section ".toc","aw" 17 | .section ".text" 18 | .align 2 19 | .globl latency 20 | .type latency, @function 21 | latency : 22 | 0: addis 2,12,.TOC.-0b@ha 23 | addi 2,2,.TOC.-0b@l 24 | .localentry latency, .-latency 25 | 26 | mtctr N # move to count register 27 | li 10, 0 # offset zero 28 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 29 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 30 | lxvd2x 0, 0, 9 31 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 32 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 33 | lxvd2x 1, 0, 9 34 | loop: 35 | INSTR 0, 0, 1 36 | INSTR 0, 0, 1 37 | INSTR 0, 0, 1 38 | INSTR 0, 0, 1 39 | INSTR 0, 0, 1 40 | INSTR 0, 0, 1 41 | bdnz loop 42 | xvmovdp 1, 0 43 | blr 44 | .size latency, .-latency 45 | -------------------------------------------------------------------------------- /src/VSX/xvmulsp.S: -------------------------------------------------------------------------------- 1 | #define INSTR xvmulsp 2 | #define NINST 6 3 | #define N 3 4 | 5 | .globl ninst 6 | .data 7 | ninst: 8 | .long NINST 9 | .align 16 10 | zero: 11 | .single 0.0, 0.0, 0.0, 0.0 # four SP lanes = the 16 bytes that lxvd2x loads 12 | one: 13 | .single 1.0, 1.0, 1.0, 1.0 # four SP lanes = the 16 bytes that lxvd2x loads 14 | .text 15 | .abiversion 2 16 | .section ".toc","aw" 17 | .section ".text" 18 | .align 2 19 | .globl latency 20 | .type latency, @function 21 | latency : 22 | 0: addis 2,12,.TOC.-0b@ha 23 | addi 2,2,.TOC.-0b@l 24 | .localentry latency, .-latency 25 | 26 | mtctr N # move to count register 27 | li 10, 0 # offset zero 28 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 29 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 30 | lxvd2x 0, 0, 9 31 | addis 9,2,one@toc@ha # TOC base + high-adjusted 16 bits of the TOC offset 32 | addi 9,9,one@toc@l # add low 16 bits of the TOC offset 33 | lxvd2x 1, 0, 9 34 |
loop: 35 | INSTR 0, 0, 1 36 | INSTR 0, 0, 1 37 | INSTR 0, 0, 1 38 | INSTR 0, 0, 1 39 | INSTR 0, 0, 1 40 | INSTR 0, 0, 1 41 | bdnz loop 42 | xvmovdp 1, 0 43 | blr 44 | .size latency, .-latency 45 | -------------------------------------------------------------------------------- /src/scalar/addsd.S: -------------------------------------------------------------------------------- 1 | #define INSTR addsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar 30 | movsd xmm0, xmm0 31 | movsd xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1 36 | INSTR xmm0, xmm1 37 | INSTR xmm0, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1 40 | INSTR xmm0, xmm1 41 | INSTR xmm0, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/scalar/addss.S: -------------------------------------------------------------------------------- 1 | #define INSTR addss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 =
32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar 30 | movss xmm0, xmm0 31 | movss xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm1 36 | INSTR xmm0, xmm1 37 | INSTR xmm0, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm1 40 | INSTR xmm0, xmm1 41 | INSTR xmm0, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/scalar/movsdx2-LAT.S: -------------------------------------------------------------------------------- 1 | #define INSTR mulsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # create DP 2.0 27 | vaddpd xmm1, xmm0, xmm0 28 | # create DP 0.5 29 | vdivpd xmm2, xmm0, xmm1 30 | 31 | # Mark registers as scalar 32 | movsd xmm0, xmm0 33 | movsd xmm1, xmm1 34 | movsd xmm2, xmm2 35 | 36 | sub rsp, 8 37 | 38 | loop: 39 | inc i 40 | movsd xmm0, [rsp] 41 | movsd [rsp], xmm0 42 | movsd xmm0, [rsp] 43 | movsd [rsp], xmm0 44 | movsd xmm0, [rsp] 45 | movsd [rsp], xmm0 46 | cmp i, N 47 | movsd xmm0, [rsp] 48 | movsd [rsp], xmm0 49 | movsd xmm0, [rsp] 50 | movsd [rsp], xmm0 51 | movsd xmm0, [rsp] 52 | movsd [rsp], xmm0 53 | jl loop 54 | done: 55 | 56 | add rsp, 8 57 | 58 | mov rsp, rbp 59 | pop rbp 60 | ret 61 | .size latency, .-latency 62 |
-------------------------------------------------------------------------------- /src/scalar/mulsd.S: -------------------------------------------------------------------------------- 1 | #define INSTR mulsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # create DP 2.0 27 | vaddpd xmm1, xmm0, xmm0 28 | # create DP 0.5 29 | vdivpd xmm2, xmm0, xmm1 30 | 31 | # Mark registers as scalar 32 | movsd xmm0, xmm0 33 | movsd xmm1, xmm1 34 | movsd xmm2, xmm2 35 | 36 | loop: 37 | inc i 38 | INSTR xmm0, xmm1 39 | INSTR xmm0, xmm2 40 | INSTR xmm0, xmm1 41 | cmp i, N 42 | INSTR xmm0, xmm2 43 | INSTR xmm0, xmm1 44 | INSTR xmm0, xmm2 45 | jl loop 46 | done: 47 | mov rsp, rbp 48 | pop rbp 49 | ret 50 | .size latency, .-latency 51 | -------------------------------------------------------------------------------- /src/scalar/mulss.S: -------------------------------------------------------------------------------- 1 | #define INSTR mulss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | #
create SP 2.0 27 | vaddps xmm1, xmm0, xmm0 28 | # create SP 0.5 29 | vdivps xmm2, xmm0, xmm1 30 | 31 | # Mark registers as scalar 32 | movss xmm0, xmm0 33 | movss xmm1, xmm1 34 | movss xmm2, xmm2 35 | 36 | loop: 37 | inc i 38 | INSTR xmm0, xmm1 39 | INSTR xmm0, xmm2 40 | INSTR xmm0, xmm1 41 | cmp i, N 42 | INSTR xmm0, xmm2 43 | INSTR xmm0, xmm1 44 | INSTR xmm0, xmm2 45 | jl loop 46 | done: 47 | mov rsp, rbp 48 | pop rbp 49 | ret 50 | .size latency, .-latency 51 | -------------------------------------------------------------------------------- /src/scalar/rcpss-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR rcpss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | 26 | vaddss xmm1, xmm0, xmm0 # create 2.0 27 | vaddss xmm2, xmm0, xmm1 # create 3.0 28 | vaddss xmm4, xmm1, xmm1 # create 4.0 29 | vaddss xmm4, xmm4, xmm4 # create 8.0 30 | vaddss xmm4, xmm4, xmm4 # create 16.0 31 | vaddss xmm4, xmm4, xmm4 # create 32.0 32 | vaddss xmm4, xmm4, xmm4 # create 64.0 33 | vaddss xmm4, xmm4, xmm4 # create 128.0 34 | vaddss xmm4, xmm4, xmm4 # create 256.0 35 | vaddss xmm4, xmm4, xmm4 # create 512.0 36 | vaddss xmm4, xmm4, xmm4 # create 1024.0 37 | vdivss xmm1, xmm4, xmm2 # create 341.3333 38 | vdivss xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddss xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar; xmm1-xmm5 receive a copy of xmm0's low lane, replacing the values built above 42 | movss xmm0, xmm0 43 | movss xmm1, xmm0 44 | movss xmm2, xmm0 45 | movss xmm3, xmm0 46 | movss xmm4, xmm0 47 |
movss xmm5, xmm0 48 | 49 | loop: 50 | inc i 51 | INSTR xmm10, xmm0 52 | INSTR xmm11, xmm1 53 | INSTR xmm12, xmm2 54 | cmp i, N 55 | INSTR xmm13, xmm3 56 | INSTR xmm14, xmm4 57 | INSTR xmm15, xmm5 58 | jl loop 59 | done: 60 | mov rsp, rbp 61 | pop rbp 62 | ret 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/scalar/rcpss.S: -------------------------------------------------------------------------------- 1 | #define INSTR rcpss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | 26 | vaddps xmm1, xmm0, xmm0 # create 2.0 27 | vaddps xmm2, xmm0, xmm1 # create 3.0 28 | vaddps xmm4, xmm1, xmm1 # create 4.0 29 | vaddps xmm4, xmm4, xmm4 # create 8.0 30 | vaddps xmm4, xmm4, xmm4 # create 16.0 31 | vaddps xmm4, xmm4, xmm4 # create 32.0 32 | vaddps xmm4, xmm4, xmm4 # create 64.0 33 | vaddps xmm4, xmm4, xmm4 # create 128.0 34 | vaddps xmm4, xmm4, xmm4 # create 256.0 35 | vaddps xmm4, xmm4, xmm4 # create 512.0 36 | vaddps xmm4, xmm4, xmm4 # create 1024.0 37 | vdivps xmm1, xmm4, xmm2 # create 341.3333 38 | vdivps xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddps xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar 42 | movss xmm0, xmm0 43 | movss xmm1, xmm1 44 | movss xmm2, xmm2 45 | movss xmm3, xmm3 46 | movss xmm4, xmm4 47 | movss xmm5, xmm5 48 | 49 | loop: 50 | inc i 51 | INSTR xmm1, xmm0 52 | INSTR xmm2, xmm1 53 | INSTR xmm3, xmm2 54 | cmp i, N 55 | INSTR xmm4, xmm3 56 | INSTR xmm5, xmm4 57 |
INSTR xmm0, xmm5 58 | jl loop 59 | done: 60 | mov rsp, rbp 61 | pop rbp 62 | ret 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/scalar/sqrtsd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR sqrtsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create DP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | 26 | vaddsd xmm1, xmm0, xmm0 # create 2.0 27 | vaddsd xmm2, xmm0, xmm1 # create 3.0 28 | vaddsd xmm4, xmm1, xmm1 # create 4.0 29 | vaddsd xmm4, xmm4, xmm4 # create 8.0 30 | vaddsd xmm4, xmm4, xmm4 # create 16.0 31 | vaddsd xmm4, xmm4, xmm4 # create 32.0 32 | vaddsd xmm4, xmm4, xmm4 # create 64.0 33 | vaddsd xmm4, xmm4, xmm4 # create 128.0 34 | vaddsd xmm4, xmm4, xmm4 # create 256.0 35 | vaddsd xmm4, xmm4, xmm4 # create 512.0 36 | vaddsd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivsd xmm1, xmm4, xmm2 # create 341.3333 38 | vdivsd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddsd xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar; xmm1-xmm5 receive a copy of xmm0's low lane, replacing the values built above 42 | movsd xmm0, xmm0 43 | movsd xmm1, xmm0 44 | movsd xmm2, xmm0 45 | movsd xmm3, xmm0 46 | movsd xmm4, xmm0 47 | movsd xmm5, xmm0 48 | 49 | loop: 50 | inc i 51 | INSTR xmm10, xmm0 52 | INSTR xmm11, xmm1 53 | INSTR xmm12, xmm2 54 | cmp i, N 55 | INSTR xmm13, xmm3 56 | INSTR xmm14, xmm4 57 | INSTR xmm15, xmm5 58 | jl loop 59 | done: 60 | mov rsp, rbp 61 | pop rbp 62 | ret 63 | .size latency, .-latency 64 |
-------------------------------------------------------------------------------- /src/scalar/sqrtss-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR sqrtss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 25 | 26 | vaddss xmm1, xmm0, xmm0 # create 2.0 27 | vaddss xmm2, xmm0, xmm1 # create 3.0 28 | vaddss xmm4, xmm1, xmm1 # create 4.0 29 | vaddss xmm4, xmm4, xmm4 # create 8.0 30 | vaddss xmm4, xmm4, xmm4 # create 16.0 31 | vaddss xmm4, xmm4, xmm4 # create 32.0 32 | vaddss xmm4, xmm4, xmm4 # create 64.0 33 | vaddss xmm4, xmm4, xmm4 # create 128.0 34 | vaddss xmm4, xmm4, xmm4 # create 256.0 35 | vaddss xmm4, xmm4, xmm4 # create 512.0 36 | vaddss xmm4, xmm4, xmm4 # create 1024.0 37 | vdivss xmm1, xmm4, xmm2 # create 341.3333 38 | vdivss xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddss xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar; xmm1-xmm5 receive a copy of xmm0's low lane, replacing the values built above 42 | movss xmm0, xmm0 43 | movss xmm1, xmm0 44 | movss xmm2, xmm0 45 | movss xmm3, xmm0 46 | movss xmm4, xmm0 47 | movss xmm5, xmm0 48 | 49 | loop: 50 | inc i 51 | INSTR xmm10, xmm0 52 | INSTR xmm11, xmm1 53 | INSTR xmm12, xmm2 54 | cmp i, N 55 | INSTR xmm13, xmm3 56 | INSTR xmm14, xmm4 57 | INSTR xmm15, xmm5 58 | jl loop 59 | done: 60 | mov rsp, rbp 61 | pop rbp 62 | ret 63 | .size latency, .-latency 64 | -------------------------------------------------------------------------------- /src/scalar/vaddsd-TP.S:
-------------------------------------------------------------------------------- 1 | #define INSTR vaddsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar; xmm1-xmm8 receive a copy of xmm0's low lane 30 | movsd xmm0, xmm0 31 | movsd xmm1, xmm0 32 | movsd xmm2, xmm0 33 | movsd xmm3, xmm0 34 | movsd xmm4, xmm0 35 | movsd xmm5, xmm0 36 | movsd xmm6, xmm0 37 | movsd xmm7, xmm0 38 | movsd xmm8, xmm0 39 | 40 | loop: 41 | inc i 42 | INSTR xmm3, xmm0, xmm1 43 | INSTR xmm4, xmm1, xmm0 44 | INSTR xmm5, xmm0, xmm2 45 | cmp i, N 46 | INSTR xmm6, xmm2, xmm0 47 | INSTR xmm7, xmm1, xmm2 48 | INSTR xmm8, xmm2, xmm1 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/scalar/vaddsd.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create DP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 25 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is
zero 26 | # copy DP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar 30 | movsd xmm0, xmm0 31 | movsd xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm0, xmm1 36 | INSTR xmm0, xmm0, xmm1 37 | INSTR xmm0, xmm0, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm0, xmm1 40 | INSTR xmm0, xmm0, xmm1 41 | INSTR xmm0, xmm0, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/scalar/vaddss-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading exponent bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar; xmm1-xmm8 receive a copy of xmm0's low lane 30 | movss xmm0, xmm0 31 | movss xmm1, xmm0 32 | movss xmm2, xmm0 33 | movss xmm3, xmm0 34 | movss xmm4, xmm0 35 | movss xmm5, xmm0 36 | movss xmm6, xmm0 37 | movss xmm7, xmm0 38 | movss xmm8, xmm0 39 | 40 | loop: 41 | inc i 42 | INSTR xmm3, xmm0, xmm1 43 | INSTR xmm4, xmm1, xmm0 44 | INSTR xmm5, xmm0, xmm2 45 | cmp i, N 46 | INSTR xmm6, xmm2, xmm0 47 | INSTR xmm7, xmm1, xmm2 48 | INSTR xmm8, xmm2, xmm1 49 | jl loop 50 | done: 51 | mov rsp, rbp 52 | pop rbp 53 | ret 54 | .size latency, .-latency 55 | -------------------------------------------------------------------------------- /src/scalar/vaddss.S: -------------------------------------------------------------------------------- 1 | #define INSTR vaddss 2 |
#define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # copy SP 1.0 27 | vmovaps xmm1, xmm0 28 | 29 | # Mark registers as scalar 30 | movss xmm0, xmm0 31 | movss xmm1, xmm1 32 | 33 | loop: 34 | inc i 35 | INSTR xmm0, xmm0, xmm1 36 | INSTR xmm0, xmm0, xmm1 37 | INSTR xmm0, xmm0, xmm1 38 | cmp i, N 39 | INSTR xmm0, xmm0, xmm1 40 | INSTR xmm0, xmm0, xmm1 41 | INSTR xmm0, xmm0, xmm1 42 | jl loop 43 | done: 44 | mov rsp, rbp 45 | pop rbp 46 | ret 47 | .size latency, .-latency 48 | -------------------------------------------------------------------------------- /src/scalar/vdivsd.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54 = 64 - (11 - 1)) 24 | vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddsd xmm1, xmm0, xmm0 # create 2.0 27 | vaddsd xmm2, xmm0, xmm1 # create 3.0 28 | vaddsd xmm4, xmm1, xmm1 # create 4.0 29 | vaddsd xmm4, xmm4, xmm4 # create 8.0 30 | vaddsd xmm4, xmm4, xmm4 # create 16.0 31 | vaddsd xmm4, xmm4, xmm4 # create 32.0 32 | 
vaddsd xmm4, xmm4, xmm4 # create 64.0 33 | vaddsd xmm4, xmm4, xmm4 # create 128.0 34 | vaddsd xmm4, xmm4, xmm4 # create 256.0 35 | vaddsd xmm4, xmm4, xmm4 # create 512.0 36 | vaddsd xmm4, xmm4, xmm4 # create 1024.0 37 | vdivsd xmm1, xmm4, xmm2 # create 341.3333 38 | vdivsd xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddsd xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar 42 | movsd xmm0, xmm0 43 | movsd xmm1, xmm1 44 | movsd xmm2, xmm2 45 | 46 | loop: 47 | inc i 48 | INSTR xmm0, xmm0, xmm1 49 | INSTR xmm0, xmm0, xmm2 50 | INSTR xmm0, xmm0, xmm1 51 | cmp i, N 52 | INSTR xmm0, xmm0, xmm2 53 | INSTR xmm0, xmm0, xmm1 54 | INSTR xmm0, xmm0, xmm2 55 | jl loop 56 | done: 57 | mov rsp, rbp 58 | pop rbp 59 | ret 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/scalar/vdivss.S: -------------------------------------------------------------------------------- 1 | #define INSTR vdivss 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | .intel_syntax noprefix 7 | .globl ninst 8 | .data 9 | ninst: 10 | .long NINST 11 | .text 12 | .globl latency 13 | .type latency, @function 14 | .align 32 15 | latency: 16 | push rbp 17 | mov rbp, rsp 18 | xor i, i 19 | test N, N 20 | jle done 21 | # create SP 1.0 22 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 23 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 24 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 25 | 26 | vaddss xmm1, xmm0, xmm0 # create 2.0 27 | vaddss xmm2, xmm0, xmm1 # create 3.0 28 | vaddss xmm4, xmm1, xmm1 # create 4.0 29 | vaddss xmm4, xmm4, xmm4 # create 8.0 30 | vaddss xmm4, xmm4, xmm4 # create 16.0 31 | vaddss xmm4, xmm4, xmm4 # create 32.0 32 | vaddss xmm4, xmm4, xmm4 # create 64.0 33 | vaddss xmm4, xmm4, xmm4 # create 128.0 34 | vaddss xmm4, xmm4, xmm4 # create 256.0 35 | vaddss xmm4, xmm4, xmm4 # create 512.0 36 | vaddss xmm4, xmm4, xmm4 # create 
1024.0 37 | vdivss xmm1, xmm4, xmm2 # create 341.3333 38 | vdivss xmm2, xmm0, xmm1 # create 1/341.3333 39 | vaddss xmm0, xmm1, xmm1 # create 2*341.3333 40 | 41 | # Mark registers as scalar 42 | movss xmm0, xmm0 43 | movss xmm1, xmm1 44 | movss xmm2, xmm2 45 | 46 | loop: 47 | inc i 48 | INSTR xmm0, xmm0, xmm1 49 | INSTR xmm0, xmm0, xmm2 50 | INSTR xmm0, xmm0, xmm1 51 | cmp i, N 52 | INSTR xmm0, xmm0, xmm2 53 | INSTR xmm0, xmm0, xmm1 54 | INSTR xmm0, xmm0, xmm2 55 | jl loop 56 | done: 57 | mov rsp, rbp 58 | pop rbp 59 | ret 60 | .size latency, .-latency 61 | -------------------------------------------------------------------------------- /src/scalar/vmulsd-TP.S: -------------------------------------------------------------------------------- 1 | #define INSTR vmulsd 2 | #define NINST 6 3 | #define N edi 4 | #define i r8d 5 | 6 | 7 | .intel_syntax noprefix 8 | .globl ninst 9 | .data 10 | ninst: 11 | .long NINST 12 | .text 13 | .globl latency 14 | .type latency, @function 15 | .align 32 16 | latency: 17 | push rbp 18 | mov rbp, rsp 19 | xor i, i 20 | test N, N 21 | jle done 22 | # create SP 1.0 23 | vpcmpeqw xmm0, xmm0, xmm0 # all ones 24 | vpslld xmm0, xmm0, 25 # logical left shift: 11111110..0 (25 = 32 - (8 - 1)) 25 | vpsrld xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero 26 | # create SP 2.0 27 | vaddps xmm1, xmm0, xmm0 28 | # create SP 0.5 29 | vdivps xmm2, xmm0, xmm1 30 | 31 | # Mark registers as scalar 32 | movsd xmm0, xmm0 33 | movsd xmm1, xmm1 34 | movsd xmm2, xmm2 35 | movsd xmm3, xmm3 36 | movsd xmm4, xmm4 37 | movsd xmm5, xmm5 38 | movsd xmm6, xmm6 39 | movsd xmm7, xmm7 40 | movsd xmm8, xmm8 41 | 42 | loop: 43 | inc i 44 | INSTR xmm3, xmm0, xmm1 45 | INSTR xmm4, xmm1, xmm0 46 | INSTR xmm5, xmm0, xmm2 47 | cmp i, N 48 | INSTR xmm6, xmm2, xmm0 49 | INSTR xmm7, xmm1, xmm2 50 | INSTR xmm8, xmm2, xmm1 51 | jl loop 52 | done: 53 | mov rsp, rbp 54 | pop rbp 55 | ret 56 | .size latency, .-latency 57 | 
--------------------------------------------------------------------------------
/src/scalar/vmulsd.S:
--------------------------------------------------------------------------------
# Latency kernel for the scalar double-precision multiply (INSTR = vmulsd).
# Every copy of INSTR reads and writes xmm0, forming one serial chain.
# The multiplier alternates between xmm1 (2.0) and xmm2 (0.5), so xmm0
# moves back and forth between 1.0 and 2.0 and never drifts.
#
# Exported symbols:
#   ninst   - .long holding NINST, the number of INSTR copies per iteration
#   latency - entry point; iteration count arrives in edi, counter is r8d
#define INSTR vmulsd
#define NINST 6
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done                  # skip the loop when the count is not positive
        # create DP 1.0 in each 64-bit lane of xmm0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpsllq    xmm0, xmm0, 54        # logical left shift: keeps 10 leading ones (54 = 64 - (11 - 1))
        vpsrlq    xmm0, xmm0, 2         # logical right shift: sign bit and top exponent bit become zero -> 0x3FF0000000000000
        # create DP 2.0
        vaddpd    xmm1, xmm0, xmm0
        # create DP 0.5
        vdivpd    xmm2, xmm0, xmm1

        # Mark registers as scalar
        movsd     xmm0, xmm0
        movsd     xmm1, xmm1
        movsd     xmm2, xmm2

loop:
        inc       i
        INSTR     xmm0, xmm0, xmm1
        INSTR     xmm0, xmm0, xmm2
        INSTR     xmm0, xmm0, xmm1
        cmp       i, N                  # vector arithmetic leaves EFLAGS alone, so jl below still sees this compare
        INSTR     xmm0, xmm0, xmm2
        INSTR     xmm0, xmm0, xmm1
        INSTR     xmm0, xmm0, xmm2
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency

--------------------------------------------------------------------------------
/src/scalar/vmulss-TP.S:
--------------------------------------------------------------------------------
# Throughput kernel for the scalar single-precision multiply (INSTR = vmulss).
# Six copies of INSTR per iteration write xmm3..xmm8 and only read
# xmm0..xmm2 (1.0, 2.0 and 0.5), so no copy consumes the result of
# another copy.
#
# Exported symbols:
#   ninst   - .long holding NINST, the number of INSTR copies per iteration
#   latency - entry point; iteration count arrives in edi, counter is r8d
# NOTE(review): the entry point is called latency in this throughput variant
# as well; presumably the harness resolves one fixed symbol name - confirm.
#define INSTR vmulss
#define NINST 6
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done                  # skip the loop when the count is not positive
        # create SP 1.0 in each 32-bit lane of xmm0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpslld    xmm0, xmm0, 25        # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld    xmm0, xmm0, 2         # logical right shift: sign bit and top exponent bit become zero -> 0x3F800000
        # create SP 2.0
        vaddps    xmm1, xmm0, xmm0
        # create SP 0.5
        vdivps    xmm2, xmm0, xmm1

        # Mark registers as scalar
        # (single-precision moves, matching the element width of INSTR)
        movss     xmm0, xmm0
        movss     xmm1, xmm1
        movss     xmm2, xmm2
        movss     xmm3, xmm3
        movss     xmm4, xmm4
        movss     xmm5, xmm5
        movss     xmm6, xmm6
        movss     xmm7, xmm7
        movss     xmm8, xmm8

loop:
        inc       i
        INSTR     xmm3, xmm0, xmm1
        INSTR     xmm4, xmm1, xmm0
        INSTR     xmm5, xmm0, xmm2
        cmp       i, N                  # vector arithmetic leaves EFLAGS alone, so jl below still sees this compare
        INSTR     xmm6, xmm2, xmm0
        INSTR     xmm7, xmm1, xmm2
        INSTR     xmm8, xmm2, xmm1
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency

--------------------------------------------------------------------------------
/src/scalar/vmulss.S:
--------------------------------------------------------------------------------
# Latency kernel for the scalar single-precision multiply (INSTR = vmulss).
# Every copy of INSTR reads and writes xmm0, forming one serial chain.
# The multiplier alternates between xmm1 (2.0) and xmm2 (0.5), so xmm0
# moves back and forth between 1.0 and 2.0 and never drifts.
#
# Exported symbols:
#   ninst   - .long holding NINST, the number of INSTR copies per iteration
#   latency - entry point; iteration count arrives in edi, counter is r8d
#define INSTR vmulss
#define NINST 6
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        test      N, N
        jle       done                  # skip the loop when the count is not positive
        # create SP 1.0 in each 32-bit lane of xmm0
        vpcmpeqw  xmm0, xmm0, xmm0      # all ones
        vpslld    xmm0, xmm0, 25        # logical left shift: 11111110..0 (25 = 32 - (8 - 1))
        vpsrld    xmm0, xmm0, 2         # logical right shift: sign bit and top exponent bit become zero -> 0x3F800000
        # create SP 2.0
        vaddps    xmm1, xmm0, xmm0
        # create SP 0.5
        vdivps    xmm2, xmm0, xmm1

        # Mark registers as scalar
        movss     xmm0, xmm0
        movss     xmm1, xmm1
        movss     xmm2, xmm2

loop:
        inc       i
        INSTR     xmm0, xmm0, xmm1
        INSTR     xmm0, xmm0, xmm2
        INSTR     xmm0, xmm0, xmm1
        cmp       i, N                  # vector arithmetic leaves EFLAGS alone, so jl below still sees this compare
        INSTR     xmm0, xmm0, xmm2
        INSTR     xmm0, xmm0, xmm1
        INSTR     xmm0, xmm0, xmm2
        jl        loop
done:
        mov       rsp, rbp
        pop       rbp
        ret
.size latency, .-latency
--------------------------------------------------------------------------------