├── .clang-format ├── .github └── workflows │ └── main.yml ├── .gitignore ├── CMakeLists.txt ├── CONTRIBUTING.md ├── CONTRIBUTORS.md ├── COPYING ├── README.md ├── chap15 ├── ex1 │ ├── CMakeLists.txt │ ├── ex1_bench.cpp │ ├── ex1_test.cpp │ ├── transform_avx.c │ ├── transform_avx.h │ ├── transform_sse.c │ └── transform_sse.h ├── ex10 │ ├── CMakeLists.txt │ ├── ex10_bench.cpp │ ├── ex10_test.cpp │ ├── saxpy32.asm │ ├── saxpy32.c │ ├── saxpy32.h │ └── saxpy32.s ├── ex12 │ ├── CMakeLists.txt │ ├── ex12_bench.cpp │ ├── ex12_test.cpp │ ├── saxpy16.asm │ ├── saxpy16.c │ ├── saxpy16.h │ ├── saxpy16.s │ ├── saxpy32.asm │ ├── saxpy32.c │ ├── saxpy32.h │ └── saxpy32.s ├── ex14 │ ├── CMakeLists.txt │ ├── cond_scalar.asm │ ├── cond_scalar.c │ ├── cond_scalar.h │ ├── cond_scalar.s │ ├── cond_vmaskmov.asm │ ├── cond_vmaskmov.c │ ├── cond_vmaskmov.h │ ├── cond_vmaskmov.s │ ├── ex14_bench.cpp │ └── ex14_test.cpp ├── ex16 │ ├── CMakeLists.txt │ ├── ex16_bench.cpp │ ├── ex16_test.cpp │ ├── three_tap_sse.asm │ ├── three_tap_sse.c │ ├── three_tap_sse.h │ └── three_tap_sse.s ├── ex17 │ ├── CMakeLists.txt │ ├── ex17_bench.cpp │ ├── ex17_test.cpp │ ├── three_tap_avx.asm │ ├── three_tap_avx.c │ ├── three_tap_avx.h │ └── three_tap_avx.s ├── ex18 │ ├── CMakeLists.txt │ ├── ex18_bench.cpp │ ├── ex18_test.cpp │ ├── three_tap_mixed_avx.asm │ ├── three_tap_mixed_avx.c │ ├── three_tap_mixed_avx.h │ └── three_tap_mixed_avx.s ├── ex19 │ ├── CMakeLists.txt │ ├── ex19_bench.cpp │ ├── ex19_test.cpp │ ├── vblendps_transpose.asm │ ├── vblendps_transpose.c │ ├── vblendps_transpose.h │ ├── vblendps_transpose.s │ ├── vshufps_transpose.asm │ ├── vshufps_transpose.c │ ├── vshufps_transpose.h │ └── vshufps_transpose.s ├── ex2 │ ├── CMakeLists.txt │ ├── ex2_bench.cpp │ ├── ex2_test.cpp │ ├── transform_avx.asm │ ├── transform_avx.c │ ├── transform_avx.h │ ├── transform_avx.s │ ├── transform_sse.asm │ ├── transform_sse.c │ ├── transform_sse.h │ └── transform_sse.s ├── ex20 │ ├── CMakeLists.txt │ ├── 
ex20_bench.cpp │ ├── ex20_test.cpp │ ├── vinsertps_transpose.asm │ ├── vinsertps_transpose.c │ ├── vinsertps_transpose.h │ └── vinsertps_transpose.s ├── ex21 │ ├── CMakeLists.txt │ ├── complex_num.h │ ├── ex21_bench.cpp │ ├── ex21_test.cpp │ ├── mul_cpx_mem.asm │ ├── mul_cpx_mem.c │ ├── mul_cpx_mem.h │ ├── mul_cpx_mem.s │ ├── mul_cpx_reg.asm │ ├── mul_cpx_reg.c │ ├── mul_cpx_reg.h │ └── mul_cpx_reg.s ├── ex22 │ ├── CMakeLists.txt │ ├── divps_sse.asm │ ├── divps_sse.c │ ├── divps_sse.h │ ├── divps_sse.s │ ├── ex22_bench.cpp │ ├── ex22_test.cpp │ ├── vdivps_avx.asm │ ├── vdivps_avx.c │ ├── vdivps_avx.h │ └── vdivps_avx.s ├── ex23 │ ├── CMakeLists.txt │ ├── ex23_bench.cpp │ ├── ex23_test.cpp │ ├── rcpps_sse.asm │ ├── rcpps_sse.c │ ├── rcpps_sse.h │ ├── rcpps_sse.s │ ├── vrcpps_avx.asm │ ├── vrcpps_avx.c │ ├── vrcpps_avx.h │ └── vrcpps_avx.s ├── ex24 │ ├── CMakeLists.txt │ ├── ex24_bench.cpp │ ├── ex24_test.cpp │ ├── rcpps_mul_sse.asm │ ├── rcpps_mul_sse.c │ ├── rcpps_mul_sse.h │ ├── rcpps_mul_sse.s │ ├── vrcpps_mul_avx.asm │ ├── vrcpps_mul_avx.c │ ├── vrcpps_mul_avx.h │ └── vrcpps_mul_avx.s ├── ex25 │ ├── CMakeLists.txt │ ├── ex25_bench.cpp │ ├── ex25_test.cpp │ ├── sqrtps_divps_sse.asm │ ├── sqrtps_divps_sse.c │ ├── sqrtps_divps_sse.h │ ├── sqrtps_divps_sse.s │ ├── vsqrtps_vdivps_avx.asm │ ├── vsqrtps_vdivps_avx.c │ ├── vsqrtps_vdivps_avx.h │ └── vsqrtps_vdivps_avx.s ├── ex26 │ ├── CMakeLists.txt │ ├── ex26_bench.cpp │ ├── ex26_test.cpp │ ├── rsqrtps_sse.asm │ ├── rsqrtps_sse.c │ ├── rsqrtps_sse.h │ ├── rsqrtps_sse.s │ ├── vrsqrtps_avx.asm │ ├── vrsqrtps_avx.c │ ├── vrsqrtps_avx.h │ └── vrsqrtps_avx.s ├── ex27 │ ├── CMakeLists.txt │ ├── ex27_bench.cpp │ ├── ex27_test.cpp │ ├── rsqrtps_newt_sse.asm │ ├── rsqrtps_newt_sse.c │ ├── rsqrtps_newt_sse.h │ ├── rsqrtps_newt_sse.s │ ├── vrsqrtps_newt_avx.asm │ ├── vrsqrtps_newt_avx.c │ ├── vrsqrtps_newt_avx.h │ └── vrsqrtps_newt_avx.s ├── ex28 │ ├── CMakeLists.txt │ ├── ex28_bench.cpp │ ├── ex28_test.cpp │ ├── sqrtps_sse.asm │ 
├── sqrtps_sse.c │ ├── sqrtps_sse.h │ ├── sqrtps_sse.s │ ├── vsqrtps_avx.asm │ ├── vsqrtps_avx.c │ ├── vsqrtps_avx.h │ └── vsqrtps_avx.s ├── ex29 │ ├── CMakeLists.txt │ ├── ex29_bench.cpp │ ├── ex29_test.cpp │ ├── sqrt_rsqrtps_sse.asm │ ├── sqrt_rsqrtps_sse.c │ ├── sqrt_rsqrtps_sse.h │ ├── sqrt_rsqrtps_sse.s │ ├── sqrt_vrsqrtps_avx.asm │ ├── sqrt_vrsqrtps_avx.c │ ├── sqrt_vrsqrtps_avx.h │ └── sqrt_vrsqrtps_avx.s ├── ex3 │ ├── CMakeLists.txt │ ├── ex3_bench.cpp │ ├── ex3_test.cpp │ ├── poly_avx_128.asm │ ├── poly_avx_128.c │ ├── poly_avx_128.h │ ├── poly_avx_128.s │ ├── poly_avx_256.asm │ ├── poly_avx_256.c │ ├── poly_avx_256.h │ ├── poly_avx_256.s │ ├── poly_sse.asm │ ├── poly_sse.c │ ├── poly_sse.h │ └── poly_sse.s ├── ex30 │ ├── CMakeLists.txt │ ├── ex30_bench.cpp │ ├── ex30_test.cpp │ ├── sqrt_rsqrtps_taylor_sse.asm │ ├── sqrt_rsqrtps_taylor_sse.c │ ├── sqrt_rsqrtps_taylor_sse.h │ ├── sqrt_rsqrtps_taylor_sse.s │ ├── sqrt_vrsqrtps_taylor_avx.asm │ ├── sqrt_vrsqrtps_taylor_avx.c │ ├── sqrt_vrsqrtps_taylor_avx.h │ └── sqrt_vrsqrtps_taylor_avx.s ├── ex31 │ ├── CMakeLists.txt │ ├── ex31_bench.cpp │ ├── ex31_test.cpp │ ├── subsum_avx.asm │ ├── subsum_avx.c │ ├── subsum_avx.h │ ├── subsum_avx.s │ ├── subsum_sse.asm │ ├── subsum_sse.c │ ├── subsum_sse.h │ └── subsum_sse.s ├── ex34 │ ├── CMakeLists.txt │ ├── ex34_bench.cpp │ ├── ex34_test.cpp │ ├── halfp.asm │ ├── halfp.c │ ├── halfp.h │ ├── halfp.s │ ├── singlep.asm │ ├── singlep.c │ ├── singlep.h │ └── singlep.s ├── ex35 │ ├── CMakeLists.txt │ ├── ex35_bench.cpp │ ├── ex35_test.cpp │ ├── fp_fma.asm │ ├── fp_fma.c │ ├── fp_fma.h │ ├── fp_fma.s │ ├── fp_mul_add.asm │ ├── fp_mul_add.c │ ├── fp_mul_add.h │ └── fp_mul_add.s ├── ex36 │ ├── CMakeLists.txt │ ├── ex36_bench.cpp │ ├── ex36_test.cpp │ ├── no_unroll_reduce.asm │ ├── no_unroll_reduce.c │ ├── no_unroll_reduce.h │ ├── no_unroll_reduce.s │ ├── unroll_reduce.asm │ ├── unroll_reduce.c │ ├── unroll_reduce.h │ └── unroll_reduce.s ├── ex38 │ └── lkt_intra_block.h ├── ex39 
│ ├── CMakeLists.txt │ ├── ex39_test.cpp │ ├── klt_256.c │ └── klt_256.h ├── ex40 │ └── parmod10.h ├── ex41 │ ├── CMakeLists.txt │ ├── ex41_bench.cpp │ ├── ex41_test.cpp │ ├── i64toa_avx2.c │ ├── i64toa_avx2.h │ └── ubsavx2.h ├── ex42 │ └── avx2i_q2a_u63b.h ├── ex45 │ ├── CMakeLists.txt │ ├── ex45_test.cpp │ ├── vpgatherd_soft.asm │ ├── vpgatherd_soft.c │ ├── vpgatherd_soft.h │ └── vpgatherd_soft.s ├── ex46 │ ├── CMakeLists.txt │ ├── avx2_vpgatherd.asm │ ├── avx2_vpgatherd.c │ ├── avx2_vpgatherd.h │ ├── avx2_vpgatherd.s │ ├── avx_vinsrt.asm │ ├── avx_vinsrt.c │ ├── avx_vinsrt.h │ ├── avx_vinsrt.s │ ├── complex_num.h │ ├── ex46_bench.cpp │ ├── ex46_test.cpp │ ├── scalar.asm │ ├── scalar.c │ ├── scalar.h │ └── scalar.s ├── ex47 │ ├── CMakeLists.txt │ ├── avx2_gatherpd.asm │ ├── avx2_gatherpd.c │ ├── avx2_gatherpd.h │ ├── avx2_gatherpd.s │ ├── avx_vinsert.asm │ ├── avx_vinsert.c │ ├── avx_vinsert.h │ ├── avx_vinsert.s │ ├── complex_num.h │ ├── ex47_bench.cpp │ └── ex47_test.cpp ├── ex48 │ ├── CMakeLists.txt │ ├── avx2_min_max.asm │ ├── avx2_min_max.c │ ├── avx2_min_max.h │ ├── avx2_min_max.s │ ├── ex48_bench.cpp │ ├── ex48_test.cpp │ ├── min_max.h │ ├── mmx_min_max.asm │ ├── mmx_min_max.c │ ├── mmx_min_max.h │ └── mmx_min_max.s ├── ex6 │ ├── CMakeLists.txt │ ├── complex_conv_avx_stride.asm │ ├── complex_conv_avx_stride.c │ ├── complex_conv_avx_stride.h │ ├── complex_conv_avx_stride.s │ ├── complex_conv_sse.asm │ ├── complex_conv_sse.c │ ├── complex_conv_sse.h │ ├── complex_conv_sse.s │ ├── complex_num.h │ ├── ex6_bench.cpp │ └── ex6_test.cpp ├── ex7 │ ├── CMakeLists.txt │ ├── ex7_bench.cpp │ ├── ex7_test.cpp │ ├── median_avx_overlap.asm │ ├── median_avx_overlap.c │ ├── median_avx_overlap.h │ ├── median_avx_overlap.s │ ├── median_avx_vperm.asm │ ├── median_avx_vperm.c │ ├── median_avx_vperm.h │ ├── median_avx_vperm.s │ ├── median_sse.asm │ ├── median_sse.c │ ├── median_sse.h │ └── median_sse.s ├── ex8 │ ├── CMakeLists.txt │ ├── ex8_bench.cpp │ ├── ex8_test.cpp │ ├── 
gather_scalar.asm │ ├── gather_scalar.c │ ├── gather_scalar.h │ ├── gather_scalar.s │ ├── gather_vinsert.asm │ ├── gather_vinsert.c │ ├── gather_vinsert.h │ ├── gather_vinsert.s │ ├── gather_vinsert_vshufps.asm │ ├── gather_vinsert_vshufps.c │ ├── gather_vinsert_vshufps.h │ └── gather_vinsert_vshufps.s └── ex9 │ ├── CMakeLists.txt │ ├── ex9_bench.cpp │ ├── ex9_test.cpp │ ├── scatter_avx.asm │ ├── scatter_avx.c │ ├── scatter_avx.h │ ├── scatter_avx.s │ ├── scatter_scalar.asm │ ├── scatter_scalar.c │ ├── scatter_scalar.h │ └── scatter_scalar.s ├── chap18 ├── ex1 │ ├── CMakeLists.txt │ ├── ex1_bench.cpp │ ├── ex1_test.cpp │ ├── transform_avx.c │ ├── transform_avx.h │ ├── transform_avx512.c │ └── transform_avx512.h ├── ex10 │ ├── CMakeLists.txt │ ├── avx2_compress.asm │ ├── avx2_compress.c │ ├── avx2_compress.h │ ├── avx2_compress.s │ ├── avx512_compress.asm │ ├── avx512_compress.c │ ├── avx512_compress.h │ ├── avx512_compress.s │ ├── avx_compress.asm │ ├── avx_compress.c │ ├── avx_compress.h │ ├── avx_compress.s │ ├── ex10_bench.cpp │ ├── ex10_test.cpp │ ├── scalar_compress.asm │ ├── scalar_compress.c │ ├── scalar_compress.h │ └── scalar_compress.s ├── ex11 │ ├── CMakeLists.txt │ ├── ex11_bench.cpp │ ├── ex11_test.cpp │ ├── expand_avx2.asm │ ├── expand_avx2.c │ ├── expand_avx2.h │ ├── expand_avx2.s │ ├── expand_avx512.asm │ ├── expand_avx512.c │ ├── expand_avx512.h │ ├── expand_avx512.s │ ├── expand_scalar.asm │ ├── expand_scalar.c │ ├── expand_scalar.h │ └── expand_scalar.s ├── ex12 │ ├── CMakeLists.txt │ ├── ex12_bench.cpp │ ├── ex12_test.cpp │ ├── ternary_avx2.asm │ ├── ternary_avx2.h │ ├── ternary_avx2.s │ ├── ternary_avx512.asm │ ├── ternary_avx512.h │ ├── ternary_avx512.s │ ├── ternary_vpternlog.asm │ ├── ternary_vpternlog.h │ └── ternary_vpternlog.s ├── ex13 │ ├── CMakeLists.txt │ ├── ex13_bench.cpp │ ├── ex13_test.cpp │ ├── transpose_avx2.asm │ ├── transpose_avx2.c │ ├── transpose_avx2.h │ ├── transpose_avx2.s │ ├── transpose_avx512.asm │ ├── 
transpose_avx512.c │ ├── transpose_avx512.h │ ├── transpose_avx512.s │ ├── transpose_scalar.asm │ ├── transpose_scalar.c │ ├── transpose_scalar.h │ └── transpose_scalar.s ├── ex14 │ ├── CMakeLists.txt │ ├── embedded_broadcast.asm │ ├── embedded_broadcast.c │ ├── embedded_broadcast.h │ ├── embedded_broadcast.s │ ├── ex14_bench.cpp │ ├── ex14_test.cpp │ ├── memory_broadcast.asm │ ├── memory_broadcast.c │ ├── memory_broadcast.h │ ├── memory_broadcast.s │ ├── register_broadcast.asm │ ├── register_broadcast.c │ ├── register_broadcast.h │ └── register_broadcast.s ├── ex15 │ ├── CMakeLists.txt │ ├── ex15_bench.cpp │ ├── ex15_test.cpp │ ├── memory_broadcast.asm │ ├── memory_broadcast.c │ ├── memory_broadcast.h │ ├── memory_broadcast.s │ ├── register_broadcast.asm │ ├── register_broadcast.c │ ├── register_broadcast.h │ └── register_broadcast.s ├── ex16 │ ├── CMakeLists.txt │ ├── embedded_rounding.asm │ ├── embedded_rounding.c │ ├── embedded_rounding.h │ ├── embedded_rounding.s │ ├── ex16_test.cpp │ ├── manual_rounding.asm │ ├── manual_rounding.c │ ├── manual_rounding.h │ └── manual_rounding.s ├── ex17 │ ├── CMakeLists.txt │ ├── ex17_bench.cpp │ ├── ex17_test.cpp │ ├── hardware_scatter.asm │ ├── hardware_scatter.c │ ├── hardware_scatter.h │ ├── hardware_scatter.s │ ├── scalar_scatter.asm │ ├── scalar_scatter.c │ ├── scalar_scatter.h │ ├── scalar_scatter.s │ ├── software_scatter.asm │ ├── software_scatter.c │ ├── software_scatter.h │ └── software_scatter.s ├── ex18 │ ├── CMakeLists.txt │ ├── ex18_bench.cpp │ ├── ex18_test.cpp │ ├── qword_avx2.c │ ├── qword_avx2.h │ ├── qword_avx2_ass.asm │ ├── qword_avx2_ass.s │ ├── qword_avx2_intrinsics.c │ ├── qword_avx512.c │ ├── qword_avx512.h │ ├── qword_avx512_ass.asm │ ├── qword_avx512_ass.s │ └── qword_avx512_intrinsics.c ├── ex19 │ ├── CMakeLists.txt │ ├── avx512_histogram.asm │ ├── avx512_histogram.c │ ├── avx512_histogram.h │ ├── avx512_histogram.s │ ├── ex19_bench.cpp │ ├── ex19_test.cpp │ ├── scalar_histogram.asm │ ├── 
scalar_histogram.c │ ├── scalar_histogram.h │ └── scalar_histogram.s ├── ex2 │ ├── CMakeLists.txt │ ├── ex2_bench.cpp │ ├── ex2_test.cpp │ ├── transform_avx.asm │ ├── transform_avx.c │ ├── transform_avx.h │ ├── transform_avx.s │ ├── transform_avx512.asm │ ├── transform_avx512.c │ ├── transform_avx512.h │ └── transform_avx512.s ├── ex20 │ ├── CMakeLists.txt │ ├── avx512_vector_dp.asm │ ├── avx512_vector_dp.c │ ├── avx512_vector_dp.h │ ├── avx512_vector_dp.s │ ├── ex20_bench.cpp │ ├── ex20_test.cpp │ ├── init_sparse.cpp │ ├── init_sparse.h │ ├── scalar_vector_dp.asm │ ├── scalar_vector_dp.c │ ├── scalar_vector_dp.h │ └── scalar_vector_dp.s ├── ex21 │ ├── CMakeLists.txt │ ├── ex21_bench.cpp │ ├── ex21_test.cpp │ ├── lookup_novbmi.asm │ ├── lookup_novbmi.c │ ├── lookup_novbmi.h │ ├── lookup_novbmi.s │ ├── lookup_vbmi.asm │ ├── lookup_vbmi.c │ ├── lookup_vbmi.h │ └── lookup_vbmi.s ├── ex22 │ ├── CMakeLists.txt │ ├── ex22_bench.cpp │ ├── ex22_test.cpp │ ├── lookup128_novbmi.asm │ ├── lookup128_novbmi.c │ ├── lookup128_novbmi.h │ ├── lookup128_novbmi.s │ ├── lookup128_vbmi.asm │ ├── lookup128_vbmi.c │ ├── lookup128_vbmi.h │ └── lookup128_vbmi.s ├── ex23 │ ├── CMakeLists.txt │ ├── decompress_novbmi.asm │ ├── decompress_novbmi.c │ ├── decompress_novbmi.h │ ├── decompress_novbmi.s │ ├── decompress_vbmi.asm │ ├── decompress_vbmi.c │ ├── decompress_vbmi.h │ ├── decompress_vbmi.s │ ├── ex23_bench.cpp │ └── ex23_test.cpp ├── ex24 │ ├── CMakeLists.txt │ ├── both_256_512bit.asm │ ├── both_256_512bit.h │ ├── both_256_512bit.s │ ├── ex24_bench.cpp │ ├── ex24_test.cpp │ ├── only_256bit.asm │ ├── only_256bit.h │ └── only_256bit.s ├── ex25 │ ├── CMakeLists.txt │ ├── ex25_test.cpp │ ├── fma_only_tpt.asm │ ├── fma_only_tpt.s │ ├── fma_shuffle_tpt.asm │ ├── fma_shuffle_tpt.s │ ├── fma_unit_count.c │ └── fma_unit_count.h ├── ex26 │ ├── CMakeLists.txt │ ├── complex_num.h │ ├── ex26_bench.cpp │ ├── ex26_test.cpp │ ├── g2s_vpermi2d.asm │ ├── g2s_vpermi2d.c │ ├── g2s_vpermi2d.h │ ├── 
g2s_vpermi2d.s │ ├── g2s_vpgatherdd.asm │ ├── g2s_vpgatherdd.c │ ├── g2s_vpgatherdd.h │ └── g2s_vpgatherdd.s ├── ex27 │ ├── CMakeLists.txt │ ├── complex_num.h │ ├── ex27_bench.cpp │ ├── ex27_test.cpp │ ├── s2s_verpmi2d.c │ ├── s2s_vpermi2d.asm │ ├── s2s_vpermi2d.h │ ├── s2s_vpermi2d.s │ ├── s2s_vscatterdps.asm │ ├── s2s_vscatterdps.c │ ├── s2s_vscatterdps.h │ └── s2s_vscatterdps.s ├── ex28 │ ├── CMakeLists.txt │ ├── adj_load_masked_broadcast.asm │ ├── adj_load_masked_broadcast.c │ ├── adj_load_masked_broadcast.h │ ├── adj_load_masked_broadcast.s │ ├── adj_vpgatherpd.asm │ ├── adj_vpgatherpd.c │ ├── adj_vpgatherpd.h │ ├── adj_vpgatherpd.s │ ├── elem_struct.h │ ├── ex28_bench.cpp │ └── ex28_test.cpp ├── ex29 │ ├── CMakeLists.txt │ ├── ex29_bench.cpp │ ├── ex29_test.cpp │ ├── saxpy_512.asm │ ├── saxpy_512.c │ ├── saxpy_512.h │ └── saxpy_512.s ├── ex3 │ ├── CMakeLists.txt │ ├── ex3_bench.cpp │ ├── ex3_test.cpp │ ├── mul_blend_avx.c │ ├── mul_blend_avx.h │ ├── mul_blend_avx512.c │ └── mul_blend_avx512.h ├── ex30 │ ├── CMakeLists.txt │ ├── ex30_test.cpp │ ├── single_div_14.asm │ ├── single_div_14.c │ ├── single_div_14.h │ ├── single_div_14.s │ ├── single_div_23.asm │ ├── single_div_23.c │ ├── single_div_23.h │ ├── single_div_23.s │ ├── single_div_24.asm │ ├── single_div_24.c │ ├── single_div_24.h │ └── single_div_24.s ├── ex31 │ ├── CMakeLists.txt │ ├── ex31_test.cpp │ ├── single_rcps_14.asm │ ├── single_rcps_14.c │ ├── single_rcps_14.h │ ├── single_rcps_14.s │ ├── single_rcps_22.asm │ ├── single_rcps_22.c │ ├── single_rcps_22.h │ ├── single_rcps_22.s │ ├── single_rcps_23.asm │ ├── single_rcps_23.c │ ├── single_rcps_23.h │ └── single_rcps_23.s ├── ex32 │ ├── CMakeLists.txt │ ├── ex32_test.cpp │ ├── single_sqrt_14.asm │ ├── single_sqrt_14.c │ ├── single_sqrt_14.h │ ├── single_sqrt_14.s │ ├── single_sqrt_23.asm │ ├── single_sqrt_23.c │ ├── single_sqrt_23.h │ ├── single_sqrt_23.s │ ├── single_sqrt_24.asm │ ├── single_sqrt_24.c │ ├── single_sqrt_24.h │ └── single_sqrt_24.s 
├── ex33 │ ├── CMakeLists.txt │ ├── double_div_14.asm │ ├── double_div_14.c │ ├── double_div_14.h │ ├── double_div_14.s │ ├── double_div_26.asm │ ├── double_div_26.c │ ├── double_div_26.h │ ├── double_div_26.s │ ├── double_div_52.asm │ ├── double_div_52.c │ ├── double_div_52.h │ ├── double_div_52.s │ ├── double_div_53.asm │ ├── double_div_53.c │ ├── double_div_53.h │ ├── double_div_53.s │ └── ex33_test.cpp ├── ex34 │ ├── CMakeLists.txt │ ├── double_rsqrt_14.asm │ ├── double_rsqrt_14.c │ ├── double_rsqrt_14.h │ ├── double_rsqrt_14.s │ ├── double_rsqrt_26.asm │ ├── double_rsqrt_26.c │ ├── double_rsqrt_26.h │ ├── double_rsqrt_26.s │ ├── double_rsqrt_50.asm │ ├── double_rsqrt_50.c │ ├── double_rsqrt_50.h │ ├── double_rsqrt_50.s │ ├── double_rsqrt_51.asm │ ├── double_rsqrt_51.c │ ├── double_rsqrt_51.h │ ├── double_rsqrt_51.s │ ├── double_rsqrt_52.asm │ ├── double_rsqrt_52.c │ ├── double_rsqrt_52.h │ ├── double_rsqrt_52.s │ └── ex34_test.cpp ├── ex35 │ ├── CMakeLists.txt │ ├── double_sqrt_14.asm │ ├── double_sqrt_14.c │ ├── double_sqrt_14.h │ ├── double_sqrt_14.s │ ├── double_sqrt_26.asm │ ├── double_sqrt_26.c │ ├── double_sqrt_26.h │ ├── double_sqrt_26.s │ ├── double_sqrt_52.asm │ ├── double_sqrt_52.c │ ├── double_sqrt_52.h │ ├── double_sqrt_52.s │ ├── double_sqrt_53.asm │ ├── double_sqrt_53.c │ ├── double_sqrt_53.h │ ├── double_sqrt_53.s │ └── ex35_test.cpp ├── ex4 │ ├── CMakeLists.txt │ ├── ex4_bench.cpp │ ├── ex4_test.cpp │ ├── mul_blend_avx.asm │ ├── mul_blend_avx.c │ ├── mul_blend_avx.h │ ├── mul_blend_avx.s │ ├── mul_blend_avx512.asm │ ├── mul_blend_avx512.c │ ├── mul_blend_avx512.h │ └── mul_blend_avx512.s ├── ex5 │ ├── CMakeLists.txt │ ├── ex5_bench.cpp │ ├── ex5_test.cpp │ ├── mul_mask_avx512.asm │ ├── mul_mask_avx512.c │ ├── mul_mask_avx512.h │ ├── mul_mask_avx512.s │ ├── mul_nomask_avx512.asm │ ├── mul_nomask_avx512.c │ ├── mul_nomask_avx512.h │ ├── mul_nomask_avx512.s │ ├── mul_zeromask_avx512.asm │ ├── mul_zeromask_avx512.c │ ├── mul_zeromask_avx512.h │ └── 
mul_zeromask_avx512.s ├── ex6 │ ├── CMakeLists.txt │ ├── blend_avx512.asm │ ├── blend_avx512.c │ ├── blend_avx512.h │ ├── blend_avx512.s │ ├── ex6_bench.cpp │ ├── ex6_test.cpp │ ├── mask_avx512.asm │ ├── mask_avx512.c │ ├── mask_avx512.h │ └── mask_avx512.s ├── ex7 │ ├── CMakeLists.txt │ ├── blend_avx512.asm │ ├── blend_avx512.c │ ├── blend_avx512.h │ ├── blend_avx512.s │ ├── ex7_bench.cpp │ ├── ex7_test.cpp │ ├── mask_avx512.asm │ ├── mask_avx512.c │ ├── mask_avx512.h │ └── mask_avx512.s ├── ex8 │ ├── CMakeLists.txt │ ├── ex8_bench.cpp │ ├── ex8_test.cpp │ ├── mce_avx2.asm │ ├── mce_avx2.c │ ├── mce_avx2.h │ ├── mce_avx2.s │ ├── mce_avx512.asm │ ├── mce_avx512.c │ ├── mce_avx512.h │ ├── mce_avx512.s │ ├── mce_scalar.asm │ ├── mce_scalar.c │ ├── mce_scalar.h │ └── mce_scalar.s └── ex9 │ ├── CMakeLists.txt │ ├── ex9_bench.cpp │ ├── ex9_test.cpp │ ├── no_peeling.asm │ ├── no_peeling.c │ ├── no_peeling.h │ ├── no_peeling.s │ ├── peeling.asm │ ├── peeling.c │ ├── peeling.h │ └── peeling.s ├── chap19 ├── ex1 │ ├── CMakeLists.txt │ ├── ex1_test.cpp │ ├── real_from_complex_mask.cpp │ └── real_from_complex_mask.h ├── ex2 │ ├── CMakeLists.txt │ ├── complex_from_real_mask_and.cpp │ ├── complex_from_real_mask_and.h │ └── ex2_test.cpp ├── ex3 │ ├── CMakeLists.txt │ ├── complex_from_real_mask_or.cpp │ ├── complex_from_real_mask_or.h │ └── ex3_test.cpp ├── ex4 │ ├── CMakeLists.txt │ ├── compress_ph.cpp │ ├── compress_ph.h │ ├── compress_ph_test.h │ └── ex4_test.cpp └── ex5 │ ├── CMakeLists.txt │ ├── ex5_test.cpp │ ├── fast_special_min.cpp │ ├── fast_special_min.h │ └── fast_special_min_test.h ├── chap20 ├── ex1 │ └── amx_tile.hpp ├── ex10 │ ├── CMakeLists.txt │ ├── amx_interleaved_gemm_ass.asm │ ├── amx_interleaved_gemm_ass.h │ ├── amx_interleaved_gemm_ass.s │ ├── amx_ref_gemm_int8.hpp │ ├── ex10_bench.cpp │ ├── ex10_cpp_bench.cpp │ ├── ex10_test.cpp │ └── gemm │ │ └── amx_interleaved_gemm.hpp ├── ex11 │ └── amx_conv_activations_layout.hpp ├── ex12 │ └── 
amx_conv_weights_layout.hpp ├── ex13 │ └── amx_conv_int8.hpp ├── ex14 │ ├── CMakeLists.txt │ ├── amx_conv_int8_test_utils.hpp │ ├── ex14_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_conv_gemm.hpp ├── ex16 │ ├── CMakeLists.txt │ ├── amx_conv_block_int8.hpp │ ├── ex16_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_conv_block_gemm.hpp ├── ex17 │ ├── CMakeLists.txt │ ├── amx_int8_uint8_test_utils.hpp │ ├── amx_post_conv_gemm_relu_ass.asm │ ├── amx_post_conv_gemm_relu_ass.h │ ├── amx_post_conv_gemm_relu_ass.s │ ├── amx_ref_gemm_int8_uint8.hpp │ ├── ex17_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_post_conv_gemm_relu_ass.hpp ├── ex18 │ ├── CMakeLists.txt │ ├── amx_int8_uint8_test_utils.hpp │ ├── amx_interleaved_gemm_relu_ass.asm │ ├── amx_interleaved_gemm_relu_ass.h │ ├── amx_interleaved_gemm_relu_ass.s │ ├── amx_ref_gemm_int8_uint8.hpp │ ├── ex18_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_interleaved_gemm_relu_ass.hpp ├── ex19 │ ├── CMakeLists.txt │ ├── bf16_conv.c │ ├── bf16_conv.h │ └── ex19_test.cpp ├── ex20 │ ├── CMakeLists.txt │ ├── ex20_test.cpp │ ├── int8_conv.h │ ├── int8_conv_test.cpp │ └── int8_conv_test.h ├── ex21 │ ├── CMakeLists.txt │ ├── embedding.c │ ├── embedding.h │ ├── ex21_bench.cpp │ └── ex21_test.cpp ├── ex22 │ ├── CMakeLists.txt │ ├── ex22_test.cpp │ ├── flat_to_flat_bf16_trans.asm │ ├── flat_to_flat_bf16_trans.c │ ├── flat_to_flat_bf16_trans.h │ └── flat_to_flat_bf16_trans.s ├── ex23 │ ├── CMakeLists.txt │ ├── ex23_test.cpp │ ├── vnni_to_vnni_bf16_trans.asm │ ├── vnni_to_vnni_bf16_trans.c │ ├── vnni_to_vnni_bf16_trans.h │ └── vnni_to_vnni_bf16_trans.s ├── ex24 │ ├── CMakeLists.txt │ ├── ex24_test.cpp │ ├── flat_to_vnni_bf16_trans.asm │ ├── flat_to_vnni_bf16_trans.c │ ├── flat_to_vnni_bf16_trans.h │ └── flat_to_vnni_bf16_trans.s ├── ex25 │ ├── CMakeLists.txt │ ├── ex25_test.cpp │ ├── flat_to_vnni_bf16_relayout.asm │ ├── flat_to_vnni_bf16_relayout.c │ ├── flat_to_vnni_bf16_relayout.h │ └── 
flat_to_vnni_bf16_relayout.s ├── ex27 │ ├── CMakeLists.txt │ ├── byte_decompression.c │ ├── byte_decompression.h │ └── ex27_test.cpp ├── ex3 │ └── amx_ref_gemm_int8.hpp ├── ex4 │ ├── CMakeLists.txt │ ├── amx_int8_test_utils.hpp │ ├── amx_ref_gemm_int8_bench.hpp │ ├── ex4_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_ref_gemm.hpp ├── ex5 │ ├── CMakeLists.txt │ ├── ex5_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_slow_gemm.hpp ├── ex6 │ ├── CMakeLists.txt │ ├── ex6_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_preload_gemm.hpp ├── ex7 │ ├── CMakeLists.txt │ ├── ex7_test.cpp │ └── gemm │ │ ├── .clang-format │ │ └── amx_switched_gemm.hpp └── ex8 │ ├── CMakeLists.txt │ ├── ex8_test.cpp │ └── gemm │ ├── .clang-format │ └── amx_interleaved_gemm.hpp ├── chap5 └── ex15 │ ├── CMakeLists.txt │ ├── ex15_test.cpp │ ├── supports_avx2.asm │ ├── supports_avx2.h │ └── supports_avx2.s ├── chap7 ├── ex3 │ ├── CMakeLists.txt │ ├── ex3_bench.cpp │ ├── ex3_test.cpp │ ├── swizzling_sse.asm │ ├── swizzling_sse.c │ ├── swizzling_sse.h │ ├── swizzling_sse.s │ └── vertex_struct.h ├── ex4 │ ├── CMakeLists.txt │ ├── ex4_bench.cpp │ ├── ex4_test.cpp │ ├── swizzling_unpck_sse.asm │ ├── swizzling_unpck_sse.c │ ├── swizzling_unpck_sse.h │ ├── swizzling_unpck_sse.s │ └── vertex_struct.h ├── ex5 │ ├── CMakeLists.txt │ ├── deswizzling_sse.asm │ ├── deswizzling_sse.c │ ├── deswizzling_sse.h │ ├── deswizzling_sse.s │ ├── ex5_bench.cpp │ ├── ex5_test.cpp │ └── vertex_struct.h └── ex6 │ ├── CMakeLists.txt │ ├── deswizzling_rgb_sse.asm │ ├── deswizzling_rgb_sse.c │ ├── deswizzling_rgb_sse.h │ ├── deswizzling_rgb_sse.s │ ├── ex6_bench.cpp │ ├── ex6_test.cpp │ └── vertex_struct.h ├── chap8 ├── ex1 │ ├── CMakeLists.txt │ ├── dotprod_novnni.asm │ ├── dotprod_novnni.h │ ├── dotprod_novnni.s │ ├── dotprod_vnni.asm │ ├── dotprod_vnni.h │ ├── dotprod_vnni.s │ ├── ex1_bench.cpp │ └── ex1_test.cpp ├── ex10 │ ├── CMakeLists.txt │ ├── ex10_test.cpp │ ├── pixel_shuffler_offset.c │ └── 
pixel_shuffler_offset.h ├── ex11 │ ├── CMakeLists.txt │ ├── ex11_test.cpp │ ├── sigmoid_approx.hpp │ ├── sigmoid_approx_avx512.cpp │ └── sigmoid_approx_avx512.h ├── ex12 │ ├── CMakeLists.txt │ ├── ex12_test.cpp │ ├── sigmoid_scalef.hpp │ ├── sigmoid_scalef_avx512.cpp │ └── sigmoid_scalef_avx512.h ├── ex2 │ ├── CMakeLists.txt │ ├── ex2_test.cpp │ ├── quant_types.hpp │ ├── quantization_avx512.cpp │ ├── quantization_avx512.hpp │ ├── quantization_scalar.cpp │ └── quantization_scalar.hpp ├── ex3 │ ├── .clang-format │ ├── CMakeLists.txt │ ├── direct_conv.c │ ├── direct_conv.h │ ├── ex3_bench.cpp │ └── ex3_test.cpp ├── ex4 │ ├── CMakeLists.txt │ ├── ex4_test.cpp │ ├── low_ofm_conv.cpp │ └── low_ofm_conv.hpp ├── ex5 │ ├── CMakeLists.txt │ ├── ex5_test.cpp │ ├── post_conv.cpp │ └── post_conv.hpp ├── ex6 │ ├── CMakeLists.txt │ ├── eltwise.c │ ├── eltwise.h │ └── ex6_test.cpp ├── ex7 │ ├── CMakeLists.txt │ ├── ex7_test.cpp │ ├── pooling.c │ └── pooling.h └── ex9 │ ├── CMakeLists.txt │ ├── ex9_test.cpp │ ├── pixel_shuffler.cpp │ └── pixel_shuffler.hpp ├── check-format.sh ├── common ├── CMakeLists.txt ├── optimisation_common.c ├── optimisation_common.h ├── supports_amx.asm ├── supports_amx.s ├── supports_amx_macos.s ├── supports_avx512.asm ├── supports_avx512.s ├── supports_avx512_bf16.asm ├── supports_avx512_bf16.s └── supports_avx512_macos.s └── verify.sh /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 8 3 | UseTab: Always 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *# 3 | build 4 | .checkpatch-camelcase.git. 
5 | checkpatch.pl 6 | const_structs.checkpatch 7 | spelling.txt 8 | **/optimisation.tar 9 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Intel 2 | - mark.d.ryan@intel.com 3 | - Laxman.Sole@intel.com 4 | - athenas.jimenez.gonzalez@intel.com 5 | - joe.konno@intel.com 6 | - barukh.ziv@intel.com 7 | - keola.wierschem@intel.com 8 | - jyothi.krishna.viswakaran.sreelatha@intel.com 9 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Permission to use, copy, modify, and/or distribute this software for any 2 | purpose with or without fee is hereby granted. 3 | 4 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 5 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 6 | AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 7 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 8 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 9 | OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 10 | PERFORMANCE OF THIS SOFTWARE. 
11 |
12 |
--------------------------------------------------------------------------------
/chap15/ex1/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 1: a unit-test binary and an optional benchmark binary,
2 | # both compiled from the same C sources.
3 | set(avx_ex1_srcs transform_sse.c transform_avx.c)
4 |
5 | add_executable(avx_ex1_tests ex1_test.cpp ${avx_ex1_srcs})
6 | target_link_libraries(avx_ex1_tests PRIVATE gtest_main)
7 |
8 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
9 | if(benchmark_FOUND)
10 |   add_executable(avx_ex1_bench ex1_bench.cpp ${avx_ex1_srcs})
11 |   target_link_libraries(avx_ex1_bench PRIVATE benchmark::benchmark)
12 | endif()
13 |
14 | add_test(NAME avx_ex1_test COMMAND avx_ex1_tests)
15 |
--------------------------------------------------------------------------------
/chap15/ex10/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 10: pick the assembly source that matches the toolchain
2 | # (.s for Clang/AppleClang/GNU, .asm for MSVC).
3 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
4 |   set(avx_ex10_ass saxpy32.s)
5 | elseif(MSVC)
6 |   set(avx_ex10_ass saxpy32.asm)
7 | endif()
8 |
9 | add_executable(avx_ex10_tests ex10_test.cpp saxpy32.c ${avx_ex10_ass})
10 | target_link_libraries(avx_ex10_tests PRIVATE gtest_main)
11 |
12 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
13 | # The benchmark is built from the assembly source only, not from saxpy32.c.
14 | if(benchmark_FOUND)
15 |   add_executable(avx_ex10_bench ex10_bench.cpp ${avx_ex10_ass})
16 |   target_link_libraries(avx_ex10_bench PRIVATE benchmark::benchmark)
17 | endif()
18 |
19 | add_test(NAME avx_ex10_test COMMAND avx_ex10_tests)
20 |
--------------------------------------------------------------------------------
/chap15/ex12/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")  # "Clang" also matches AppleClang
2 |   set(avx_ex12_ass saxpy32.s saxpy16.s)
3 | elseif(MSVC)
4 |   set(avx_ex12_ass saxpy32.asm saxpy16.asm)
5 | endif()
6 | add_executable(avx_ex12_tests ex12_test.cpp saxpy32.c saxpy16.c ${avx_ex12_ass})
7 | target_link_libraries(avx_ex12_tests PRIVATE gtest_main)
8 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
9 | if(benchmark_FOUND)
10 | 
add_executable(avx_ex12_bench ex12_bench.cpp ${avx_ex12_ass})
11 |   target_link_libraries(avx_ex12_bench PRIVATE benchmark::benchmark)
12 | endif()
13 |
14 | add_test(NAME avx_ex12_test COMMAND avx_ex12_tests)
15 |
--------------------------------------------------------------------------------
/chap15/ex14/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 14: pick the assembly sources that match the toolchain
2 | # (.s for Clang/AppleClang/GNU, .asm for MSVC).
3 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
4 |   set(avx_ex14_ass cond_scalar.s cond_vmaskmov.s)
5 | elseif(MSVC)
6 |   set(avx_ex14_ass cond_scalar.asm cond_vmaskmov.asm)
7 | endif()
8 |
9 | add_executable(avx_ex14_tests ex14_test.cpp cond_scalar.c cond_vmaskmov.c ${avx_ex14_ass})
10 | target_link_libraries(avx_ex14_tests PRIVATE gtest_main)
11 |
12 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
13 | # The benchmark is built from the assembly sources only, not from the .c files.
14 | if(benchmark_FOUND)
15 |   add_executable(avx_ex14_bench ex14_bench.cpp ${avx_ex14_ass})
16 |   target_link_libraries(avx_ex14_bench PRIVATE benchmark::benchmark)
17 | endif()
18 |
19 | add_test(NAME avx_ex14_test COMMAND avx_ex14_tests)
20 |
--------------------------------------------------------------------------------
/chap15/ex16/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 16: pick the assembly source that matches the toolchain
2 | # (.s for Clang/AppleClang/GNU, .asm for MSVC).
3 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
4 |   set(avx_ex16_ass three_tap_sse.s)
5 | elseif(MSVC)
6 |   set(avx_ex16_ass three_tap_sse.asm)
7 | endif()
8 |
9 | add_executable(avx_ex16_tests ex16_test.cpp three_tap_sse.c ${avx_ex16_ass})
10 | target_link_libraries(avx_ex16_tests PRIVATE gtest_main)
11 |
12 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
13 | # The benchmark is built from the assembly source only, not from three_tap_sse.c.
14 | if(benchmark_FOUND)
15 |   add_executable(avx_ex16_bench ex16_bench.cpp ${avx_ex16_ass})
16 |   target_link_libraries(avx_ex16_bench PRIVATE benchmark::benchmark)
17 | endif()
18 |
19 | add_test(NAME avx_ex16_test COMMAND avx_ex16_tests)
20 |
--------------------------------------------------------------------------------
/chap15/ex17/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 17: pick the assembly source that matches the toolchain
2 | # (.s for Clang/AppleClang/GNU, .asm for MSVC).
3 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
4 |   set(avx_ex17_ass three_tap_avx.s)
5 | elseif(MSVC)
6 |   set(avx_ex17_ass three_tap_avx.asm)
7 | endif()
8 |
9 | add_executable(avx_ex17_tests ex17_test.cpp three_tap_avx.c ${avx_ex17_ass})
10 | target_link_libraries(avx_ex17_tests PRIVATE gtest_main)
11 |
12 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
13 | # The benchmark is built from the assembly source only, not from three_tap_avx.c.
14 | if(benchmark_FOUND)
15 |   add_executable(avx_ex17_bench ex17_bench.cpp ${avx_ex17_ass})
16 |   target_link_libraries(avx_ex17_bench PRIVATE benchmark::benchmark)
17 | endif()
18 |
19 | add_test(NAME avx_ex17_test COMMAND avx_ex17_tests)
20 |
--------------------------------------------------------------------------------
/chap15/ex18/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Chapter 15, example 18: pick the assembly source that matches the toolchain
2 | # (.s for Clang/AppleClang/GNU, .asm for MSVC).
3 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
4 |   set(avx_ex18_ass three_tap_mixed_avx.s)
5 | elseif(MSVC)
6 |   set(avx_ex18_ass three_tap_mixed_avx.asm)
7 | endif()
8 |
9 | add_executable(avx_ex18_tests ex18_test.cpp three_tap_mixed_avx.c ${avx_ex18_ass})
10 | target_link_libraries(avx_ex18_tests PRIVATE gtest_main)
11 |
12 | # benchmark_FOUND is not set in this file; the benchmark is skipped without it.
13 | # The benchmark is built from the assembly source only, not from the .c file.
14 | if(benchmark_FOUND)
15 |   add_executable(avx_ex18_bench ex18_bench.cpp ${avx_ex18_ass})
16 |   target_link_libraries(avx_ex18_bench PRIVATE benchmark::benchmark)
17 | endif()
18 |
19 | add_test(NAME avx_ex18_test COMMAND avx_ex18_tests)
20 |
--------------------------------------------------------------------------------
/chap15/ex19/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")  # "Clang" also matches AppleClang
2 |   set(avx_ex19_ass vshufps_transpose.s vblendps_transpose.s)
3 | elseif(MSVC)
4 |   set(avx_ex19_ass vshufps_transpose.asm vblendps_transpose.asm)
5 | 
endif() 6 | add_executable(avx_ex19_tests ex19_test.cpp vshufps_transpose.c vblendps_transpose.c ${avx_ex19_ass}) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(avx_ex19_bench ex19_bench.cpp ${avx_ex19_ass}) 10 | target_link_libraries(avx_ex19_bench benchmark::benchmark) 11 | ENDIF() 12 | 13 | target_link_libraries(avx_ex19_tests gtest_main) 14 | add_test(NAME avx_ex19_test COMMAND avx_ex19_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex2_ass transform_sse.s transform_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex2_ass transform_sse.asm transform_avx.asm) 5 | endif() 6 | 7 | add_executable(avx_ex2_tests ex2_test.cpp transform_sse.c transform_avx.c ${avx_ex2_ass}) 8 | target_link_libraries(avx_ex2_tests gtest_main) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx_ex2_bench ex2_bench.cpp ${avx_ex2_ass}) 12 | target_link_libraries(avx_ex2_bench benchmark::benchmark) 13 | ENDIF() 14 | 15 | add_test(NAME avx_ex2_test COMMAND avx_ex2_tests) 16 | -------------------------------------------------------------------------------- /chap15/ex20/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex20_ass ${avx_ex20_ass} vinsertps_transpose.s) 3 | elseif(MSVC) 4 | set(avx_ex20_ass ${avx_ex20_ass} vinsertps_transpose.asm) 5 | endif() 6 | add_executable(avx_ex20_tests ex20_test.cpp vinsertps_transpose.c ${avx_ex20_ass}) 7 | target_link_libraries(avx_ex20_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex20_bench ex20_bench.cpp ${avx_ex20_ass}) 11 | target_link_libraries(avx_ex20_bench 
benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex20_test COMMAND avx_ex20_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex21/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex21_ass mul_cpx_reg.s mul_cpx_mem.s) 3 | elseif(MSVC) 4 | set(avx_ex21_ass mul_cpx_reg.asm mul_cpx_mem.asm) 5 | endif() 6 | add_executable(avx_ex21_tests ex21_test.cpp mul_cpx_reg.c mul_cpx_mem.c ${avx_ex21_ass}) 7 | target_link_libraries(avx_ex21_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex21_bench ex21_bench.cpp ${avx_ex21_ass}) 11 | target_link_libraries(avx_ex21_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex21_test COMMAND avx_ex21_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex21/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | float real; 22 | float imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap15/ex22/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex22_ass divps_sse.s vdivps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex22_ass divps_sse.asm vdivps_avx.asm) 5 | endif() 6 | add_executable(avx_ex22_tests ex22_test.cpp divps_sse.c vdivps_avx.c ${avx_ex22_ass}) 7 | target_link_libraries(avx_ex22_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex22_bench ex22_bench.cpp ${avx_ex22_ass}) 11 | target_link_libraries(avx_ex22_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex22_test COMMAND avx_ex22_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex22/divps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DIVPS_SSE_H__ 17 | #define DIVPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void divps_sse(float *in1, float *in2, float *out, size_t len); 26 | bool divps_sse_check(float *in1, float *in2, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex22/vdivps_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef VDIVPS_AVX_H__ 17 | #define VDIVPS_AVX_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void vdivps_avx(float *in1, float *in2, float *out, size_t len); 26 | bool vdivps_avx_check(float *in1, float *in2, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex23/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex23_ass rcpps_sse.s vrcpps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex23_ass rcpps_sse.asm vrcpps_avx.asm) 5 | endif() 6 | add_executable(avx_ex23_tests ex23_test.cpp rcpps_sse.c vrcpps_avx.c ${avx_ex23_ass}) 7 | target_link_libraries(avx_ex23_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex23_bench ex23_bench.cpp ${avx_ex23_ass}) 11 | target_link_libraries(avx_ex23_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex23_test COMMAND avx_ex23_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex23/rcpps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef RCPPS_SSE_H__ 17 | #define RCPPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void rcpps_sse(float *in1, float *in2, float *out, size_t len); 26 | bool rcpps_sse_check(float *in1, float *in2, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex23/vrcpps_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef VRCPPS_AVX_H__ 17 | #define VRCPPS_AVX_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void vrcpps_avx(float *in1, float *in2, float *out, size_t len); 26 | bool vrcpps_avx_check(float *in1, float *in2, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex24/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex24_ass rcpps_mul_sse.s vrcpps_mul_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex24_ass rcpps_mul_sse.asm vrcpps_mul_avx.asm) 5 | endif() 6 | add_executable(avx_ex24_tests ex24_test.cpp rcpps_mul_sse.c vrcpps_mul_avx.c ${avx_ex24_ass}) 7 | target_link_libraries(avx_ex24_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex24_bench ex24_bench.cpp ${avx_ex24_ass}) 11 | target_link_libraries(avx_ex24_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex24_test COMMAND avx_ex24_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex25/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex25_ass sqrtps_divps_sse.s vsqrtps_vdivps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex25_ass sqrtps_divps_sse.asm vsqrtps_vdivps_avx.asm) 5 | endif() 6 | add_executable(avx_ex25_tests ex25_test.cpp sqrtps_divps_sse.c vsqrtps_vdivps_avx.c ${avx_ex25_ass}) 7 | target_link_libraries(avx_ex25_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex25_bench ex25_bench.cpp ${avx_ex25_ass}) 11 | target_link_libraries(avx_ex25_bench
benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex25_test COMMAND avx_ex25_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex25/sqrtps_divps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SQRTPS_DIVPS_SSE_H__ 17 | #define SQRTPS_DIVPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void sqrtps_divps_sse(float *in, float *out, size_t len); 26 | bool sqrtps_divps_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex26/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex26_ass rsqrtps_sse.s vrsqrtps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex26_ass rsqrtps_sse.asm vrsqrtps_avx.asm) 5 | endif() 6 | add_executable(avx_ex26_tests ex26_test.cpp rsqrtps_sse.c vrsqrtps_avx.c ${avx_ex26_ass}) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(avx_ex26_bench ex26_bench.cpp ${avx_ex26_ass}) 10 | target_link_libraries(avx_ex26_bench benchmark::benchmark) 11 | ENDIF() 12 | 13 | target_link_libraries(avx_ex26_tests gtest_main) 14 | add_test(NAME avx_ex26_test COMMAND avx_ex26_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex26/rsqrtps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef RSQRTPS_SSE_H__ 17 | #define RSQRTPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void rsqrtps_sse(float *in, float *out, size_t len); 26 | bool rsqrtps_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex26/vrsqrtps_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef VRSQRTPS_AVX_H__ 17 | #define VRSQRTPS_AVX_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void vrsqrtps_avx(float *in, float *out, size_t len); 26 | bool vrsqrtps_avx_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex27/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex27_ass rsqrtps_newt_sse.s vrsqrtps_newt_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex27_ass rsqrtps_newt_sse.asm vrsqrtps_newt_avx.asm) 5 | endif() 6 | add_executable(avx_ex27_tests ex27_test.cpp rsqrtps_newt_sse.c vrsqrtps_newt_avx.c ${avx_ex27_ass}) 7 | target_link_libraries(avx_ex27_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex27_bench ex27_bench.cpp ${avx_ex27_ass}) 11 | target_link_libraries(avx_ex27_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex27_test COMMAND avx_ex27_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex27/rsqrtps_newt_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef RSQRTPS_NEWT_SSE_H__ 17 | #define RSQRTPS_NEWT_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void rsqrtps_newt_sse(float *in, float *out, size_t len); 26 | bool rsqrtps_newt_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex28/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex28_ass sqrtps_sse.s vsqrtps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex28_ass sqrtps_sse.asm vsqrtps_avx.asm) 5 | endif() 6 | add_executable(avx_ex28_tests ex28_test.cpp sqrtps_sse.c vsqrtps_avx.c ${avx_ex28_ass}) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(avx_ex28_bench ex28_bench.cpp ${avx_ex28_ass}) 10 | target_link_libraries(avx_ex28_bench benchmark::benchmark) 11 | ENDIF() 12 | 13 | target_link_libraries(avx_ex28_tests gtest_main) 14 | add_test(NAME avx_ex28_test COMMAND avx_ex28_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex28/sqrtps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SQRTPS_SSE_H__ 17 | #define SQRTPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void sqrtps_sse(float *in, float *out, size_t len); 26 | bool sqrtps_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex28/vsqrtps_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef VSQRTPS_AVX_H__ 17 | #define VSQRTPS_AVX_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void vsqrtps_avx(float *in, float *out, size_t len); 26 | bool vsqrtps_avx_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex29/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex29_ass sqrt_rsqrtps_sse.s sqrt_vrsqrtps_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex29_ass sqrt_rsqrtps_sse.asm sqrt_vrsqrtps_avx.asm) 5 | endif() 6 | add_executable(avx_ex29_tests ex29_test.cpp sqrt_rsqrtps_sse.c sqrt_vrsqrtps_avx.c ${avx_ex29_ass}) 7 | target_link_libraries(avx_ex29_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex29_bench ex29_bench.cpp ${avx_ex29_ass}) 11 | target_link_libraries(avx_ex29_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex29_test COMMAND avx_ex29_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex29/sqrt_rsqrtps_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SQRT_RSQRTPS_SSE_H__ 17 | #define SQRT_RSQRTPS_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void sqrt_rsqrtps_sse(float *in, float *out, size_t len); 26 | bool sqrt_rsqrtps_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex3_ass poly_sse.s poly_avx_128.s poly_avx_256.s) 3 | elseif(MSVC) 4 | set(avx_ex3_ass poly_sse.asm poly_avx_128.asm poly_avx_256.asm) 5 | endif() 6 | 7 | add_executable(avx_ex3_tests ex3_test.cpp poly_sse.c poly_avx_128.c poly_avx_256.c ${avx_ex3_ass}) 8 | target_link_libraries(avx_ex3_tests gtest_main) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx_ex3_bench ex3_bench.cpp ${avx_ex3_ass}) 12 | target_link_libraries(avx_ex3_bench benchmark::benchmark) 13 | ENDIF() 14 | 15 | add_test(NAME avx_ex3_test COMMAND avx_ex3_tests) 16 | -------------------------------------------------------------------------------- /chap15/ex3/poly_avx_128.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef POLY_AVX_128_H__ 17 | #define POLY_AVX_128_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stdint.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void poly_avx_128(float *in, float *out, int32_t len); 26 | bool poly_avx_128_check(float *in, float *out, int32_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex3/poly_avx_256.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef POLY_AVX_256_H__ 17 | #define POLY_AVX_256_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stdint.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void poly_avx_256(float *in, float *out, int32_t len); 26 | bool poly_avx_256_check(float *in, float *out, int32_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex3/poly_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef POLY_SSE_H__ 17 | #define POLY_SSE_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stdint.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void poly_sse(float *in, float *out, int32_t len); 26 | bool poly_sse_check(float *in, float *out, int32_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex30/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex30_ass sqrt_rsqrtps_taylor_sse.s sqrt_vrsqrtps_taylor_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex30_ass sqrt_rsqrtps_taylor_sse.asm sqrt_vrsqrtps_taylor_avx.asm) 5 | endif() 6 | add_executable(avx_ex30_tests ex30_test.cpp sqrt_rsqrtps_taylor_sse.c sqrt_vrsqrtps_taylor_avx.c ${avx_ex30_ass}) 7 | target_link_libraries(avx_ex30_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex30_bench ex30_bench.cpp ${avx_ex30_ass}) 11 | target_link_libraries(avx_ex30_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex30_test COMMAND avx_ex30_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex31/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex31_ass subsum_avx.s subsum_sse.s) 3 | elseif(MSVC) 4 | set(avx_ex31_ass subsum_avx.asm subsum_sse.asm) 5 | endif() 6 | add_executable(avx_ex31_tests ex31_test.cpp subsum_avx.c subsum_sse.c ${avx_ex31_ass}) 7 | target_link_libraries(avx_ex31_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex31_bench ex31_bench.cpp ${avx_ex31_ass}) 11 | target_link_libraries(avx_ex31_bench
benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex31_test COMMAND avx_ex31_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex31/subsum_avx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SUBSUM_AVX_H__ 17 | #define SUBSUM_AVX_H__ 18 | 19 | #include <stdbool.h> 20 | #include <stddef.h> 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void subsum_avx(float *in, float *out, size_t len); 26 | bool subsum_avx_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex31/subsum_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SUBSUM_SSE_H__ 17 | #define SUBSUM_SSE_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void subsum_sse(float *in, float *out, size_t len); 26 | bool subsum_sse_check(float *in, float *out, size_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex34/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex34_ass singlep.s halfp.s) 3 | elseif(MSVC) 4 | set(avx_ex34_ass singlep.asm halfp.asm) 5 | endif() 6 | add_executable(avx_ex34_tests ex34_test.cpp singlep.c halfp.c ${avx_ex34_ass}) 7 | target_link_libraries(avx_ex34_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex34_bench ex34_bench.cpp ${avx_ex34_ass}) 11 | target_link_libraries(avx_ex34_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex34_test COMMAND avx_ex34_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex34/halfp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef HALFP_H__ 17 | #define HALFP_H__ 18 | 19 | #include 20 | #include 21 | #include 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | void halfp(__m128i *x, __m128i *y, uint64_t len); 27 | bool halfp_check(__m128i *x, __m128i *y, uint64_t len); 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap15/ex34/singlep.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SINGLEP_H__ 17 | #define SINGLEP_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void singlep(float *x, float *y, uint64_t len); 26 | bool singlep_check(float *x, float *y, uint64_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex35/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex35_ass fp_mul_add.s fp_fma.s) 3 | elseif(MSVC) 4 | set(avx_ex35_ass fp_mul_add.asm fp_fma.asm) 5 | endif() 6 | add_executable(avx_ex35_tests ex35_test.cpp fp_mul_add.c fp_fma.c ${avx_ex35_ass}) 7 | target_link_libraries(avx_ex35_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex35_bench ex35_bench.cpp ${avx_ex35_ass}) 11 | target_link_libraries(avx_ex35_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex35_test COMMAND avx_ex35_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex35/fp_fma.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef FP_FMA_H__ 17 | #define FP_FMA_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void fp_fma(float *a, float *c1, float *c2, uint32_t iters); 26 | bool fp_fma_check(float *a, float *c1, float *c2, uint32_t iters); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex35/fp_mul_add.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef FP_MUL_ADD_H__ 17 | #define FP_MUL_ADD_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void fp_mul_add(float *a, float *c1, float *c2, uint32_t iters); 26 | bool fp_mul_add_check(float *a, float *c1, float *c2, uint32_t iters); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex36/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex36_ass no_unroll_reduce.s unroll_reduce.s) 3 | elseif(MSVC) 4 | set(avx_ex36_ass no_unroll_reduce.asm unroll_reduce.asm) 5 | endif() 6 | add_executable(avx_ex36_tests ex36_test.cpp no_unroll_reduce.c unroll_reduce.c ${avx_ex36_ass}) 7 | target_link_libraries(avx_ex36_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex36_bench ex36_bench.cpp ${avx_ex36_ass}) 11 | target_link_libraries(avx_ex36_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex36_test COMMAND avx_ex36_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex36/no_unroll_reduce.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef NO_UNROLL_REDUCE_H__ 17 | #define NO_UNROLL_REDUCE_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | float no_unroll_reduce(float *a, uint32_t len); 26 | bool no_unroll_reduce_check(float *a, uint32_t len, float *result); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex36/unroll_reduce.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef UNROLL_REDUCE_H__ 17 | #define UNROLL_REDUCE_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | float unroll_reduce(float *a, uint32_t len); 26 | bool unroll_reduce_check(float *a, uint32_t len, float *result); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex39/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(avx_ex39_tests ex39_test.cpp klt_256.c) 2 | target_link_libraries(avx_ex39_tests gtest_main) 3 | target_include_directories(avx_ex39_tests 4 | PUBLIC 5 | ../ex38 6 | ) 7 | add_test(NAME avx_ex39_test COMMAND avx_ex39_tests) 8 | -------------------------------------------------------------------------------- /chap15/ex39/klt_256.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef KLT_256_H__ 17 | #define KLT_256_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void klt_256_d(short *input, short *output, int width, int height); 26 | bool klt_256_d_check(short *input, short *output, int width, int height); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex41/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(avx_ex41_tests ex41_test.cpp i64toa_avx2.c) 2 | target_link_libraries(avx_ex41_tests gtest_main) 3 | target_include_directories(avx_ex41_tests 4 | PUBLIC 5 | ../ex40 6 | ../ex42 7 | ) 8 | add_test(NAME avx_ex41_test COMMAND avx_ex41_tests) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx_ex41_bench ex41_bench.cpp i64toa_avx2.c) 12 | target_link_libraries(avx_ex41_bench benchmark::benchmark) 13 | target_include_directories(avx_ex41_bench 14 | PUBLIC 15 | ../ex40 16 | ../ex42 17 | ) 18 | ENDIF() 19 | 20 | -------------------------------------------------------------------------------- /chap15/ex41/i64toa_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef I64TOA_AVX2_H__ 17 | #define I64TOA_AVX2_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | char *i64toa_avx2i(int64_t xx, char *p); 26 | bool i64toa_avx2i_check(int64_t xx, char *p); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex45/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(avx_ex45_srcs ex45_test.cpp vpgatherd_soft.c) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set(avx_ex45_srcs ${avx_ex45_srcs} vpgatherd_soft.s) 4 | elseif(MSVC) 5 | set(avx_ex45_srcs ${avx_ex45_srcs} vpgatherd_soft.asm) 6 | endif() 7 | add_executable(avx_ex45_tests ${avx_ex45_srcs}) 8 | 9 | target_link_libraries(avx_ex45_tests gtest_main) 10 | add_test(NAME avx_ex45_test COMMAND avx_ex45_tests) 11 | -------------------------------------------------------------------------------- /chap15/ex45/vpgatherd_soft.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "vpgatherd_soft.h" 17 | 18 | bool vpgatherd_soft8_check(uint32_t *indices, uint32_t *in, uint32_t *out) 19 | { 20 | /* 21 | * indices, in and out must be non-NULL and each contain at least 8 22 | * elements. 23 | */ 24 | 25 | if (!indices || !in || !out) 26 | return false; 27 | 28 | vpgatherd_soft8(indices, in, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap15/ex46/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex46_ass scalar.s avx2_vpgatherd.s avx_vinsrt.s) 3 | elseif(MSVC) 4 | set(avx_ex46_ass scalar.asm avx2_vpgatherd.asm avx_vinsrt.asm) 5 | endif() 6 | add_executable(avx_ex46_tests ex46_test.cpp scalar.c avx2_vpgatherd.c avx_vinsrt.c ${avx_ex46_ass}) 7 | target_link_libraries(avx_ex46_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex46_bench ex46_bench.cpp ${avx_ex46_ass}) 11 | target_link_libraries(avx_ex46_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex46_test COMMAND avx_ex46_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex46/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | float real; 22 | float imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap15/ex47/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex47_ass avx2_gatherpd.s avx_vinsert.s) 3 | elseif(MSVC) 4 | set(avx_ex47_ass avx2_gatherpd.asm avx_vinsert.asm) 5 | endif() 6 | add_executable(avx_ex47_tests ex47_test.cpp avx2_gatherpd.c avx_vinsert.c ${avx_ex47_ass}) 7 | target_link_libraries(avx_ex47_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex47_bench ex47_bench.cpp ${avx_ex47_ass}) 11 | target_link_libraries(avx_ex47_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex47_test COMMAND avx_ex47_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex47/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | double real; 22 | double imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap15/ex48/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex48_ass mmx_min_max.s avx2_min_max.s) 3 | elseif(MSVC) 4 | set(avx_ex48_ass mmx_min_max.asm avx2_min_max.asm) 5 | endif() 6 | add_executable(avx_ex48_tests ex48_test.cpp mmx_min_max.c avx2_min_max.c ${avx_ex48_ass}) 7 | target_link_libraries(avx_ex48_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex48_bench ex48_bench.cpp ${avx_ex48_ass}) 11 | target_link_libraries(avx_ex48_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex48_test COMMAND avx_ex48_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex48/min_max.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef MIN_MAX_H__ 17 | #define MIN_MAX_H__ 18 | 19 | typedef struct min_max_ min_max; 20 | struct min_max_ { 21 | int16_t max; 22 | int16_t min; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap15/ex6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(avx_ex6_srcs ) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set(avx_ex6_ass complex_conv_sse.s complex_conv_avx_stride.s) 4 | elseif(MSVC) 5 | set(avx_ex6_ass complex_conv_sse.asm complex_conv_avx_stride.asm) 6 | endif() 7 | add_executable(avx_ex6_tests ex6_test.cpp complex_conv_sse.c complex_conv_avx_stride.c ${avx_ex6_ass}) 8 | target_link_libraries(avx_ex6_tests gtest_main) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx_ex6_bench ex6_bench.cpp ${avx_ex6_ass}) 12 | target_link_libraries(avx_ex6_bench benchmark::benchmark) 13 | ENDIF() 14 | 15 | add_test(NAME avx_ex6_test COMMAND avx_ex6_tests) 16 | -------------------------------------------------------------------------------- /chap15/ex6/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is 
hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | float real; 22 | float imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap15/ex7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex7_ass median_sse.s median_avx_overlap.s median_avx_vperm.s) 3 | elseif(MSVC) 4 | set(avx_ex7_ass median_sse.asm median_avx_overlap.asm median_avx_vperm.asm) 5 | endif() 6 | add_executable(avx_ex7_tests ex7_test.cpp median_sse.c median_avx_overlap.c median_avx_vperm.c ${avx_ex7_ass}) 7 | target_link_libraries(avx_ex7_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex7_bench ex7_bench.cpp ${avx_ex7_ass}) 11 | target_link_libraries(avx_ex7_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex7_test COMMAND avx_ex7_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex7/median_avx_vperm.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for 
any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef MEDIAN_AVX_VPERM_H__ 17 | #define MEDIAN_AVX_VPERM_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void median_avx_vperm(float *x, float *y, uint64_t len); 26 | bool median_avx_vperm_check(float *x, float *y, uint64_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex7/median_sse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef MEDIAN_SSE_H__ 17 | #define MEDIAN_SSE_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void median_sse(float *x, float *y, uint64_t len); 26 | bool median_sse_check(float *x, float *y, uint64_t len); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap15/ex8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex8_ass gather_scalar.s gather_vinsert.s gather_vinsert_vshufps.s) 3 | elseif(MSVC) 4 | set(avx_ex8_ass gather_scalar.asm gather_vinsert.asm gather_vinsert_vshufps.asm) 5 | endif() 6 | add_executable(avx_ex8_tests ex8_test.cpp gather_scalar.c gather_vinsert.c gather_vinsert_vshufps.c ${avx_ex8_ass}) 7 | target_link_libraries(avx_ex8_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex8_bench ex8_bench.cpp ${avx_ex8_ass}) 11 | target_link_libraries(avx_ex8_bench benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex8_test COMMAND avx_ex8_tests) 15 | -------------------------------------------------------------------------------- /chap15/ex9/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx_ex9_ass scatter_scalar.s scatter_avx.s) 3 | elseif(MSVC) 4 | set(avx_ex9_ass scatter_scalar.asm scatter_avx.asm) 5 | endif() 6 | add_executable(avx_ex9_tests ex9_test.cpp scatter_scalar.c scatter_avx.c ${avx_ex9_ass}) 7 | target_link_libraries(avx_ex9_tests gtest_main) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx_ex9_bench ex9_bench.cpp ${avx_ex9_ass}) 11 | target_link_libraries(avx_ex9_bench 
benchmark::benchmark) 12 | ENDIF() 13 | 14 | add_test(NAME avx_ex9_test COMMAND avx_ex9_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE transform_avx512.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | set(avx512_ex1_srcs transform_avx.c transform_avx512.c) 5 | add_executable(avx512_ex1_tests ex1_test.cpp ${avx512_ex1_srcs}) 6 | target_link_libraries(avx512_ex1_tests gtest_main optimisation_common) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(avx512_ex1_bench ex1_bench.cpp ${avx512_ex1_srcs}) 10 | target_link_libraries(avx512_ex1_bench benchmark::benchmark optimisation_common) 11 | ENDIF() 12 | 13 | add_test(NAME avx512_ex1_test COMMAND avx512_ex1_tests) 14 | -------------------------------------------------------------------------------- /chap18/ex10/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex10_ass scalar_compress.s avx_compress.s avx2_compress.s avx512_compress.s) 3 | elseif(MSVC) 4 | set(avx512_ex10_ass scalar_compress.asm avx_compress.asm avx2_compress.asm avx512_compress.asm) 5 | endif() 6 | add_executable(avx512_ex10_tests ex10_test.cpp scalar_compress.c avx_compress.c avx2_compress.c avx512_compress.c ${avx512_ex10_ass}) 7 | target_link_libraries(avx512_ex10_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex10_bench ex10_bench.cpp ${avx512_ex10_ass}) 11 | target_link_libraries(avx512_ex10_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex10_test COMMAND 
avx512_ex10_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex11/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex11_ass expand_scalar.s expand_avx2.s expand_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex11_ass expand_scalar.asm expand_avx2.asm expand_avx512.asm) 5 | endif() 6 | add_executable(avx512_ex11_tests ex11_test.cpp expand_scalar.c expand_avx2.c expand_avx512.c ${avx512_ex11_ass}) 7 | target_link_libraries(avx512_ex11_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex11_bench ex11_bench.cpp ${avx512_ex11_ass}) 11 | target_link_libraries(avx512_ex11_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex11_test COMMAND avx512_ex11_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex11/expand_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef EXPAND_AVX2_H__ 17 | #define EXPAND_AVX2_H__ 18 | 19 | #include <stdbool.h> /* bool */ 20 | #include <stddef.h> /* size_t */ 21 | #include <stdint.h> /* int32_t */ 22 | 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #endif 26 | void expand_avx2(int32_t *out, int32_t *in, size_t N); 27 | bool expand_avx2_check(int32_t *out, int32_t *in, size_t N); 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex11/expand_scalar.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "expand_scalar.h" 17 | 18 | bool expand_scalar_check(int32_t *out, int32_t *in, size_t N) 19 | { 20 | /* 21 | * in and out must be non NULL. 22 | */ 23 | 24 | if (!out || !in) 25 | return false; 26 | 27 | /* 28 | * N must be > 0.
29 | */ 30 | 31 | if (N == 0) 32 | return false; 33 | 34 | expand_scalar(out, in, N); 35 | 36 | return true; 37 | } 38 | -------------------------------------------------------------------------------- /chap18/ex12/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex12_ass ternary_avx2.s ternary_avx512.s ternary_vpternlog.s) 3 | elseif(MSVC) 4 | set(avx512_ex12_ass ternary_avx2.asm ternary_avx512.asm ternary_vpternlog.asm) 5 | endif() 6 | add_executable(avx512_ex12_tests ex12_test.cpp ${avx512_ex12_ass}) 7 | target_link_libraries(avx512_ex12_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex12_bench ex12_bench.cpp ${avx512_ex12_ass}) 11 | target_link_libraries(avx512_ex12_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex12_test COMMAND avx512_ex12_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex12/ternary_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef TERNARY_AVX2_H__ 17 | #define TERNARY_AVX2_H__ 18 | 19 | #include <stdint.h> /* uint32_t, uint64_t */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void ternary_avx2(uint32_t *dest, const uint32_t *src1, const uint32_t *src2, 26 | const uint32_t *src3, uint64_t len); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex12/ternary_avx512.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef TERNARY_AVX512_H__ 17 | #define TERNARY_AVX512_H__ 18 | 19 | #include <stdint.h> /* uint32_t, uint64_t */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void ternary_avx512(uint32_t *dest, const uint32_t *src1, const uint32_t *src2, 26 | const uint32_t *src3, uint64_t len); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex13/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex13_ass transpose_scalar.s transpose_avx2.s transpose_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex13_ass transpose_scalar.asm transpose_avx2.asm transpose_avx512.asm) 5 | endif() 6 | add_executable(avx512_ex13_tests ex13_test.cpp transpose_scalar.c transpose_avx2.c transpose_avx512.c ${avx512_ex13_ass}) 7 | target_link_libraries(avx512_ex13_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex13_bench ex13_bench.cpp ${avx512_ex13_ass}) 11 | target_link_libraries(avx512_ex13_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex13_test COMMAND avx512_ex13_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex14/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex14_ass register_broadcast.s memory_broadcast.s embedded_broadcast.s) 3 | elseif(MSVC) 4 | set(avx512_ex14_ass register_broadcast.asm memory_broadcast.asm embedded_broadcast.asm) 5 | endif() 6 | add_executable(avx512_ex14_tests ex14_test.cpp register_broadcast.c memory_broadcast.c embedded_broadcast.c
${avx512_ex14_ass}) 7 | target_link_libraries(avx512_ex14_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex14_bench ex14_bench.cpp ${avx512_ex14_ass}) 11 | target_link_libraries(avx512_ex14_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex14_test COMMAND avx512_ex14_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex15/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex15_ass register_broadcast.s memory_broadcast.s) 3 | elseif(MSVC) 4 | set(avx512_ex15_ass register_broadcast.asm memory_broadcast.asm) 5 | endif() 6 | add_executable(avx512_ex15_tests ex15_test.cpp register_broadcast.c memory_broadcast.c ${avx512_ex15_ass}) 7 | target_link_libraries(avx512_ex15_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex15_bench ex15_bench.cpp ${avx512_ex15_ass}) 11 | target_link_libraries(avx512_ex15_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex15_test COMMAND avx512_ex15_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex16/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex16_ass embedded_rounding.s manual_rounding.s) 3 | elseif(MSVC) 4 | set(avx512_ex16_ass embedded_rounding.asm manual_rounding.asm) 5 | endif() 6 | add_executable(avx512_ex16_tests ex16_test.cpp embedded_rounding.c manual_rounding.c ${avx512_ex16_ass}) 7 | 8 | target_link_libraries(avx512_ex16_tests gtest_main optimisation_common) 9 | add_test(NAME 
avx512_ex16_test COMMAND avx512_ex16_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex16/embedded_rounding.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "embedded_rounding.h" 17 | 18 | bool embedded_rounding_check(const float *a, const float *b, float *out) 19 | { 20 | /* 21 | * a, b and out must be non-NULL. 22 | */ 23 | 24 | if (!a || !b || !out) 25 | return false; 26 | 27 | embedded_rounding(a, b, out); 28 | 29 | return true; 30 | } 31 | -------------------------------------------------------------------------------- /chap18/ex16/manual_rounding.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "manual_rounding.h" 17 | 18 | bool manual_rounding_check(const float *a, const float *b, float *out) 19 | { 20 | /* 21 | * a, b and out must be non-NULL. 22 | */ 23 | 24 | if (!a || !b || !out) 25 | return false; 26 | 27 | manual_rounding(a, b, out); 28 | 29 | return true; 30 | } 31 | -------------------------------------------------------------------------------- /chap18/ex17/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex17_ass scalar_scatter.s software_scatter.s hardware_scatter.s) 3 | elseif(MSVC) 4 | set(avx512_ex17_ass scalar_scatter.asm software_scatter.asm hardware_scatter.asm) 5 | endif() 6 | add_executable(avx512_ex17_tests ex17_test.cpp scalar_scatter.c software_scatter.c hardware_scatter.c ${avx512_ex17_ass}) 7 | target_link_libraries(avx512_ex17_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex17_bench ex17_bench.cpp ${avx512_ex17_ass}) 11 | target_link_libraries(avx512_ex17_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex17_test COMMAND avx512_ex17_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex18/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex18_ass 
qword_avx2_ass.s qword_avx512_ass.s) 3 | set_property(SOURCE qword_avx512_intrinsics.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512dq") 4 | elseif(MSVC) 5 | set(avx512_ex18_ass qword_avx2_ass.asm qword_avx512_ass.asm) 6 | endif() 7 | add_executable(avx512_ex18_tests ex18_test.cpp qword_avx2_intrinsics.c qword_avx2.c qword_avx512_intrinsics.c qword_avx512.c ${avx512_ex18_ass}) 8 | target_link_libraries(avx512_ex18_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx512_ex18_bench ex18_bench.cpp qword_avx2_intrinsics.c qword_avx512_intrinsics.c ${avx512_ex18_ass}) 12 | target_link_libraries(avx512_ex18_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME avx512_ex18_test COMMAND avx512_ex18_tests) 16 | -------------------------------------------------------------------------------- /chap18/ex19/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex19_ass scalar_histogram.s avx512_histogram.s) 3 | elseif(MSVC) 4 | set(avx512_ex19_ass scalar_histogram.asm avx512_histogram.asm) 5 | endif() 6 | add_executable(avx512_ex19_tests ex19_test.cpp scalar_histogram.c avx512_histogram.c ${avx512_ex19_ass}) 7 | target_link_libraries(avx512_ex19_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex19_bench ex19_bench.cpp ${avx512_ex19_ass}) 11 | target_link_libraries(avx512_ex19_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex19_test COMMAND avx512_ex19_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR 
CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex2_ass transform_avx.s transform_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex2_ass transform_avx.asm transform_avx512.asm) 5 | endif() 6 | 7 | add_executable(avx512_ex2_tests ex2_test.cpp transform_avx.c transform_avx512.c ${avx512_ex2_ass}) 8 | 9 | target_link_libraries(avx512_ex2_tests gtest_main optimisation_common) 10 | IF( benchmark_FOUND ) 11 | add_executable(avx512_ex2_bench ex2_bench.cpp ${avx512_ex2_ass}) 12 | target_link_libraries(avx512_ex2_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME avx512_ex2_test COMMAND avx512_ex2_tests) 16 | -------------------------------------------------------------------------------- /chap18/ex20/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex20_ass scalar_vector_dp.s avx512_vector_dp.s) 3 | elseif(MSVC) 4 | set(avx512_ex20_ass scalar_vector_dp.asm avx512_vector_dp.asm) 5 | endif() 6 | add_executable(avx512_ex20_tests ex20_test.cpp init_sparse.cpp scalar_vector_dp.c avx512_vector_dp.c ${avx512_ex20_ass}) 7 | target_link_libraries(avx512_ex20_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex20_bench ex20_bench.cpp init_sparse.cpp ${avx512_ex20_ass}) 11 | target_link_libraries(avx512_ex20_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex20_test COMMAND avx512_ex20_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex20/init_sparse.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef INIT_SPARSE_H__ 17 | #define INIT_SPARSE_H__ 18 | 19 | #include <stddef.h> /* size_t */ 20 | #include <stdint.h> /* uint32_t */ 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | void init_sparse(uint32_t *a_index, double *a_values, uint32_t *b_index, 27 | double *b_values, size_t len); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /chap18/ex21/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex21_ass lookup_novbmi.s lookup_vbmi.s) 3 | elseif(MSVC) 4 | set(avx512_ex21_ass lookup_novbmi.asm lookup_vbmi.asm) 5 | endif() 6 | add_executable(avx512_ex21_tests ex21_test.cpp lookup_novbmi.c lookup_vbmi.c ${avx512_ex21_ass}) 7 | target_link_libraries(avx512_ex21_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex21_bench ex21_bench.cpp ${avx512_ex21_ass}) 11 | target_link_libraries(avx512_ex21_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex21_test COMMAND avx512_ex21_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex22/CMakeLists.txt: -------------------------------------------------------------------------------- 1 |
if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex22_ass lookup128_novbmi.s lookup128_vbmi.s) 3 | elseif(MSVC) 4 | set(avx512_ex22_ass lookup128_novbmi.asm lookup128_vbmi.asm) 5 | endif() 6 | add_executable(avx512_ex22_tests ex22_test.cpp lookup128_novbmi.c lookup128_vbmi.c ${avx512_ex22_ass}) 7 | target_link_libraries(avx512_ex22_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex22_bench ex22_bench.cpp ${avx512_ex22_ass}) 11 | target_link_libraries(avx512_ex22_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex22_test COMMAND avx512_ex22_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex23/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex23_ass decompress_novbmi.s decompress_vbmi.s) 3 | elseif(MSVC) 4 | set(avx512_ex23_ass decompress_novbmi.asm decompress_vbmi.asm) 5 | endif() 6 | add_executable(avx512_ex23_tests ex23_test.cpp decompress_novbmi.c decompress_vbmi.c ${avx512_ex23_ass}) 7 | target_link_libraries(avx512_ex23_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex23_bench ex23_bench.cpp ${avx512_ex23_ass}) 11 | target_link_libraries(avx512_ex23_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex23_test COMMAND avx512_ex23_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex24/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | 
set(avx512_ex24_ass only_256bit.s both_256_512bit.s) 3 | elseif(MSVC) 4 | set(avx512_ex24_ass only_256bit.asm both_256_512bit.asm) 5 | endif() 6 | add_executable(avx512_ex24_tests ex24_test.cpp ${avx512_ex24_ass}) 7 | target_link_libraries(avx512_ex24_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex24_bench ex24_bench.cpp ${avx512_ex24_ass}) 11 | target_link_libraries(avx512_ex24_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex24_test COMMAND avx512_ex24_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex24/both_256_512bit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef BOTH_256_512BIT_H__ 17 | #define BOTH_256_512BIT_H__ 18 | 19 | #include <stdint.h> /* uint64_t */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void both_256_512bit(uint64_t count); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap18/ex24/only_256bit.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef ONLY_256BIT_H__ 17 | #define ONLY_256BIT_H__ 18 | 19 | #include <stdint.h> /* uint64_t */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void only_256bit(uint64_t count); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap18/ex25/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(avx512_ex25_srcs ex25_test.cpp fma_unit_count.c) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set(avx512_ex25_srcs ${avx512_ex25_srcs} fma_shuffle_tpt.s fma_only_tpt.s) 4 | elseif(MSVC) 5 | set(avx512_ex25_srcs ${avx512_ex25_srcs} fma_shuffle_tpt.asm fma_only_tpt.asm) 6 | endif() 7 | add_executable(avx512_ex25_tests ${avx512_ex25_srcs}) 8 | 9 | target_link_libraries(avx512_ex25_tests gtest_main optimisation_common) 10 | add_test(NAME avx512_ex25_test COMMAND avx512_ex25_tests) 11 | -------------------------------------------------------------------------------- /chap18/ex25/fma_unit_count.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef FMA_UNIT_COUNT_H__ 17 | #define FMA_UNIT_COUNT_H__ 18 | 19 | #include <stdint.h> /* uint64_t */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | uint64_t fma_unit_count(void); 26 | void fma_shuffle_tpt(uint64_t loop_cnt); 27 | void fma_only_tpt(uint64_t loop_cnt); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /chap18/ex26/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex26_ass g2s_vpgatherdd.s g2s_vpermi2d.s) 3 | elseif(MSVC) 4 | set(avx512_ex26_ass g2s_vpgatherdd.asm g2s_vpermi2d.asm) 5 | endif() 6 | add_executable(avx512_ex26_tests ex26_test.cpp g2s_vpgatherdd.c g2s_vpermi2d.c ${avx512_ex26_ass}) 7 | target_link_libraries(avx512_ex26_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex26_bench ex26_bench.cpp ${avx512_ex26_ass}) 11 | target_link_libraries(avx512_ex26_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex26_test COMMAND avx512_ex26_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex26/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | float real; 22 | float imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap18/ex27/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex27_ass s2s_vscatterdps.s s2s_vpermi2d.s) 3 | elseif(MSVC) 4 | set(avx512_ex27_ass s2s_vscatterdps.asm s2s_vpermi2d.asm) 5 | endif() 6 | add_executable(avx512_ex27_tests ex27_test.cpp s2s_vscatterdps.c s2s_verpmi2d.c ${avx512_ex27_ass}) 7 | target_link_libraries(avx512_ex27_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex27_bench ex27_bench.cpp ${avx512_ex27_ass}) 11 | target_link_libraries(avx512_ex27_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex27_test COMMAND avx512_ex27_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex27/complex_num.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPLEX_NUM_H__ 17 | #define COMPLEX_NUM_H__ 18 | 19 | typedef struct complex_num_ complex_num; 20 | struct complex_num_ { 21 | float real; 22 | float imaginary; 23 | }; 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap18/ex28/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex28_ass adj_vpgatherpd.s adj_load_masked_broadcast.s) 3 | elseif(MSVC) 4 | set(avx512_ex28_ass adj_vpgatherpd.asm adj_load_masked_broadcast.asm) 5 | endif() 6 | add_executable(avx512_ex28_tests ex28_test.cpp adj_vpgatherpd.c adj_load_masked_broadcast.c ${avx512_ex28_ass}) 7 | target_link_libraries(avx512_ex28_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex28_bench ex28_bench.cpp ${avx512_ex28_ass}) 11 | target_link_libraries(avx512_ex28_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex28_test COMMAND avx512_ex28_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex28/elem_struct.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, 
modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef ELEM_STRUCT_H__ 17 | #define ELEM_STRUCT_H__ 18 | 19 | typedef struct { 20 | double var[4]; 21 | } elem_struct_t; 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /chap18/ex29/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex29_ass saxpy_512.s) 3 | elseif(MSVC) 4 | set(avx512_ex29_ass saxpy_512.asm) 5 | endif() 6 | add_executable(avx512_ex29_tests ex29_test.cpp saxpy_512.c ${avx512_ex29_ass}) 7 | target_link_libraries(avx512_ex29_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex29_bench ex29_bench.cpp ${avx512_ex29_ass}) 11 | target_link_libraries(avx512_ex29_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | add_test(NAME avx512_ex29_test COMMAND avx512_ex29_tests) 14 | -------------------------------------------------------------------------------- /chap18/ex3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE 
mul_blend_avx512.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | set(avx512_ex3_srcs mul_blend_avx.c mul_blend_avx512.c) 5 | 6 | add_executable(avx512_ex3_tests ex3_test.cpp ${avx512_ex3_srcs}) 7 | target_link_libraries(avx512_ex3_tests PRIVATE gtest_main optimisation_common) 8 | # The benchmark binary is optional: it is only configured when benchmark_FOUND is set. 9 | if(benchmark_FOUND) 10 | add_executable(avx512_ex3_bench ex3_bench.cpp ${avx512_ex3_srcs}) 11 | target_link_libraries(avx512_ex3_bench PRIVATE benchmark::benchmark optimisation_common) 12 | endif() 13 | add_test(NAME avx512_ex3_test COMMAND avx512_ex3_tests) 14 | -------------------------------------------------------------------------------- /chap18/ex30/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex30_ass single_div_24.s single_div_23.s single_div_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex30_ass single_div_24.asm single_div_23.asm single_div_14.asm) 5 | endif() 6 | add_executable(avx512_ex30_tests ex30_test.cpp single_div_24.c single_div_23.c single_div_14.c ${avx512_ex30_ass}) 7 | # The assembly list above is .s for Clang/GNU/AppleClang and .asm for MSVC. 8 | target_link_libraries(avx512_ex30_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex30_test COMMAND avx512_ex30_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_div_14.h" 17 | 18 | bool single_div_14_check(float *a, float *b, float *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | single_div_14(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SINGLE_DIV_14_H_ 17 | #define SINGLE_DIV_14_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_div_14(float *a, float *b, float *out); 26 | bool single_div_14_check(float *a, float *b, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_23.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_div_23.h" 17 | 18 | bool single_div_23_check(float *a, float *b, float *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats.
23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | single_div_23(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_23.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SINGLE_DIV_23_H_ 17 | #define SINGLE_DIV_23_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_div_23(float *a, float *b, float *out); 26 | bool single_div_23_check(float *a, float *b, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_24.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_div_24.h" 17 | 18 | bool single_div_24_check(float a, float b, float *out) 19 | { 20 | /* 21 | * out must be non-null and must point to a buffer large 22 | * enough to hold 16 floats. 23 | */ 24 | 25 | if (!out) 26 | return false; 27 | 28 | single_div_24(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex30/single_div_24.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SINGLE_DIV_24_H_ 17 | #define SINGLE_DIV_24_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_div_24(float a, float b, float *out); 26 | bool single_div_24_check(float a, float b, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex31/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex31_ass single_rcps_22.s single_rcps_23.s single_rcps_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex31_ass single_rcps_22.asm single_rcps_23.asm single_rcps_14.asm) 5 | endif() 6 | add_executable(avx512_ex31_tests ex31_test.cpp single_rcps_22.c single_rcps_23.c single_rcps_14.c ${avx512_ex31_ass}) 7 | # The assembly list above is .s for Clang/GNU/AppleClang and .asm for MSVC. 8 | target_link_libraries(avx512_ex31_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex31_test COMMAND avx512_ex31_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_14.asm: -------------------------------------------------------------------------------- 1 | ; 2 | ; Copyright (C) 2021 by Intel Corporation 3 | ; 4 | ; Permission to use, copy, modify, and/or distribute this software for any 5 | ; purpose with or without fee is hereby granted. 6 | ; 7 | ; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | ; REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | ; AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | ; INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | ; LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | ; OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | ; PERFORMANCE OF THIS SOFTWARE. 14 | ; 15 | 16 | 17 | ; .globl single_rcps_14 18 | 19 | ; void single_rcps_14(float *a, float *out); 20 | ; On entry: 21 | ; rcx = a 22 | ; rdx = out 23 | 24 | 25 | .code 26 | single_rcps_14 PROC public 27 | 28 | vmovups zmm0, [rcx] 29 | 30 | vrsqrt14ps zmm2, zmm0 31 | 32 | vmovups [rdx], zmm2 33 | 34 | vzeroupper 35 | 36 | ret 37 | single_rcps_14 ENDP 38 | end -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_rcps_14.h" 17 | 18 | bool single_rcps_14_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 
23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_rcps_14(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SINGLE_RCPS_14_H_ 17 | #define SINGLE_RCPS_14_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_rcps_14(float *a, float *out); 26 | bool single_rcps_14_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_22.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_rcps_22.h" 17 | 18 | bool single_rcps_22_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_rcps_22(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_22.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef SINGLE_RCPS_22_H_ 17 | #define SINGLE_RCPS_22_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_rcps_22(float *a, float *out); 26 | bool single_rcps_22_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_23.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_rcps_23.h" 17 | 18 | bool single_rcps_23_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_rcps_23(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex31/single_rcps_23.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SINGLE_RCPS_23_H_ 17 | #define SINGLE_RCPS_23_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_rcps_23(float *a, float *out); 26 | bool single_rcps_23_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex32/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex32_ass single_sqrt_24.s single_sqrt_23.s single_sqrt_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex32_ass single_sqrt_24.asm single_sqrt_23.asm single_sqrt_14.asm) 5 | endif() 6 | add_executable(avx512_ex32_tests ex32_test.cpp single_sqrt_24.c single_sqrt_23.c single_sqrt_14.c ${avx512_ex32_ass}) 7 | # The assembly list above is .s for Clang/GNU/AppleClang and .asm for MSVC. 8 | target_link_libraries(avx512_ex32_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex32_test COMMAND avx512_ex32_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or
without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_sqrt_14.h" 17 | 18 | bool single_sqrt_14_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_sqrt_14(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SINGLE_SQRT_14_H_ 17 | #define SINGLE_SQRT_14_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_sqrt_14(float *a, float *out); 26 | bool single_sqrt_14_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_23.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_sqrt_23.h" 17 | 18 | bool single_sqrt_23_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_sqrt_23(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_23.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SINGLE_SQRT_23_H_ 17 | #define SINGLE_SQRT_23_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_sqrt_23(float *a, float *out); 26 | bool single_sqrt_23_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_24.asm: -------------------------------------------------------------------------------- 1 | ; 2 | ; Copyright (C) 2021 by Intel Corporation 3 | ; 4 | ; Permission to use, copy, modify, and/or distribute this software for any 5 | ; purpose with or without fee is hereby granted. 6 | ; 7 | ; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | ; REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | ; AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | ; INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | ; LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | ; OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | ; PERFORMANCE OF THIS SOFTWARE.
14 | ; 15 | 16 | 17 | ; .globl single_sqrt_24 18 | 19 | ; void single_sqrt_24(float *a, float *out); 20 | ; On entry: 21 | ; rcx = a 22 | ; rdx = out 23 | 24 | .code 25 | single_sqrt_24 PROC public 26 | 27 | vmovups zmm0, [rcx] 28 | 29 | vsqrtps zmm2, zmm0 30 | 31 | vmovups [rdx], zmm2 32 | 33 | vzeroupper 34 | 35 | ret 36 | 37 | single_sqrt_24 ENDP 38 | end -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_24.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "single_sqrt_24.h" 17 | 18 | bool single_sqrt_24_check(float *a, float *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 16 floats.
23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | single_sqrt_24(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex32/single_sqrt_24.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #ifndef SINGLE_SQRT_24_H_ 17 | #define SINGLE_SQRT_24_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void single_sqrt_24(float *a, float *out); 26 | bool single_sqrt_24_check(float *a, float *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex33/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex33_ass double_div_53.s double_div_52.s double_div_26.s double_div_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex33_ass double_div_53.asm double_div_52.asm double_div_26.asm double_div_14.asm) 5 | endif() 6 | add_executable(avx512_ex33_tests ex33_test.cpp double_div_53.c double_div_52.c double_div_26.c double_div_14.c ${avx512_ex33_ass}) 7 | # The assembly list above is .s for Clang/GNU/AppleClang and .asm for MSVC. 8 | target_link_libraries(avx512_ex33_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex33_test COMMAND avx512_ex33_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_div_14.h" 17 | 18 | bool double_div_14_check(double *a, double *b, double *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | double_div_14(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_DIV_14_H_ 17 | #define DOUBLE_DIV_14_H_ 18 | 19 | #include <stdbool.h> /* bool, used by the *_check prototype below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_div_14(double *a, double *b, double *out); 26 | bool double_div_14_check(double *a, double *b, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_26.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_div_26.h" 17 | 18 | bool double_div_26_check(double *a, double *b, double *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles.
23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | double_div_26(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_26.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_DIV_26_H_ 17 | #define DOUBLE_DIV_26_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_div_26(double *a, double *b, double *out); 26 | bool double_div_26_check(double *a, double *b, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_52.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_div_52.h" 17 | 18 | bool double_div_52_check(double *a, double *b, double *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | double_div_52(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_52.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_DIV_52_H_ 17 | #define DOUBLE_DIV_52_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_div_52(double *a, double *b, double *out); 26 | bool double_div_52_check(double *a, double *b, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_53.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_div_53.h" 17 | 18 | bool double_div_53_check(double *a, double *b, double *out) 19 | { 20 | /* 21 | * a, b and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles.
23 | */ 24 | 25 | if (!a || !b || !out) 26 | return false; 27 | 28 | double_div_53(a, b, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex33/double_div_53.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_DIV_53_H_ 17 | #define DOUBLE_DIV_53_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_div_53(double *a, double *b, double *out); 26 | bool double_div_53_check(double *a, double *b, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex34/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex34_ass double_rsqrt_52.s double_rsqrt_51.s double_rsqrt_50.s double_rsqrt_26.s double_rsqrt_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex34_ass double_rsqrt_52.asm double_rsqrt_51.asm double_rsqrt_50.asm double_rsqrt_26.asm double_rsqrt_14.asm) 5 | endif() 6 | add_executable(avx512_ex34_tests ex34_test.cpp double_rsqrt_52.c double_rsqrt_51.c double_rsqrt_50.c double_rsqrt_26.c double_rsqrt_14.c ${avx512_ex34_ass}) 7 | # PRIVATE: an executable has no consumers, so nothing needs to inherit these libraries. 8 | target_link_libraries(avx512_ex34_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex34_test COMMAND avx512_ex34_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_14.asm: -------------------------------------------------------------------------------- 1 | ; 2 | ; Copyright (C) 2021 by Intel Corporation 3 | ; 4 | ; Permission to use, copy, modify, and/or distribute this software for any 5 | ; purpose with or without fee is hereby granted. 6 | ; 7 | ; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | ; REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | ; AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | ; INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | ; LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | ; OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | ; PERFORMANCE OF THIS SOFTWARE. 14 | ; 15 | 16 | 17 | ; .globl double_rsqrt_14 18 | 19 | ; void double_rsqrt_14(double *a, double *out); 20 | ; On entry: 21 | ; rcx = a 22 | ; rdx = out 23 | 24 | 25 | .code 26 | double_rsqrt_14 PROC public 27 | 28 | vmovupd zmm0, [rcx] 29 | 30 | vrsqrt14pd zmm2, zmm0 31 | 32 | vmovupd [rdx], zmm2 33 | 34 | vzeroupper 35 | 36 | ret 37 | double_rsqrt_14 ENDP 38 | end -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_rsqrt_14.h" 17 | 18 | bool double_rsqrt_14_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles. 
23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_rsqrt_14(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_RSQRT_14_H_ 17 | #define DOUBLE_RSQRT_14_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_rsqrt_14(double *a, double *out); 26 | bool double_rsqrt_14_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_26.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_rsqrt_26.h" 17 | 18 | bool double_rsqrt_26_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_rsqrt_26(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_26.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_RSQRT_26_H_ 17 | #define DOUBLE_RSQRT_26_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_rsqrt_26(double *a, double *out); 26 | bool double_rsqrt_26_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_50.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_rsqrt_50.h" 17 | 18 | bool double_rsqrt_50_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_rsqrt_50(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_50.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_RSQRT_50_H_ 17 | #define DOUBLE_RSQRT_50_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_rsqrt_50(double *a, double *out); 26 | bool double_rsqrt_50_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_51.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #include "double_rsqrt_51.h" 17 | 18 | bool double_rsqrt_51_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_rsqrt_51(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_51.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_RSQRT_51_H_ 17 | #define DOUBLE_RSQRT_51_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_rsqrt_51(double *a, double *out); 26 | bool double_rsqrt_51_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_52.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_rsqrt_52.h" 17 | 18 | bool double_rsqrt_52_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_rsqrt_52(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex34/double_rsqrt_52.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_RSQRT_52_H_ 17 | #define DOUBLE_RSQRT_52_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_rsqrt_52(double *a, double *out); 26 | bool double_rsqrt_52_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex35/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex35_ass double_sqrt_52.s double_sqrt_53.s double_sqrt_26.s double_sqrt_14.s) 3 | elseif(MSVC) 4 | set(avx512_ex35_ass double_sqrt_52.asm double_sqrt_53.asm double_sqrt_26.asm double_sqrt_14.asm) 5 | endif() 6 | add_executable(avx512_ex35_tests ex35_test.cpp double_sqrt_52.c double_sqrt_53.c double_sqrt_26.c double_sqrt_14.c ${avx512_ex35_ass}) 7 | # PRIVATE: an executable has no consumers, so nothing needs to inherit these libraries. 8 | target_link_libraries(avx512_ex35_tests PRIVATE gtest_main optimisation_common) 9 | add_test(NAME avx512_ex35_test COMMAND avx512_ex35_tests) 10 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_14.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_sqrt_14.h" 17 | 18 | bool double_sqrt_14_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_sqrt_14(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_14.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_SQRT_14_H_ 17 | #define DOUBLE_SQRT_14_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_sqrt_14(double *a, double *out); 26 | bool double_sqrt_14_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_26.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "double_sqrt_26.h" 17 | 18 | bool double_sqrt_26_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to 22 | * buffers large enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_sqrt_26(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_26.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_SQRT_26_H_ 17 | #define DOUBLE_SQRT_26_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_sqrt_26(double *a, double *out); 26 | bool double_sqrt_26_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_52.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #include "double_sqrt_52.h" 17 | 18 | bool double_sqrt_52_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_sqrt_52(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_52.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOUBLE_SQRT_52_H_ 17 | #define DOUBLE_SQRT_52_H_ 18 | 19 | #include <stdbool.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_sqrt_52(double *a, double *out); 26 | bool double_sqrt_52_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_53.asm: -------------------------------------------------------------------------------- 1 | ; 2 | ; Copyright (C) 2021 by Intel Corporation 3 | ; 4 | ; Permission to use, copy, modify, and/or distribute this software for any 5 | ; purpose with or without fee is hereby granted.
6 | ; 7 | ; THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | ; REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | ; AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | ; INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | ; LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | ; OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | ; PERFORMANCE OF THIS SOFTWARE. 14 | ; 15 | 16 | 17 | ; .globl double_sqrt_53 18 | 19 | ; void double_sqrt_53(double *a, double *out); 20 | ; On entry: 21 | ; rcx = a 22 | ; rdx = out 23 | 24 | 25 | .code 26 | double_sqrt_53 PROC public 27 | 28 | vmovupd zmm0, [rcx] 29 | 30 | vsqrtpd zmm2, zmm0 31 | 32 | vmovupd [rdx], zmm2 33 | 34 | vzeroupper 35 | 36 | ret 37 | double_sqrt_53 ENDP 38 | end -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_53.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #include "double_sqrt_53.h" 17 | 18 | bool double_sqrt_53_check(double *a, double *out) 19 | { 20 | /* 21 | * a and out must be non-null and must point to a buffer large 22 | * enough to hold 8 doubles. 23 | */ 24 | 25 | if (!a || !out) 26 | return false; 27 | 28 | double_sqrt_53(a, out); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap18/ex35/double_sqrt_53.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef DOUBLE_SQRT_53_H_ 17 | #define DOUBLE_SQRT_53_H_ 18 | 19 | #include 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void double_sqrt_53(double *a, double *out); 26 | bool double_sqrt_53_check(double *a, double *out); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap18/ex4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex4_ass mul_blend_avx.s mul_blend_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex4_ass mul_blend_avx.asm mul_blend_avx512.asm) 5 | endif() 6 | 7 | add_executable(avx512_ex4_tests ex4_test.cpp mul_blend_avx.c mul_blend_avx512.c ${avx512_ex4_ass}) 8 | target_link_libraries(avx512_ex4_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx512_ex4_bench ex4_bench.cpp ${avx512_ex4_ass}) 12 | target_link_libraries(avx512_ex4_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME avx512_ex4_test COMMAND avx512_ex4_tests) 16 | -------------------------------------------------------------------------------- /chap18/ex5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex5_ass mul_nomask_avx512.s mul_mask_avx512.s mul_zeromask_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex5_ass mul_nomask_avx512.asm mul_mask_avx512.asm mul_zeromask_avx512.asm) 5 | endif() 6 | 7 | add_executable(avx512_ex5_tests ex5_test.cpp mul_nomask_avx512.c mul_mask_avx512.c mul_zeromask_avx512.c ${avx512_ex5_ass}) 8 | target_link_libraries(avx512_ex5_tests gtest_main optimisation_common) 9 | 10 | IF( 
benchmark_FOUND ) 11 | add_executable(avx512_ex5_bench ex5_bench.cpp ${avx512_ex5_ass}) 12 | target_link_libraries(avx512_ex5_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME avx512_ex5_test COMMAND avx512_ex5_tests) 16 | -------------------------------------------------------------------------------- /chap18/ex6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex6_ass mask_avx512.s blend_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex6_ass mask_avx512.asm blend_avx512.asm) 5 | endif() 6 | 7 | add_executable(avx512_ex6_tests ex6_test.cpp mask_avx512.c blend_avx512.c ${avx512_ex6_ass}) 8 | target_link_libraries(avx512_ex6_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx512_ex6_bench ex6_bench.cpp ${avx512_ex6_ass}) 12 | target_link_libraries(avx512_ex6_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | add_test(NAME avx512_ex6_test COMMAND avx512_ex6_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex7_ass mask_avx512.s blend_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex7_ass mask_avx512.asm blend_avx512.asm) 5 | endif() 6 | 7 | add_executable(avx512_ex7_tests ex7_test.cpp mask_avx512.c blend_avx512.c ${avx512_ex7_ass}) 8 | target_link_libraries(avx512_ex7_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(avx512_ex7_bench ex7_bench.cpp ${avx512_ex7_ass}) 12 | target_link_libraries(avx512_ex7_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | 
add_test(NAME avx512_ex7_test COMMAND avx512_ex7_tests) 16 | -------------------------------------------------------------------------------- /chap18/ex8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex8_ass mce_scalar.s mce_avx2.s mce_avx512.s) 3 | elseif(MSVC) 4 | set(avx512_ex8_ass mce_scalar.asm mce_avx2.asm mce_avx512.asm) 5 | endif() 6 | add_executable(avx512_ex8_tests ex8_test.cpp mce_scalar.c mce_avx2.c mce_avx512.c ${avx512_ex8_ass}) 7 | target_link_libraries(avx512_ex8_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex8_bench ex8_bench.cpp ${avx512_ex8_ass}) 11 | target_link_libraries(avx512_ex8_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex8_test COMMAND avx512_ex8_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex8/mce_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef MCE_AVX2_H__ 17 | #define MCE_AVX2_H__ 18 | 19 | #include 20 | #include 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | void mce_avx2(uint32_t *out, const uint32_t *in, uint64_t width); 26 | bool mce_avx2_check(uint32_t *out, const uint32_t *in, uint64_t width); 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap18/ex9/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(avx512_ex9_ass no_peeling.s peeling.s) 3 | elseif(MSVC) 4 | set(avx512_ex9_ass no_peeling.asm peeling.asm) 5 | endif() 6 | add_executable(avx512_ex9_tests ex9_test.cpp no_peeling.c peeling.c ${avx512_ex9_ass}) 7 | target_link_libraries(avx512_ex9_tests gtest_main optimisation_common) 8 | 9 | IF( benchmark_FOUND ) 10 | add_executable(avx512_ex9_bench ex9_bench.cpp ${avx512_ex9_ass}) 11 | target_link_libraries(avx512_ex9_bench benchmark::benchmark optimisation_common) 12 | ENDIF() 13 | 14 | add_test(NAME avx512_ex9_test COMMAND avx512_ex9_tests) 15 | -------------------------------------------------------------------------------- /chap18/ex9/peeling.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include "peeling.h" 17 | 18 | bool peel_check(float *out, const float *in, uint64_t width, float add_value, 19 | float alfa) 20 | { 21 | /* 22 | * out and in must be non-NULL if width > 0. 23 | */ 24 | 25 | if (width > 0 && (!in || !out)) 26 | return false; 27 | 28 | peel(out, in, width, add_value, alfa); 29 | 30 | return true; 31 | } 32 | -------------------------------------------------------------------------------- /chap19/ex1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(fp16_ex1_srcs real_from_complex_mask.cpp) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set_property(SOURCE real_from_complex_mask.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512dq" "-mavx512bw" "-mavx512vl") 4 | endif() 5 | add_executable(fp16_ex1_tests ex1_test.cpp ${fp16_ex1_srcs}) 6 | target_link_libraries(fp16_ex1_tests gtest_main optimisation_common) 7 | 8 | add_test(NAME fp16_ex1_test COMMAND fp16_ex1_tests) 9 | -------------------------------------------------------------------------------- /chap19/ex1/real_from_complex_mask.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef REAL_FROM_COMPLEX_MASK_H 17 | #define REAL_FROM_COMPLEX_MASK_H 18 | 19 | #include 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | unsigned int get_real_mask_from_complex_mask(unsigned int m); 26 | __mmask8 getRealMaskFromComplexMask(__mmask8 m); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /chap19/ex2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(fp16_ex2_srcs complex_from_real_mask_and.cpp) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set_property(SOURCE complex_from_real_mask_and.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512bw" "-mavx512vl" "-mavx512dq") 4 | endif() 5 | add_executable(fp16_ex2_tests ex2_test.cpp ${fp16_ex2_srcs}) 6 | target_link_libraries(fp16_ex2_tests gtest_main optimisation_common) 7 | 8 | add_test(NAME fp16_ex2_test COMMAND fp16_ex2_tests) 9 | -------------------------------------------------------------------------------- /chap19/ex3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(fp16_ex3_srcs complex_from_real_mask_or.cpp) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | 
set_property(SOURCE complex_from_real_mask_or.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512bw" "-mavx512vl" "-mavx512dq") 4 | endif() 5 | add_executable(fp16_ex3_tests ex3_test.cpp ${fp16_ex3_srcs}) 6 | target_link_libraries(fp16_ex3_tests gtest_main optimisation_common) 7 | 8 | add_test(NAME fp16_ex3_test COMMAND fp16_ex3_tests) 9 | -------------------------------------------------------------------------------- /chap19/ex4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | check_cxx_compiler_flag(-mavx512fp16 COMPILER_SUPPORTS_FP16) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | if(COMPILER_SUPPORTS_FP16) 4 | set(fp16_ex4_srcs compress_ph.cpp) 5 | set_property(SOURCE ex4_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-DCOMPILER_SUPPORTS_FP16") 6 | set_property(SOURCE compress_ph.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512vbmi2" "-mavx512fp16" "-mavx512bw") 7 | endif() 8 | else() 9 | set(fp16_ex4_srcs compress_ph.cpp) 10 | set_property(SOURCE ex4_test.cpp APPEND PROPERTY COMPILE_OPTIONS "/DCOMPILER_SUPPORTS_FP16") 11 | endif() 12 | add_executable(fp16_ex4_tests ex4_test.cpp ${fp16_ex4_srcs}) 13 | target_link_libraries(fp16_ex4_tests gtest_main optimisation_common) 14 | 15 | add_test(NAME fp16_ex4_test COMMAND fp16_ex4_tests) 16 | -------------------------------------------------------------------------------- /chap19/ex4/compress_ph.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef COMPRESS_PH_H 17 | #define COMPRESS_PH_H 18 | 19 | #include 20 | #include 21 | 22 | __m512h compress_ph(__m512h src, __mmask32 mask, __m512h value); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /chap19/ex4/compress_ph_test.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef COMPRESS_PH_TEST_H 17 | #define COMPRESS_PH_TEST_H 18 | 19 | #include 20 | 21 | void test_compress_ph(uint32_t mask, const float *floats, uint16_t *halves, 22 | uint16_t *compressed_halves); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /chap19/ex5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | check_cxx_compiler_flag(-mavx512fp16 COMPILER_SUPPORTS_FP16) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | if(COMPILER_SUPPORTS_FP16) 4 | set(fp16_ex5_srcs fast_special_min.cpp) 5 | set_property(SOURCE ex5_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-DCOMPILER_SUPPORTS_FP16") 6 | set_property(SOURCE fast_special_min.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512vl" "-mavx512fp16") 7 | endif() 8 | else() 9 | set(fp16_ex5_srcs fast_special_min.cpp) 10 | set_property(SOURCE ex5_test.cpp APPEND PROPERTY COMPILE_OPTIONS "/DCOMPILER_SUPPORTS_FP16") 11 | endif() 12 | add_executable(fp16_ex5_tests ex5_test.cpp ${fp16_ex5_srcs}) 13 | target_link_libraries(fp16_ex5_tests gtest_main optimisation_common) 14 | 15 | add_test(NAME fp16_ex5_test COMMAND fp16_ex5_tests) 16 | -------------------------------------------------------------------------------- /chap19/ex5/fast_special_min.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef FAST_SPECIAL_MIN_H 17 | #define FAST_SPECIAL_MIN_H 18 | 19 | #include 20 | #include 21 | 22 | __m128h fast_special_min(__m128h lhs, __m128h rhs); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /chap19/ex5/fast_special_min_test.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef FAST_SPECIAL_MIN_TEST_H 17 | #define FAST_SPECIAL_MIN_TEST_H 18 | 19 | #include 20 | 21 | void test_fast_special_min(const float *floats, uint16_t *halves, 22 | uint16_t *mins); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /chap20/ex14/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex14_srcs ex14_test.cpp) 2 | add_executable(amx_ex14_tests ${amx_ex14_srcs}) 3 | target_link_libraries(amx_ex14_tests gtest_main) 4 | add_test(NAME amx_ex14_test COMMAND amx_ex14_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex14/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex16/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex16_srcs ex16_test.cpp) 2 | add_executable(amx_ex16_tests ${amx_ex16_srcs}) 3 | target_link_libraries(amx_ex16_tests gtest_main) 4 | add_test(NAME amx_ex16_test COMMAND amx_ex16_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex16/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 2 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex17/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES 
GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | check_cxx_compiler_flag("-mamx-int8 -mamx-tile" COMPILER_SUPPORTS_AMX) 3 | if (COMPILER_SUPPORTS_AMX) 4 | set_property(SOURCE ex17_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-DCOMPILER_SUPPORTS_AMX") 5 | set(amx_exx_ass amx_post_conv_gemm_relu_ass.s) 6 | endif() 7 | elseif(MSVC) 8 | set(amx_exx_ass amx_post_conv_gemm_relu_ass.asm) 9 | set_property(SOURCE ex17_test.cpp APPEND PROPERTY COMPILE_OPTIONS "/DCOMPILER_SUPPORTS_AMX") 10 | endif() 11 | 12 | set(amx_ex17_srcs ex17_test.cpp ${amx_exx_ass}) 13 | add_executable(amx_ex17_tests ${amx_ex17_srcs}) 14 | target_link_libraries(amx_ex17_tests gtest_main optimisation_common) 15 | add_test(NAME amx_ex17_test COMMAND amx_ex17_tests) 16 | -------------------------------------------------------------------------------- /chap20/ex17/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex18/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | check_cxx_compiler_flag("-mamx-int8 -mamx-tile" COMPILER_SUPPORTS_AMX) 3 | if (COMPILER_SUPPORTS_AMX) 4 | set_property(SOURCE ex18_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-DCOMPILER_SUPPORTS_AMX") 5 | set(amx_exx_ass amx_interleaved_gemm_relu_ass.s) 6 | endif() 7 | elseif(MSVC) 8 | set_property(SOURCE ex18_test.cpp APPEND PROPERTY COMPILE_OPTIONS "/DCOMPILER_SUPPORTS_AMX") 9 | set(amx_exx_ass amx_interleaved_gemm_relu_ass.asm) 10 | endif() 11 | 12 | set(amx_ex18_srcs ex18_test.cpp ${amx_exx_ass}) 13 | add_executable(amx_ex18_tests ${amx_ex18_srcs}) 14 | 
target_link_libraries(amx_ex18_tests gtest_main optimisation_common) 15 | add_test(NAME amx_ex18_test COMMAND amx_ex18_tests) 16 | -------------------------------------------------------------------------------- /chap20/ex18/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex19/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | check_cxx_compiler_flag(-mavx512bf16 COMPILER_SUPPORTS_AVX512_BF16) 3 | if (COMPILER_SUPPORTS_AVX512_BF16) 4 | set_property(SOURCE ex19_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-DCOMPILER_SUPPORTS_AVX512_BF16") 5 | set_property(SOURCE bf16_conv.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 6 | set_property(SOURCE bf16_conv.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512bf16") 7 | set_property(SOURCE bf16_conv.c APPEND PROPERTY COMPILE_OPTIONS "-flax-vector-conversions") 8 | set(amx_ex19_srcs bf16_conv.c) 9 | endif() 10 | else() 11 | set_property(SOURCE ex19_test.cpp APPEND PROPERTY COMPILE_OPTIONS "/DCOMPILER_SUPPORTS_AVX512_BF16") 12 | set(amx_ex19_srcs bf16_conv.c) 13 | endif() 14 | add_executable(amx_ex19_tests ${amx_ex19_srcs} ex19_test.cpp) 15 | target_link_libraries(amx_ex19_tests gtest_main optimisation_common) 16 | add_test(NAME amx_ex19_test COMMAND amx_ex19_tests) 17 | -------------------------------------------------------------------------------- /chap20/ex20/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID 
MATCHES AppleClang) 2 | set_property(SOURCE int8_conv_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | set_property(SOURCE int8_conv_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512bw") 4 | endif() 5 | set(amx_ex20_srcs ex20_test.cpp int8_conv_test.cpp) 6 | add_executable(amx_ex20_tests ${amx_ex20_srcs}) 7 | target_link_libraries(amx_ex20_tests gtest_main optimisation_common) 8 | 9 | add_test(NAME amx_ex20_test COMMAND amx_ex20_tests) 10 | -------------------------------------------------------------------------------- /chap20/ex20/int8_conv_test.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef INT8_CONV_TEST_H 17 | #define INT8_CONV_TEST_H 18 | 19 | #include 20 | 21 | #include 22 | #include 23 | 24 | void int8_conv_init(uint32_t *dwords, size_t max_elements, __m512i *vecs, 25 | size_t max_vecs); 26 | void int8_conv_pack_dwords_to_bytes(__m512i *vecs, uint8_t *bytes); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /chap20/ex21/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE embedding.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | set(amx_ex21_srcs ex21_test.cpp embedding.c) 5 | add_executable(amx_ex21_tests ${amx_ex21_srcs}) 6 | target_link_libraries(amx_ex21_tests gtest_main optimisation_common) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(amx_ex21_bench ex21_bench.cpp embedding.c) 10 | target_link_libraries(amx_ex21_bench benchmark::benchmark optimisation_common) 11 | ENDIF() 12 | 13 | add_test(NAME amx_ex21_test COMMAND amx_ex21_tests) 14 | -------------------------------------------------------------------------------- /chap20/ex22/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(amx_ex22_ass flat_to_flat_bf16_trans.s) 3 | elseif(MSVC) 4 | set(amx_ex22_ass flat_to_flat_bf16_trans.asm) 5 | endif() 6 | 7 | add_executable(amx_ex22_tests ex22_test.cpp ${amx_ex22_ass}) 8 | 9 | target_link_libraries(amx_ex22_tests gtest_main optimisation_common) 10 | 11 | add_test(NAME amx_ex22_tests COMMAND amx_ex22_tests) 12 | -------------------------------------------------------------------------------- /chap20/ex23/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(amx_ex23_ass vnni_to_vnni_bf16_trans.s) 3 | elseif(MSVC) 4 | set(amx_ex23_ass vnni_to_vnni_bf16_trans.asm) 5 | endif() 6 | 7 | add_executable(amx_ex23_tests ex23_test.cpp ${amx_ex23_ass}) 8 | 9 | target_link_libraries(amx_ex23_tests gtest_main optimisation_common) 10 | 11 | add_test(NAME amx_ex23_tests COMMAND amx_ex23_tests) 12 | -------------------------------------------------------------------------------- /chap20/ex24/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(amx_ex24_ass flat_to_vnni_bf16_trans.s) 3 | elseif(MSVC) 4 | set(amx_ex24_ass flat_to_vnni_bf16_trans.asm) 5 | endif() 6 | 7 | add_executable(amx_ex24_tests ex24_test.cpp ${amx_ex24_ass}) 8 | 9 | target_link_libraries(amx_ex24_tests gtest_main optimisation_common) 10 | 11 | add_test(NAME amx_ex24_tests COMMAND amx_ex24_tests) 12 | -------------------------------------------------------------------------------- /chap20/ex25/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(amx_ex25_ass flat_to_vnni_bf16_relayout.s) 3 | elseif(MSVC) 4 | set(amx_ex25_ass flat_to_vnni_bf16_relayout.asm) 5 | endif() 6 | 7 | add_executable(amx_ex25_tests ex25_test.cpp ${amx_ex25_ass}) 8 | 9 | target_link_libraries(amx_ex25_tests gtest_main optimisation_common) 10 | 11 | add_test(NAME amx_ex25_tests COMMAND amx_ex25_tests) 12 | -------------------------------------------------------------------------------- /chap20/ex27/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE byte_decompression.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512vbmi" "-mavx512vbmi2") 3 | set_property(SOURCE ex27_test.cpp APPEND PROPERTY COMPILE_OPTIONS "-mpopcnt") 4 | endif() 5 | 6 | set(amx_ex27_src byte_decompression.c) 7 | 8 | add_executable(amx_ex27_tests ex27_test.cpp ${amx_ex27_src}) 9 | 10 | target_link_libraries(amx_ex27_tests gtest_main optimisation_common) 11 | 12 | add_test(NAME amx_ex27_tests COMMAND amx_ex27_tests) 13 | -------------------------------------------------------------------------------- /chap20/ex4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex4_srcs ex4_test.cpp) 2 | add_executable(amx_ex4_tests ${amx_ex4_srcs}) 3 | target_link_libraries(amx_ex4_tests gtest_main) 4 | add_test(NAME amx_ex4_test COMMAND amx_ex4_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex4/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex5_srcs ex5_test.cpp) 2 | add_executable(amx_ex5_tests ${amx_ex5_srcs}) 3 | target_link_libraries(amx_ex5_tests gtest_main) 4 | add_test(NAME amx_ex5_test COMMAND amx_ex5_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex5/gemm/.clang-format: 
-------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex6_srcs ex6_test.cpp) 2 | add_executable(amx_ex6_tests ${amx_ex6_srcs}) 3 | target_link_libraries(amx_ex6_tests gtest_main) 4 | add_test(NAME amx_ex6_test COMMAND amx_ex6_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex6/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex7_srcs ex7_test.cpp) 2 | add_executable(amx_ex7_tests ${amx_ex7_srcs}) 3 | target_link_libraries(amx_ex7_tests gtest_main) 4 | add_test(NAME amx_ex7_test COMMAND amx_ex7_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex7/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap20/ex8/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(amx_ex8_srcs ex8_test.cpp) 2 | add_executable(amx_ex8_tests 
${amx_ex8_srcs}) 3 | target_link_libraries(amx_ex8_tests gtest_main) 4 | add_test(NAME amx_ex8_test COMMAND amx_ex8_tests) 5 | -------------------------------------------------------------------------------- /chap20/ex8/gemm/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap5/ex15/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(simd_ex15_srcs ex15_test.cpp) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set(simd_ex15_srcs ${simd_ex15_srcs} supports_avx2.s) 4 | elseif(MSVC) 5 | set(simd_ex15_srcs ${simd_ex15_srcs} supports_avx2.asm) 6 | endif() 7 | add_executable(simd_ex15_test ${simd_ex15_srcs}) 8 | target_link_libraries(simd_ex15_test gtest_main) 9 | add_test(NAME simd_ex15_test COMMAND simd_ex15_test) 10 | -------------------------------------------------------------------------------- /chap5/ex15/supports_avx2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef SUPPORTS_AVX2_H__ 17 | #define SUPPORTS_AVX2_H__ 18 | 19 | #include <stdint.h> 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | /* Implemented in assembly: supports_avx2.s or supports_avx2.asm. */ 25 | int64_t supports_avx2(void); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap7/ex3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(sse_ex3_ass swizzling_sse.s) 3 | elseif(MSVC) 4 | set(sse_ex3_ass swizzling_sse.asm) 5 | endif() 6 | 7 | add_executable(sse_ex3_tests ex3_test.cpp swizzling_sse.c ${sse_ex3_ass}) 8 | target_link_libraries(sse_ex3_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(sse_ex3_bench ex3_bench.cpp ${sse_ex3_ass}) 12 | target_link_libraries(sse_ex3_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME sse_ex3_test COMMAND sse_ex3_tests) 16 | -------------------------------------------------------------------------------- /chap7/ex3/swizzling_sse.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted.
6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include <stdbool.h> 17 | 18 | #include "swizzling_sse.h" 19 | /* Returns false if either pointer is NULL, else runs swizzling_sse(). */ 20 | bool swizzling_sse_check(Vertex_aos *in, Vertex_soa *out) 21 | { 22 | /* in and out must be non-null */ 23 | if (!out || !in) 24 | return false; 25 | 26 | swizzling_sse(in, out); 27 | 28 | return true; 29 | } 30 | -------------------------------------------------------------------------------- /chap7/ex4/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(sse_ex4_ass swizzling_unpck_sse.s) 3 | elseif(MSVC) 4 | set(sse_ex4_ass swizzling_unpck_sse.asm) 5 | endif() 6 | 7 | add_executable(sse_ex4_tests ex4_test.cpp swizzling_unpck_sse.c ${sse_ex4_ass}) 8 | target_link_libraries(sse_ex4_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(sse_ex4_bench ex4_bench.cpp ${sse_ex4_ass}) 12 | target_link_libraries(sse_ex4_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME sse_ex4_test COMMAND sse_ex4_tests) 16 | -------------------------------------------------------------------------------- /chap7/ex4/swizzling_unpck_sse.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software
for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include <stdbool.h> 17 | 18 | #include "swizzling_unpck_sse.h" 19 | /* Returns false if either pointer is NULL, else runs swizzling_unpck_sse(). */ 20 | bool swizzling_unpck_sse_check(Vertex_aos *in, Vertex_soa *out) 21 | { 22 | /* in and out must be non-null */ 23 | if (!out || !in) 24 | return false; 25 | 26 | swizzling_unpck_sse(in, out); 27 | 28 | return true; 29 | } 30 | -------------------------------------------------------------------------------- /chap7/ex5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(sse_ex5_ass deswizzling_sse.s) 3 | elseif(MSVC) 4 | set(sse_ex5_ass deswizzling_sse.asm) 5 | endif() 6 | 7 | add_executable(sse_ex5_tests ex5_test.cpp deswizzling_sse.c ${sse_ex5_ass}) 8 | target_link_libraries(sse_ex5_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(sse_ex5_bench ex5_bench.cpp ${sse_ex5_ass}) 12 | target_link_libraries(sse_ex5_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME sse_ex5_test COMMAND sse_ex5_tests) 16 | -------------------------------------------------------------------------------- /chap7/ex5/deswizzling_sse.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023 by Intel Corporation 3 | * 4 | * 
Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include <stdbool.h> 17 | 18 | #include "deswizzling_sse.h" 19 | /* Returns false if either pointer is NULL, else runs deswizzling_sse(). */ 20 | bool deswizzling_sse_check(Vertex_soa *in, Vertex_aos *out) 21 | { 22 | /* in and out must be non-null */ 23 | if (!out || !in) 24 | return false; 25 | 26 | deswizzling_sse(in, out); 27 | 28 | return true; 29 | } 30 | -------------------------------------------------------------------------------- /chap7/ex6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set(sse_ex6_ass deswizzling_rgb_sse.s) 3 | elseif(MSVC) 4 | set(sse_ex6_ass deswizzling_rgb_sse.asm) 5 | endif() 6 | 7 | add_executable(sse_ex6_tests ex6_test.cpp deswizzling_rgb_sse.c ${sse_ex6_ass}) 8 | target_link_libraries(sse_ex6_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(sse_ex6_bench ex6_bench.cpp ${sse_ex6_ass}) 12 | target_link_libraries(sse_ex6_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME sse_ex6_test COMMAND sse_ex6_tests) 16 | -------------------------------------------------------------------------------- /chap7/ex6/deswizzling_rgb_sse.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2023 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #include <stdbool.h> 17 | 18 | #include "deswizzling_rgb_sse.h" 19 | /* Returns false if either pointer is NULL, else runs deswizzling_rgb_sse(). */ 20 | bool deswizzling_rgb_sse_check(Vertex_soa *in, Vertex_aos *out) 21 | { 22 | /* in and out must be non-null */ 23 | if (!out || !in) 24 | return false; 25 | 26 | deswizzling_rgb_sse(in, out); 27 | 28 | return true; 29 | } 30 | -------------------------------------------------------------------------------- /chap8/ex1/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(vnni_ex1_srcs ) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set(vnni_ex1_ass dotprod_vnni.s dotprod_novnni.s) 4 | elseif(MSVC) 5 | set(vnni_ex1_ass dotprod_vnni.asm dotprod_novnni.asm) 6 | endif() 7 | add_executable(vnni_ex1_tests ex1_test.cpp ${vnni_ex1_ass}) 8 | target_link_libraries(vnni_ex1_tests gtest_main optimisation_common) 9 | 10 | IF( benchmark_FOUND ) 11 | add_executable(vnni_ex1_bench ex1_bench.cpp ${vnni_ex1_ass}) 12 | target_link_libraries(vnni_ex1_bench benchmark::benchmark optimisation_common) 13 | ENDIF() 14 | 15 | add_test(NAME vnni_ex1_test COMMAND vnni_ex1_tests) 16 | 
-------------------------------------------------------------------------------- /chap8/ex1/dotprod_novnni.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOTPROD_NOVNNI_H__ 17 | #define DOTPROD_NOVNNI_H__ 18 | 19 | #include <stdint.h> /* uint8_t, int8_t, int32_t used below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void dotprod_novnni_4x64x64(uint8_t *lhs, int8_t *rhs, int32_t *out); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap8/ex1/dotprod_vnni.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef DOTPROD_VNNI_H__ 17 | #define DOTPROD_VNNI_H__ 18 | 19 | #include <stdint.h> /* uint8_t, int8_t, int32_t used below */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void dotprod_vnni_4x64x64(uint8_t *lhs, int8_t *rhs, int32_t *out); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /chap8/ex10/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(vnni_ex10_tests ex10_test.cpp pixel_shuffler_offset.c) 2 | target_link_libraries(vnni_ex10_tests gtest_main) 3 | add_test(NAME vnni_ex10_test COMMAND vnni_ex10_tests) 4 | -------------------------------------------------------------------------------- /chap8/ex10/ex10_test.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE.
14 | */ 15 | 16 | #include "gtest/gtest.h" 17 | 18 | #include "pixel_shuffler_offset.h" 19 | 20 | TEST(vnni, pixel_shuffler_offset_test) 21 | { 22 | size_t res = pixel_shuffler_offset(0, 16, 5, 5, 8, 8); 23 | ASSERT_EQ(res, 1536); 24 | } 25 | -------------------------------------------------------------------------------- /chap8/ex11/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE sigmoid_approx_avx512.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512dq") 3 | endif() 4 | add_executable(vnni_ex11_tests ex11_test.cpp sigmoid_approx_avx512.cpp) 5 | target_link_libraries(vnni_ex11_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex11_test COMMAND vnni_ex11_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex11/sigmoid_approx_avx512.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #include "sigmoid_approx_avx512.h" 17 | #include "sigmoid_approx.hpp" 18 | 19 | void sigmoid_poly_2_avx512(float *input, float *output) 20 | { 21 | const __m512 ireg = _mm512_load_ps(input); 22 | __m512 oreg; 23 | 24 | sigmoid_poly_2(ireg, oreg); 25 | _mm512_store_ps(output, oreg); 26 | } 27 | -------------------------------------------------------------------------------- /chap8/ex11/sigmoid_approx_avx512.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SIGMOID_APPROX_AVX512_H__ 17 | #define SIGMOID_APPROX_AVX512_H__ 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void sigmoid_poly_2_avx512(float *input, float *output); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /chap8/ex12/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE sigmoid_scalef_avx512.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512dq") 3 | endif() 4 | add_executable(vnni_ex12_tests ex12_test.cpp sigmoid_scalef_avx512.cpp) 5 | target_link_libraries(vnni_ex12_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex12_test COMMAND vnni_ex12_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex12/sigmoid_scalef_avx512.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #include "sigmoid_scalef_avx512.h" 17 | #include "sigmoid_scalef.hpp" 18 | 19 | void sigmoid_scalef_avx512(float *input, float *output) 20 | { 21 | __m512 in; 22 | __m512 out; 23 | 24 | in = _mm512_load_ps(input); 25 | sigmoid_scalef(in, out); 26 | _mm512_store_ps(output, out); 27 | } 28 | -------------------------------------------------------------------------------- /chap8/ex12/sigmoid_scalef_avx512.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef SIGMOID_SCALEF_AVX512_H__ 17 | #define SIGMOID_SCALEF_AVX512_H__ 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | void sigmoid_scalef_avx512(float *input, float *output); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /chap8/ex2/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE quantization_avx512.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | add_executable(vnni_ex2_tests ex2_test.cpp quantization_avx512.cpp quantization_scalar.cpp) 5 | target_link_libraries(vnni_ex2_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex2_test COMMAND vnni_ex2_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex2/quant_types.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef QUANT_TYPES_H__ 17 | #define QUANT_TYPES_H__ 18 | 19 | #include <cstdint> 20 | 21 | typedef std::uint8_t u8; 22 | typedef float Dtype; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /chap8/ex2/quantization_avx512.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef QUANTIZATION_AVX512_H__ 17 | #define QUANTIZATION_AVX512_H__ 18 | 19 | #include "quant_types.hpp" 20 | 21 | void quantize_activations_avx512(const float *data, u8 *quantized_data, 22 | int count, Dtype factor, int bits, 23 | int offset = 0); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap8/ex2/quantization_scalar.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef QUANTIZATION_SCALAR_H__ 17 | #define QUANTIZATION_SCALAR_H__ 18 | 19 | #include "quant_types.hpp" 20 | 21 | void quantize_activations_scalar(const float *data, u8 *quantized_data, 22 | int count, Dtype factor, int bits, 23 | int offset = 0); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /chap8/ex3/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | IndentWidth: 4 3 | UseTab: Never 4 | BreakBeforeBraces: Linux 5 | AllowShortIfStatementsOnASingleLine: false 6 | IndentCaseLabels: false -------------------------------------------------------------------------------- /chap8/ex3/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(vnni_ex3_srcs direct_conv.c) 2 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 3 | set_property(SOURCE direct_conv.c APPEND PROPERTY COMPILE_OPTIONS "-Wall" "-mavx512f" "-mavx512vnni") 4 | endif() 5 | add_executable(vnni_ex3_tests ex3_test.cpp ${vnni_ex3_srcs}) 6 | target_link_libraries(vnni_ex3_tests gtest_main optimisation_common) 7 | 8 | IF( benchmark_FOUND ) 9 | add_executable(vnni_ex3_bench ex3_bench.cpp ${vnni_ex3_srcs}) 10 | target_link_libraries(vnni_ex3_bench benchmark::benchmark optimisation_common) 11 | ENDIF() 12 | 13 | add_test(NAME vnni_ex3_test COMMAND vnni_ex3_tests) 14 | -------------------------------------------------------------------------------- /chap8/ex4/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE low_ofm_conv.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512vnni") 3 | endif() 4 | add_executable(vnni_ex4_tests ex4_test.cpp low_ofm_conv.cpp) 5 | target_link_libraries(vnni_ex4_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex4_test COMMAND vnni_ex4_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex4/low_ofm_conv.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef LOW_OFM_CONV_HPP_ 17 | #define LOW_OFM_CONV_HPP_ 18 | 19 | #include 20 | #include 21 | 22 | #define NUM_OFMS 3 23 | 24 | void low_ofm_conv(int IFM_W, int IFM_H, int IFMBlock, int NUM_IFMS, 25 | float *dqfs, const uint8_t *input, float *output, 26 | int8_t *weights_reorged); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /chap8/ex5/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE post_conv.cpp APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | add_executable(vnni_ex5_tests ex5_test.cpp post_conv.cpp) 5 | target_link_libraries(vnni_ex5_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex5_test COMMAND vnni_ex5_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex6/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE eltwise.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | add_executable(vnni_ex6_tests ex6_test.cpp eltwise.c) 5 | target_link_libraries(vnni_ex6_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex6_test COMMAND vnni_ex6_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex7/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) 2 | set_property(SOURCE pooling.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") 3 | endif() 4 | add_executable(vnni_ex7_tests ex7_test.cpp pooling.c) 5 | 
target_link_libraries(vnni_ex7_tests gtest_main optimisation_common) 6 | add_test(NAME vnni_ex7_test COMMAND vnni_ex7_tests) 7 | -------------------------------------------------------------------------------- /chap8/ex7/pooling.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 
14 | */ 15 | 16 | #ifndef POOLING_H__ 17 | #define POOLING_H__ 18 | 19 | #include <immintrin.h> /* __m512; kept outside the extern "C" region */ 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | void pooling(__m512 resf, void *outputFeatureMaps, int BlockOffsetOFM, 26 | int OFMItr); 27 | void test_pooling(float *outputFeatureMaps, float *expected); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /chap8/ex9/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(vnni_ex9_tests ex9_test.cpp pixel_shuffler.cpp) 2 | target_link_libraries(vnni_ex9_tests gtest_main) 3 | add_test(NAME vnni_ex9_test COMMAND vnni_ex9_tests) 4 | -------------------------------------------------------------------------------- /chap8/ex9/pixel_shuffler.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 by Intel Corporation 3 | * 4 | * Permission to use, copy, modify, and/or distribute this software for any 5 | * purpose with or without fee is hereby granted. 6 | * 7 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 12 | * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | * PERFORMANCE OF THIS SOFTWARE. 14 | */ 15 | 16 | #ifndef PIXEL_SHUFFLER_H__ 17 | #define PIXEL_SHUFFLER_H__ 18 | 19 | #include <cstdint> 20 | #include <vector> 21 | 22 | using std::vector; 23 | 24 | typedef int8_t pstype; // works on any data type.
25 | // NOTE(review): vector<int> element type assumed for the shapes - confirm. 26 | void pixel_shuffler(const vector<int> &bottom_shape, 27 | const vector<int> &top_shape, const pstype *bottom_data, 28 | pstype *top_data); 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /check-format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Fails (set -e) on the first file whose clang-format output differs. 3 | set -e 4 | clang-format --version 5 | for i in $(find . -name '*.cpp' -o -name '*.c' -o -name '*.h' | grep -v build); do 6 | echo "Checking format of $i" 7 | clang-format -style=file "$i" | diff "$i" - 8 | done 9 | -------------------------------------------------------------------------------- /common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(common_srcs optimisation_common.c) 2 | # Add the CPU-feature probes in the assembler dialect of the toolchain. 3 | if(CMAKE_CXX_COMPILER_ID MATCHES "^AppleClang$") 4 | list(APPEND common_srcs ../chap5/ex15/supports_avx2.s supports_avx512_macos.s supports_avx512_bf16.s supports_amx_macos.s) 5 | elseif(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU) 6 | list(APPEND common_srcs ../chap5/ex15/supports_avx2.s supports_avx512.s supports_avx512_bf16.s supports_amx.s) 7 | elseif(MSVC) 8 | list(APPEND common_srcs ../chap5/ex15/supports_avx2.asm supports_avx512.asm supports_avx512_bf16.asm supports_amx.asm) 9 | endif() 10 | 11 | add_library(optimisation_common STATIC ${common_srcs}) 12 | --------------------------------------------------------------------------------