├── .lastrelease ├── .gitignore ├── docs ├── volk_logo.png ├── volk_logo_small.png ├── CMakeLists.txt ├── main_page.dox ├── using_volk.dox ├── DCO.txt └── CONTRIBUTING.md ├── .gitmodules ├── kernels ├── volk │ ├── asm │ │ ├── orc │ │ │ ├── volk_16u_byteswap_a_orc_impl.orc │ │ │ ├── volk_32f_x2_add_32f_a_orc_impl.orc │ │ │ ├── volk_32f_x2_max_32f_a_orc_impl.orc │ │ │ ├── volk_32f_x2_min_32f_a_orc_impl.orc │ │ │ ├── volk_32f_s32f_add_32f_a_orc_impl.orc │ │ │ ├── volk_32f_x2_divide_32f_a_orc_impl.orc │ │ │ ├── volk_32i_x2_and_32i_a_orc_impl.orc │ │ │ ├── volk_32i_x2_or_32i_a_orc_impl.orc │ │ │ ├── volk_32f_x2_multiply_32f_a_orc_impl.orc │ │ │ ├── volk_32f_x2_subtract_32f_a_orc_impl.orc │ │ │ ├── volk_8i_convert_16i_a_orc_impl.orc │ │ │ ├── volk_32f_s32f_multiply_32f_a_orc_impl.orc │ │ │ ├── volk_32f_s32f_normalize_a_orc_impl.orc │ │ │ ├── volk_16ic_deinterleave_16i_x2_a_orc_impl.orc │ │ │ ├── volk_16ic_deinterleave_real_8i_a_orc_impl.orc │ │ │ ├── volk_32fc_32f_multiply_32fc_a_orc_impl.orc │ │ │ ├── volk_8i_s32f_convert_32f_a_orc_impl.orc │ │ │ ├── volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl.orc │ │ │ └── volk_32fc_x2_multiply_32fc_a_orc_impl.orc │ │ ├── neon │ │ │ ├── volk_32f_s32f_multiply_32f_a_neonasm.s │ │ │ ├── volk_32f_x2_add_32f_a_neonasm.s │ │ │ ├── volk_32fc_x2_multiply_32fc_a_neonasm.s │ │ │ ├── volk_32f_x2_dot_prod_32f_a_neonasm.s │ │ │ ├── volk_16i_max_star_horizontal_16i.s │ │ │ ├── volk_32f_x2_add_32f_a_neonpipeline.s │ │ │ ├── volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s │ │ │ ├── volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s │ │ │ ├── volk_32fc_32f_dot_prod_32fc_a_neonasm.s │ │ │ ├── volk_32fc_32f_dot_prod_32fc_a_neonpipeline.s │ │ │ ├── volk_32fc_x2_dot_prod_32fc_a_neonasm.s │ │ │ └── volk_32f_x2_dot_prod_32f_a_neonasm_opts.s │ │ └── riscv │ │ │ ├── volk_32f_s32f_multiply_32f_sifive_u74.s │ │ │ └── volk_32fc_x2_dot_prod_32fc_sifive_u74.s │ ├── volk_32f_null_32f.h │ ├── volk_32fc_s32f_power_spectral_densitypuppet_32f.h │ ├── 
volk_32fc_s32f_x2_power_spectral_density_32f.h │ ├── volk_32f_s32f_power_32f.h │ ├── volk_32fc_s32f_power_32fc.h │ ├── volk_32u_popcntpuppet_32u.h │ ├── volk_64u_popcntpuppet_64u.h │ ├── volk_32u_popcnt.h │ ├── volk_16i_max_star_16i.h │ └── volk_32f_s32f_clamppuppet_32f.h └── README.md ├── cmake ├── Checks │ └── check-rvv-intrinsics.c ├── Toolchains │ ├── intel-sde.cmake │ ├── arm_cortex_a76_hardfp_native.cmake │ ├── arm_cortex_a8_softfp_native.cmake │ ├── arm_cortex_a15_hardfp_native.cmake │ ├── arm_cortex_a8_hardfp_native.cmake │ ├── arm_cortex_a9_hardfp_native.cmake │ ├── arm_cortex_a72_hardfp_native.cmake │ ├── arm_cortex_a53_hardfp_native.cmake │ ├── rv64gcv-linux-gnu.cmake │ ├── oe-sdk_cross.cmake │ ├── aarch64-linux-gnu.cmake │ └── arm-linux-gnueabihf.cmake ├── Modules │ ├── VolkConfigVersion.cmake.in │ ├── FindORC.cmake │ ├── VolkConfig.cmake.in │ └── VolkVersion.cmake ├── cmake_uninstall.cmake.in └── msvc │ ├── sys │ └── time.h │ └── config.h ├── python └── volk_modtool │ ├── __init__.py │ ├── CMakeLists.txt │ └── cfg.py ├── tmpl ├── volk.pc.in ├── volk_config_fixed.tmpl.h ├── volk_typedefs.tmpl.h ├── volk_machines.tmpl.c ├── volk_cpu.tmpl.h ├── volk_machines.tmpl.h ├── volk_machine_xxx.tmpl.c ├── volk_cpu.tmpl.c └── volk.tmpl.h ├── scripts ├── tools │ ├── update_citations.sh │ └── compare_volk_profiles ├── ci │ └── download_intel_sde.sh └── licensing │ └── count_contrib.sh ├── include └── volk │ ├── constants.h │ ├── saturation_arithmetic.h │ ├── volk_version.h.in │ ├── volk_prefs.h │ ├── volk_alloc.hh │ ├── volk_malloc.h │ ├── volk_avx2_fma_intrinsics.h │ ├── volk_sse3_intrinsics.h │ ├── volk_complex.h │ └── volk_rvv_intrinsics.h ├── .github └── workflows │ ├── check-pr-formatting.yml │ ├── publish_docs.yml │ ├── run-tests-rvv.yml │ └── android_build.yml ├── lib ├── constants.c.in ├── volk_rank_archs.h ├── volk_malloc.c ├── volk_prefs.c └── volk_rank_archs.c ├── apps ├── volk_profile.h ├── plot_best_vs_generic.py ├── volk_option_helpers.h ├── 
volk-config-info.cc └── CMakeLists.txt ├── .gitlab-ci.yml ├── tests ├── CMakeLists.txt ├── test_volk_32fc_x2_multiply_32fc.cc └── volk_test.cc ├── gen ├── volk_tmpl_utils.py ├── volk_compile_utils.py ├── volk_machine_defs.py ├── machines.xml └── volk_arch_defs.py ├── appveyor.yml ├── .travis.yml ├── .clang-format ├── .cmake-format.py └── .mailmap /.lastrelease: -------------------------------------------------------------------------------- 1 | v3.2.0 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | *.pyo 4 | *build*/ 5 | archives/ 6 | -------------------------------------------------------------------------------- /docs/volk_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gnuradio/volk/HEAD/docs/volk_logo.png -------------------------------------------------------------------------------- /docs/volk_logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gnuradio/volk/HEAD/docs/volk_logo_small.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "cpu_features"] 2 | path = cpu_features 3 | url = https://github.com/google/cpu_features.git 4 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_16u_byteswap_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_16u_byteswap_a_orc_impl 2 | .dest 2 dst uint16_t 3 | swapw dst, dst 4 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_add_32f_a_orc_impl.orc: 
-------------------------------------------------------------------------------- 1 | .function volk_32f_x2_add_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | addf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_max_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_x2_max_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | maxf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_min_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_x2_min_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | minf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_s32f_add_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_s32f_add_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .floatparam 4 scalar 5 | addf dst, src1, scalar 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_divide_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_x2_divide_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | divf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32i_x2_and_32i_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32i_x2_and_32i_a_orc_impl 2 | .dest 4 dst int32_t 3 | 
.source 4 src1 int32_t 4 | .source 4 src2 int32_t 5 | andl dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32i_x2_or_32i_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32i_x2_or_32i_a_orc_impl 2 | .dest 4 dst int32_t 3 | .source 4 src1 int32_t 4 | .source 4 src2 int32_t 5 | orl dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_multiply_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_x2_multiply_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | mulf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_x2_subtract_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_x2_subtract_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .source 4 src2 float 5 | subf dst, src1, src2 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_8i_convert_16i_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_8i_convert_16i_a_orc_impl 2 | .source 1 src int8_t 3 | .dest 2 dst int16_t 4 | .temp 2 tmp 5 | convsbw tmp, src 6 | shlw dst, tmp, 8 7 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_s32f_multiply_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_s32f_multiply_32f_a_orc_impl 2 | .dest 4 dst float 3 | .source 4 src1 float 4 | .floatparam 4 scalar 5 | mulf dst, src1, scalar 6 | 
-------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32f_s32f_normalize_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32f_s32f_normalize_a_orc_impl 2 | .source 4 src1 float 3 | .floatparam 4 invscalar 4 | .dest 4 dst float 5 | mulf dst, src1, invscalar 6 | -------------------------------------------------------------------------------- /cmake/Checks/check-rvv-intrinsics.c: -------------------------------------------------------------------------------- 1 | #if (__riscv_v_intrinsic >= 1000000 || __clang_major__ >= 18 || __GNUC__ >= 14) 2 | int main() { return 0; } 3 | #else 4 | #error "rvv intrinsics aren't supported" 5 | #endif 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_16ic_deinterleave_16i_x2_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_16ic_deinterleave_16i_x2_a_orc_impl 2 | .dest 2 idst int16_t 3 | .dest 2 qdst int16_t 4 | .source 4 src lv_16sc_t 5 | splitlw qdst, idst, src 6 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_16ic_deinterleave_real_8i_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_16ic_deinterleave_real_8i_a_orc_impl 2 | .dest 1 dst int8_t 3 | .source 4 src lv_16sc_t 4 | .temp 2 iw 5 | select0lw iw, src 6 | convhwb dst, iw 7 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32fc_32f_multiply_32fc_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32fc_32f_multiply_32fc_a_orc_impl 2 | .source 8 src1 lv_32fc_t 3 | .source 4 src2 float 4 | .dest 8 dst lv_32fc_t 5 | .temp 8 tmp 6 | mergelq tmp, src2, src2 7 | x2 mulf dst, src1, tmp 8 | 
-------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_8i_s32f_convert_32f_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_8i_s32f_convert_32f_a_orc_impl 2 | .source 1 src uint8_t 3 | .dest 4 dst float 4 | .floatparam 4 scalar 5 | .temp 4 flsrc 6 | .temp 4 lsrc 7 | .temp 2 ssrc 8 | convsbw ssrc, src 9 | convswl lsrc, ssrc 10 | convlf flsrc, lsrc 11 | mulf dst, flsrc, scalar 12 | -------------------------------------------------------------------------------- /python/volk_modtool/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright 2013, 2014 Free Software Foundation, Inc. 5 | # 6 | # This file is part of VOLK 7 | # 8 | # SPDX-License-Identifier: LGPL-3.0-or-later 9 | # 10 | 11 | from .cfg import volk_modtool_config 12 | from .volk_modtool_generate import volk_modtool 13 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_16ic_s32f_deinterleave_32f_x2_a_orc_impl 2 | .dest 4 idst float 3 | .dest 4 qdst float 4 | .source 4 src lv_16sc_t 5 | .floatparam 4 scalar 6 | .temp 8 iql 7 | .temp 8 iqf 8 | 9 | x2 convswl iql, src 10 | x2 convlf iqf, iql 11 | x2 divf iqf, iqf, scalar 12 | splitql qdst, idst, iqf 13 | -------------------------------------------------------------------------------- /tmpl/volk.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@CMAKE_INSTALL_PREFIX@ 2 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 3 | libdir=@CMAKE_INSTALL_FULL_LIBDIR@ 4 | includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ 5 | LV_CXXFLAGS=@LV_CXXFLAGS@ 6 | 7 | 8 | Name: volk 9 | Description: VOLK: Vector Optimized 
Library of Kernels 10 | Requires: 11 | Version: @SOVERSION@ 12 | Libs: -L${libdir} -lvolk 13 | Cflags: -I${includedir} ${LV_CXXFLAGS} 14 | -------------------------------------------------------------------------------- /cmake/Toolchains/intel-sde.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=knl") 10 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=knl") 11 | set(CMAKE_CROSSCOMPILING_EMULATOR 12 | "$ENV{TRAVIS_BUILD_DIR}/cache/$ENV{SDE_VERSION}/sde64 -knl --") 13 | -------------------------------------------------------------------------------- /tmpl/volk_config_fixed.tmpl.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_CONFIG_FIXED_H 11 | #define INCLUDED_VOLK_CONFIG_FIXED_H 12 | 13 | %for i, arch in enumerate(archs): 14 | #define LV_${arch.name.upper()} ${i} 15 | %endfor 16 | 17 | #endif /*INCLUDED_VOLK_CONFIG_FIXED*/ 18 | -------------------------------------------------------------------------------- /tmpl/volk_typedefs.tmpl.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_TYPEDEFS 11 | #define INCLUDED_VOLK_TYPEDEFS 12 | 13 | #include 14 | #include 15 | 16 | %for kern in kernels: 17 | typedef void (*${kern.pname})(${kern.arglist_types}); 18 | %endfor 19 | 20 | #endif /*INCLUDED_VOLK_TYPEDEFS*/ 21 | -------------------------------------------------------------------------------- /kernels/volk/asm/orc/volk_32fc_x2_multiply_32fc_a_orc_impl.orc: -------------------------------------------------------------------------------- 1 | .function volk_32fc_x2_multiply_32fc_a_orc_impl 2 | .source 8 src1 lv_32fc_t 3 | .source 8 src2 lv_32fc_t 4 | .dest 8 dst lv_32fc_t 5 | .temp 8 iqprod 6 | .temp 4 real 7 | .temp 4 imag 8 | .temp 4 ac 9 | .temp 4 bd 10 | .temp 8 swapped 11 | x2 mulf iqprod, src1, src2 12 | splitql bd, ac, iqprod 13 | subf real, ac, bd 14 | swaplq swapped, src1 15 | x2 mulf iqprod, swapped, src2 16 | splitql bd, ac, iqprod 17 | addf imag, ac, bd 18 | mergelq dst, real, imag 19 | -------------------------------------------------------------------------------- /scripts/tools/update_citations.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2022 Johannes Demel 4 | # 5 | # This script is part of VOLK. 6 | # 7 | # SPDX-License-Identifier: LGPL-3.0-or-later 8 | # 9 | # Find all contributors according to git and update `.zenodo.json` accordingly. 10 | 11 | script_name=$0 12 | script_full_path=$(dirname "$0") 13 | python_script=$"$script_full_path/run_citations_update.py" 14 | 15 | contributors_list="$(git log --pretty="%an <%ae>" | sort | uniq)" 16 | 17 | # Run a Python script to make things easier. 
18 | python3 $python_script "$contributors_list" 19 | -------------------------------------------------------------------------------- /tmpl/volk_machines.tmpl.c: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include 11 | #include 12 | #include "volk_machines.h" 13 | 14 | struct volk_machine *volk_machines[] = { 15 | %for machine in machines: 16 | #ifdef LV_MACHINE_${machine.name.upper()} 17 | &volk_machine_${machine.name}, 18 | #endif 19 | %endfor 20 | }; 21 | 22 | unsigned int n_volk_machines = sizeof(volk_machines)/sizeof(*volk_machines); 23 | -------------------------------------------------------------------------------- /tmpl/volk_cpu.tmpl.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_CPU_H 11 | #define INCLUDED_VOLK_CPU_H 12 | 13 | #include 14 | 15 | __VOLK_DECL_BEGIN 16 | 17 | struct VOLK_CPU { 18 | %for arch in archs: 19 | int (*has_${arch.name}) (); 20 | %endfor 21 | }; 22 | 23 | extern struct VOLK_CPU volk_cpu; 24 | 25 | void volk_cpu_init (); 26 | unsigned int volk_get_lvarch (); 27 | 28 | __VOLK_DECL_END 29 | 30 | #endif /*INCLUDED_VOLK_CPU_H*/ 31 | -------------------------------------------------------------------------------- /docs/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Copyright 2022 Johannes Demel. 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | find_package(Doxygen) 9 | if(DOXYGEN_FOUND) 10 | 11 | message(STATUS "Doxygen found. 
Building docs ...") 12 | 13 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in ${CMAKE_BINARY_DIR}/Doxyfile 14 | @ONLY) 15 | 16 | add_custom_target( 17 | volk_doc 18 | ${DOXYGEN_EXECUTABLE} ${CMAKE_BINARY_DIR}/Doxyfile 19 | WORKING_DIRECTORY ${CMAKE_BINARY_DIR} 20 | COMMENT "Generating documentation with Doxygen" 21 | VERBATIM) 22 | 23 | endif(DOXYGEN_FOUND) 24 | -------------------------------------------------------------------------------- /include/volk/constants.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2006,2009,2013 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_CONSTANTS_H 11 | #define INCLUDED_VOLK_CONSTANTS_H 12 | 13 | #include 14 | 15 | __VOLK_DECL_BEGIN 16 | 17 | VOLK_API const char* volk_prefix(); 18 | VOLK_API const char* volk_version(); 19 | VOLK_API const char* volk_c_compiler(); 20 | VOLK_API const char* volk_compiler_flags(); 21 | VOLK_API const char* volk_available_machines(); 22 | 23 | __VOLK_DECL_END 24 | 25 | #endif /* INCLUDED_VOLK_CONSTANTS_H */ 26 | -------------------------------------------------------------------------------- /python/volk_modtool/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2013, 2014 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Install python files and apps 11 | ######################################################################## 12 | include(VolkPython) 13 | 14 | volk_python_install( 15 | FILES 16 | __init__.py 17 | cfg.py 18 | volk_modtool_generate.py 19 | DESTINATION 20 | ${VOLK_PYTHON_DIR}/volk_modtool 21 | COMPONENT 22 | "volk") 23 | 24 | volk_python_install(PROGRAMS volk_modtool DESTINATION ${VOLK_RUNTIME_DIR} COMPONENT 25 | "volk") 26 | -------------------------------------------------------------------------------- /.github/workflows/check-pr-formatting.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020, 2022 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | name: Check PR Formatting 10 | 11 | on: 12 | push: 13 | paths-ignore: 14 | - 'tmpl/' 15 | - 'include/volk/sse2neon.h' 16 | pull_request: 17 | paths-ignore: 18 | - 'tmpl/' 19 | - 'include/volk/sse2neon.h' 20 | 21 | jobs: 22 | build: 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | - uses: gnuradio/clang-format-lint-action@v0.5-4 28 | with: 29 | source: '.' 30 | exclude: './tmpl,./include/volk/sse2neon.h' 31 | extensions: 'c,cc,cpp,cxx,h,hh' 32 | 33 | -------------------------------------------------------------------------------- /docs/main_page.dox: -------------------------------------------------------------------------------- 1 | /*! \mainpage VOLK 2 | 3 | Welcome to VOLK! 4 | 5 | VOLK is the Vector-Optimized Library of Kernels. It is a library that contains 6 | kernels of hand-written SIMD code for different mathematical operations. 
Since 7 | each SIMD architecture can be very different and no compiler has yet come along 8 | to handle vectorization properly or highly efficiently, VOLK approaches the 9 | problem differently. 10 | 11 | For each architecture or platform that a developer wishes to vectorize for, a 12 | new proto-kernel is added to VOLK. At runtime, VOLK will select the correct 13 | proto-kernel. In this way, the users of VOLK call a kernel for performing the 14 | operation that is platform/architecture agnostic. This allows us to write 15 | portable SIMD code. 16 | 17 | */ 18 | -------------------------------------------------------------------------------- /kernels/volk/volk_32f_null_32f.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #ifndef INCLUDED_volk_32f_null_32f_a_H 15 | #define INCLUDED_volk_32f_null_32f_a_H 16 | 17 | #ifdef LV_HAVE_GENERIC 18 | 19 | static inline void 20 | volk_32f_null_32f_generic(float* bVector, const float* aVector, unsigned int num_points) 21 | { 22 | float* bPtr = bVector; 23 | const float* aPtr = aVector; 24 | unsigned int number; 25 | 26 | for (number = 0; number < num_points; number++) { 27 | *bPtr++ = *aPtr++; 28 | } 29 | } 30 | #endif /* LV_HAVE_GENERIC */ 31 | 32 | #endif /* INCLUDED_volk_32f_null_32f_a_H */ 33 | -------------------------------------------------------------------------------- /cmake/Modules/VolkConfigVersion.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright 2014, 2015, 2018, 2020 Free Software Foundation, Inc. 2 | # 3 | # This file is part of VOLK. 
4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | set(MAJOR_VERSION @VERSION_INFO_MAJOR_VERSION@) 9 | set(MINOR_VERSION @VERSION_INFO_MINOR_VERSION@) 10 | set(MAINT_VERSION @VERSION_INFO_MAINT_VERSION@) 11 | 12 | set(PACKAGE_VERSION ${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION}) 13 | 14 | if(${PACKAGE_FIND_VERSION_MAJOR} EQUAL ${MAJOR_VERSION}) 15 | if(${PACKAGE_FIND_VERSION_MINOR} EQUAL ${MINOR_VERSION}) 16 | if(NOT ${PACKAGE_FIND_VERSION_PATCH} GREATER ${MAINT_VERSION}) 17 | set(PACKAGE_VERSION_EXACT 1) # exact match for API version 18 | set(PACKAGE_VERSION_COMPATIBLE 1) # compat for minor/patch version 19 | endif() 20 | endif() 21 | endif() 22 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a76_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # This file is part of VOLK 3 | # 4 | # SPDX-License-Identifier: LGPL-3.0-or-later 5 | # 6 | 7 | ######################################################################## 8 | # Toolchain file for building native on a ARM Cortex A76 w/ NEON 9 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 10 | ######################################################################## 11 | set(CMAKE_CXX_COMPILER g++) 12 | set(CMAKE_C_COMPILER gcc) 13 | set(CMAKE_CXX_FLAGS 14 | "-march=armv8.2-a -mtune=cortex-a76 -mfpu=neon-fp-armv8 -mfloat-abi=hard" 15 | CACHE STRING "" FORCE) 16 | set(CMAKE_C_FLAGS 17 | ${CMAKE_CXX_FLAGS} 18 | CACHE STRING "" FORCE) #same flags for C sources 19 | set(CMAKE_ASM_FLAGS 20 | "${CMAKE_CXX_FLAGS} -mthumb -g" 21 | CACHE STRING "" FORCE) #same flags for asm sources 22 | -------------------------------------------------------------------------------- /lib/constants.c.in: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2013, 2014 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #if HAVE_CONFIG_H 11 | #include 12 | #endif 13 | 14 | #include 15 | #include 16 | 17 | const char* 18 | volk_prefix() 19 | { 20 | const char *prefix = getenv("VOLK_PREFIX"); 21 | if (prefix != NULL) return prefix; 22 | return "@prefix@"; 23 | } 24 | 25 | const char* 26 | volk_version() 27 | { 28 | return "@VERSION@"; 29 | } 30 | 31 | const char* 32 | volk_c_compiler() 33 | { 34 | return "@cmake_c_compiler_version@"; 35 | } 36 | 37 | const char* 38 | volk_compiler_flags() 39 | { 40 | return "@COMPILER_INFO@"; 41 | } 42 | 43 | const char* 44 | volk_available_machines() 45 | { 46 | return "@available_machines@"; 47 | } 48 | -------------------------------------------------------------------------------- /apps/volk_profile.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012-2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include // for bool 11 | #include // for ofstream 12 | #include // for string 13 | #include // for vector 14 | 15 | class volk_test_results_t; 16 | 17 | void read_results(std::vector* results); 18 | void read_results(std::vector* results, std::string path); 19 | void write_results(const std::vector* results, bool update_result); 20 | void write_results(const std::vector* results, 21 | bool update_result, 22 | const std::string path); 23 | void write_json(std::ofstream& json_file, std::vector results); 24 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a8_softfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2019 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on a ARM Cortex A8 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=softfp" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a15_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on an ARM Cortex A15 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv7-a -mtune=cortex-a15 -mfpu=neon -mfloat-abi=hard" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a8_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on a ARM Cortex A8 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv7-a -mtune=cortex-a8 -mfpu=neon -mfloat-abi=hard" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a9_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on an ARM Cortex A9 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv7-a -mtune=cortex-a9 -mfpu=neon -mfloat-abi=hard" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a72_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2018, 2019 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on a ARM Cortex A72 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv8-a -mtune=cortex-a72 -mfpu=neon-fp-armv8 -mfloat-abi=hard" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -mthumb -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /scripts/ci/download_intel_sde.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2019, 2020 Christoph Mayer 4 | # 5 | # This script is part of VOLK. 6 | # 7 | # SPDX-License-Identifier: LGPL-3.0-or-later 8 | 9 | set -e 10 | set -x 11 | 12 | function test_sde 13 | { 14 | if ! [ -f ${SDE} ]; then 15 | echo "1" 16 | else 17 | ${SDE} -- ls > /dev/null 18 | echo $? 
19 | fi 20 | } 21 | 22 | mkdir -p cache 23 | cd cache 24 | 25 | [ -z "${SDE_VERSION}" ] && SDE_VERSION=sde-external-8.50.0-2020-03-26-lin 26 | [ -z "${SDE_URL}" ] && SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/ 27 | [ -z "${SDE}" ] && SDE=${SDE_VERSION}/sde64 28 | 29 | 30 | if [ _$(test_sde) == _0 ]; then 31 | MSG="found working version: ${SDE_VERSION}" 32 | else 33 | MSG="downloading: ${SDE_VERSION}" 34 | wget ${SDE_URL}/${SDE_VERSION}.tar.bz2 35 | tar xvf ${SDE_VERSION}.tar.bz2 36 | fi 37 | 38 | echo $SDE 39 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm_cortex_a53_hardfp_native.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014, 2018, 2019, 2021 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Toolchain file for building native on a ARM Cortex A53 w/ NEON 11 | # Usage: cmake -DCMAKE_TOOLCHAIN_FILE= 12 | ######################################################################## 13 | set(CMAKE_CXX_COMPILER g++) 14 | set(CMAKE_C_COMPILER gcc) 15 | set(CMAKE_CXX_FLAGS 16 | "-march=armv8-a -mtune=cortex-a53 -mfpu=neon-fp-armv8 -mfloat-abi=hard" 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | ${CMAKE_CXX_FLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_ASM_FLAGS 22 | "${CMAKE_CXX_FLAGS} -mthumb -g" 23 | CACHE STRING "" FORCE) #same flags for asm sources 24 | -------------------------------------------------------------------------------- /include/volk/saturation_arithmetic.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2016 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | 11 | #ifndef INCLUDED_volk_saturation_arithmetic_H_ 12 | #define INCLUDED_volk_saturation_arithmetic_H_ 13 | 14 | #include 15 | 16 | static inline int16_t sat_adds16i(int16_t x, int16_t y) 17 | { 18 | int32_t res = (int32_t)x + (int32_t)y; 19 | 20 | if (res < SHRT_MIN) 21 | res = SHRT_MIN; 22 | if (res > SHRT_MAX) 23 | res = SHRT_MAX; 24 | 25 | return res; 26 | } 27 | 28 | static inline int16_t sat_muls16i(int16_t x, int16_t y) 29 | { 30 | int32_t res = (int32_t)x * (int32_t)y; 31 | 32 | if (res < SHRT_MIN) 33 | res = SHRT_MIN; 34 | if (res > SHRT_MAX) 35 | res = SHRT_MAX; 36 | 37 | return res; 38 | } 39 | 40 | #endif /* INCLUDED_volk_saturation_arithmetic_H_ */ 41 | -------------------------------------------------------------------------------- /include/volk/volk_version.h.in: -------------------------------------------------------------------------------- 1 | /* -*- C -*- */ 2 | /* 3 | * Copyright 2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_VERSION_H 11 | #define INCLUDED_VOLK_VERSION_H 12 | 13 | #include 14 | 15 | __VOLK_DECL_BEGIN 16 | 17 | /* 18 | * define macros for the Volk version, which can then be used by any 19 | * project that #include's this header, e.g., to determine whether 20 | * some specific API is present and functional. 
21 | */ 22 | 23 | #define VOLK_VERSION_MAJOR @VERSION_INFO_MAJOR_VERSION@ 24 | #define VOLK_VERSION_MINOR @VERSION_INFO_MINOR_VERSION@ 25 | #define VOLK_VERSION_MAINT @VERSION_INFO_MAINT_VERSION@ 26 | 27 | /* 28 | * VOLK_VERSION % 100 is the MAINT version 29 | * (VOLK_VERSION / 100) % 100 is the MINOR version 30 | * (VOLK_VERSION / 100) / 100 is the MAJOR version 31 | */ 32 | 33 | #define VOLK_VERSION @VOLK_VERSION_DECIMAL@ 34 | 35 | __VOLK_DECL_END 36 | 37 | #endif /* INCLUDED_VOLK_VERSION_H */ 38 | -------------------------------------------------------------------------------- /cmake/Toolchains/rv64gcv-linux-gnu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | set(CMAKE_SYSTEM_NAME Linux) 10 | set(CMAKE_SYSTEM_PROCESSOR riscv64) 11 | 12 | set(CMAKE_C_COMPILER $ENV{CC}) 13 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) 14 | set(CMAKE_CXX_COMPILER $ENV{CXX}) 15 | 16 | set(CMAKE_C_FLAGS "$ENV{CFLAGS} -march=rv64gcv" CACHE STRING "" FORCE) 17 | set(CMAKE_CXX_FLAGS ${CMAKE_C_FLAGS} CACHE STRING "" FORCE) 18 | set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} -g" CACHE STRING "" FORCE) 19 | 20 | set(CMAKE_OBJCOPY 21 | ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy 22 | CACHE INTERNAL "objcopy tool") 23 | set(CMAKE_SIZE_UTIL 24 | ${RISCV64_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size 25 | CACHE INTERNAL "size tool") 26 | 27 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) 28 | 29 | set(QEMU_VLEN $ENV{VLEN}) 30 | if(NOT QEMU_VLEN) 31 | set(QEMU_VLEN "128") 32 | endif() 33 | 34 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-riscv64-static -L /usr/riscv64-linux-gnu/ -cpu rv64,zba=true,zbb=true,v=on,vlen=${QEMU_VLEN},rvv_ta_all_1s=on,rvv_ma_all_1s=on") 35 | -------------------------------------------------------------------------------- 
/kernels/volk/volk_32fc_s32f_power_spectral_densitypuppet_32f.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | 11 | #ifndef INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H 12 | #define INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H 13 | 14 | 15 | #include 16 | 17 | 18 | #ifdef LV_HAVE_GENERIC 19 | 20 | static inline void 21 | volk_32fc_s32f_power_spectral_densitypuppet_32f_generic(float* logPowerOutput, 22 | const lv_32fc_t* complexFFTInput, 23 | const float normalizationFactor, 24 | unsigned int num_points) 25 | { 26 | volk_32fc_s32f_x2_power_spectral_density_32f_generic( 27 | logPowerOutput, complexFFTInput, normalizationFactor, 2.5, num_points); 28 | } 29 | 30 | #endif /* LV_HAVE_GENERIC */ 31 | 32 | 33 | #endif /* INCLUDED_volk_32fc_s32f_power_spectral_densitypuppet_32f_a_H */ 34 | -------------------------------------------------------------------------------- /docs/using_volk.dox: -------------------------------------------------------------------------------- 1 | /*! \page using_volk Using VOLK 2 | 3 | Using VOLK in your code requires proper linking and including the correct headers. VOLK currently supports both C and C++ bindings. 4 | 5 | VOLK provides both a pkgconfig and CMake module to help configuration and 6 | linking. The pkgconfig file is installed to 7 | $install_prefix/lib/pkgconfig/volk.pc. The CMake configuration module is in 8 | $install_prefix/lib/cmake/volk/VolkConfig.cmake. 9 | 10 | The VOLK include directory (includedir in pkgconfig, 11 | VOLK_INCLUDE_DIRS in the CMake module) contains the header volk/volk.h, which defines all 12 | of the symbols exposed by VOLK. Alternatively, individual kernel headers are in 13 | the same location. 
14 | 15 | In most cases it is sufficient to call the dispatcher for the kernel you are using. 16 | For example the following code will compute the dot product between the taps and 17 | input vector as part of a FIR filter. 18 | \code 19 | // assume there is sufficient history in the input buffer 20 | for(unsigned int ii=0; ii < input_length-ntaps; ++ii) { 21 | volk_32fc_32f_dot_prod_32fc(filter_output[ii], input[ii-ntaps], taps, ntaps); 22 | } 23 | \endcode 24 | 25 | */ 26 | 27 | -------------------------------------------------------------------------------- /cmake/Toolchains/oe-sdk_cross.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2014 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | set(CMAKE_SYSTEM_NAME Linux) 10 | #set( CMAKE_C_COMPILER $ENV{CC} ) 11 | #set( CMAKE_CXX_COMPILER $ENV{CXX} ) 12 | string(REGEX MATCH "sysroots/([a-zA-Z0-9]+)" CMAKE_SYSTEM_PROCESSOR 13 | $ENV{SDKTARGETSYSROOT}) 14 | string(REGEX REPLACE "sysroots/" "" CMAKE_SYSTEM_PROCESSOR ${CMAKE_SYSTEM_PROCESSOR}) 15 | set(CMAKE_CXX_FLAGS 16 | $ENV{CXXFLAGS} 17 | CACHE STRING "" FORCE) 18 | set(CMAKE_C_FLAGS 19 | $ENV{CFLAGS} 20 | CACHE STRING "" FORCE) #same flags for C sources 21 | set(CMAKE_LDFLAGS_FLAGS 22 | ${CMAKE_CXX_FLAGS} 23 | CACHE STRING "" FORCE) #same flags for C sources 24 | set(CMAKE_LIBRARY_PATH ${OECORE_TARGET_SYSROOT}/usr/lib) 25 | set(CMAKE_FIND_ROOT_PATH $ENV{OECORE_TARGET_SYSROOT} $ENV{OECORE_NATIVE_SYSROOT}) 26 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 27 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 28 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 29 | set(ORC_INCLUDE_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/include/orc-0.4) 30 | set(ORC_LIBRARY_DIRS $ENV{OECORE_TARGET_SYSROOT}/usr/lib) 31 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 - 2020, 2022 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | # This file is a template, and might need editing before it works on your project. 10 | # use the official gcc image, based on debian 11 | # can use versions as well, like gcc:5.2 12 | # see https://hub.docker.com/_/gcc/ 13 | image: ubuntu:18.04 14 | 15 | build: 16 | stage: build 17 | # instead of calling g++ directly you can also use some build toolkit like make 18 | # install the necessary build tools when needed 19 | before_script: 20 | - apt update && apt -y install make cmake python python-pip && pip install mako 21 | script: 22 | - mkdir build && cd build && cmake .. && make -j 23 | artifacts: 24 | paths: 25 | - build/ 26 | # depending on your build setup it's most likely a good idea to cache outputs to reduce the build time 27 | # cache: 28 | # paths: 29 | # - "*.o" 30 | 31 | # run tests using the binary built before 32 | test: 33 | stage: test 34 | before_script: 35 | - apt update && apt -y install cmake python python-pip && pip install mako 36 | script: 37 | - cd build && ctest -V 38 | 39 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2022, 2024 Johannes Demel 3 | # 4 | # This file is part of VOLK. 
5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | if(NOT ENABLE_TESTING) 10 | return() 11 | endif(NOT ENABLE_TESTING) 12 | 13 | find_package(fmt) 14 | find_package(GTest) 15 | 16 | if(NOT fmt_FOUND OR NOT GTest_FOUND) 17 | message(WARNING "Missing fmtlib and/or googletest for this test suite") # mode keyword must be upper-case, otherwise it is printed as part of the message text 18 | return() 19 | endif(NOT fmt_FOUND OR NOT GTest_FOUND) 20 | 21 | file(GLOB volk_test_files "test_*.cc") 22 | 23 | add_executable( 24 | volk_tests 25 | volk_test.cc 26 | ${volk_test_files} 27 | ) 28 | 29 | target_compile_features(volk_tests PUBLIC cxx_std_20) 30 | 31 | target_link_libraries(volk_tests 32 | PRIVATE 33 | GTest::gtest_main 34 | volk 35 | fmt::fmt 36 | ) 37 | 38 | include(GoogleTest) 39 | gtest_discover_tests(volk_tests) 40 | 41 | 42 | target_include_directories(volk_tests 43 | PRIVATE $ 44 | PRIVATE $ 45 | PRIVATE $ 46 | PRIVATE $ 47 | PRIVATE ${CMAKE_CURRENT_BINARY_DIR} 48 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} 49 | ) 50 | -------------------------------------------------------------------------------- /lib/volk_rank_archs.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_RANK_ARCHS_H 11 | #define INCLUDED_VOLK_RANK_ARCHS_H 12 | 13 | #include 14 | #include 15 | 16 | #ifdef __cplusplus 17 | extern "C" { 18 | #endif 19 | 20 | int volk_get_index(const char* impl_names[], // list of implementations by name 21 | const size_t n_impls, // number of implementations available 22 | const char* impl_name // the implementation name to find 23 | ); 24 | 25 | int volk_rank_archs(const char* kern_name, // name of the kernel to rank 26 | const char* impl_names[], // list of implementations by name 27 | const int* impl_deps, // requirement mask per implementation 28 | const bool* alignment, // alignment status of each implementation 29 | size_t n_impls, // number of implementations available 30 | const bool align // if false, filter aligned implementations 31 | ); 32 | 33 | #ifdef __cplusplus 34 | } 35 | #endif 36 | #endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ 37 | -------------------------------------------------------------------------------- /include/volk/volk_prefs.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011, 2012, 2015, 2019, 2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_PREFS_H 11 | #define INCLUDED_VOLK_PREFS_H 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | __VOLK_DECL_BEGIN 18 | 19 | typedef struct volk_arch_pref { 20 | char name[128]; // name of the kernel 21 | char impl_a[128]; // best aligned impl 22 | char impl_u[128]; // best unaligned impl 23 | } volk_arch_pref_t; 24 | 25 | //////////////////////////////////////////////////////////////////////// 26 | // get path to volk_config profiling info; second arguments specifies 27 | // if config file should be tested on existence for reading. 
28 | // returns \0 in the argument on failure. 29 | //////////////////////////////////////////////////////////////////////// 30 | VOLK_API void volk_get_config_path(char*, bool); 31 | 32 | //////////////////////////////////////////////////////////////////////// 33 | // load prefs into global prefs struct 34 | //////////////////////////////////////////////////////////////////////// 35 | VOLK_API size_t volk_load_preferences(volk_arch_pref_t**); 36 | 37 | __VOLK_DECL_END 38 | 39 | #endif // INCLUDED_VOLK_PREFS_H 40 | -------------------------------------------------------------------------------- /gen/volk_tmpl_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2012 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | import os 10 | import re 11 | import sys 12 | import argparse 13 | import volk_arch_defs 14 | import volk_machine_defs 15 | import volk_kernel_defs 16 | from mako.template import Template 17 | 18 | 19 | def __parse_tmpl(_tmpl, **kwargs): 20 | defs = { 21 | 'archs': volk_arch_defs.archs, 22 | 'arch_dict': volk_arch_defs.arch_dict, 23 | 'machines': volk_machine_defs.machines, 24 | 'machine_dict': volk_machine_defs.machine_dict, 25 | 'kernels': volk_kernel_defs.kernels, 26 | } 27 | defs.update(kwargs) 28 | _tmpl = """ 29 | 30 | /* this file was generated by volk template utils, do not edit! 
*/ 31 | 32 | """ + _tmpl 33 | return str(Template(_tmpl).render(**defs)) 34 | 35 | 36 | def main(): 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--input', type=str) 39 | parser.add_argument('--output', type=str) 40 | args, extras = parser.parse_known_args() 41 | 42 | output = __parse_tmpl(open(args.input).read(), args=extras) 43 | if args.output: open(args.output, 'w').write(output) 44 | else: print(output) 45 | 46 | 47 | if __name__ == '__main__': 48 | main() 49 | 50 | -------------------------------------------------------------------------------- /tmpl/volk_machines.tmpl.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_LIBVOLK_MACHINES_H 11 | #define INCLUDED_LIBVOLK_MACHINES_H 12 | 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | __VOLK_DECL_BEGIN 20 | 21 | struct volk_machine { 22 | const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_get_lvarch format) 23 | const char *name; 24 | const size_t alignment; //the maximum byte alignment required for functions in this library 25 | %for kern in kernels: 26 | const char *${kern.name}_name; 27 | const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}]; 28 | const int ${kern.name}_impl_deps[${len_archs}]; 29 | const bool ${kern.name}_impl_alignment[${len_archs}]; 30 | const ${kern.pname} ${kern.name}_impls[${len_archs}]; 31 | const size_t ${kern.name}_n_impls; 32 | %endfor 33 | }; 34 | 35 | %for machine in machines: 36 | #ifdef LV_MACHINE_${machine.name.upper()} 37 | extern struct volk_machine volk_machine_${machine.name}; 38 | #endif 39 | %endfor 40 | 41 | __VOLK_DECL_END 42 | 43 | #endif //INCLUDED_LIBVOLK_MACHINES_H 44 | 
-------------------------------------------------------------------------------- /cmake/Toolchains/aarch64-linux-gnu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018, 2020 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | set(CMAKE_SYSTEM_NAME Linux) 10 | set(CMAKE_SYSTEM_PROCESSOR aarch64) 11 | 12 | if(MINGW 13 | OR CYGWIN 14 | OR WIN32) 15 | set(UTIL_SEARCH_CMD where) 16 | elseif(UNIX OR APPLE) 17 | set(UTIL_SEARCH_CMD which) 18 | endif() 19 | 20 | set(TOOLCHAIN_PREFIX aarch64-linux-gnu-) 21 | 22 | execute_process( 23 | COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc 24 | OUTPUT_VARIABLE BINUTILS_PATH 25 | OUTPUT_STRIP_TRAILING_WHITESPACE) 26 | 27 | get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY) 28 | 29 | # The following is not needed on debian 30 | # Without that flag CMake is not able to pass test compilation check 31 | #set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs") 32 | 33 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc) 34 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) 35 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++) 36 | 37 | set(CMAKE_OBJCOPY 38 | ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy 39 | CACHE INTERNAL "objcopy tool") 40 | set(CMAKE_SIZE_UTIL 41 | ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size 42 | CACHE INTERNAL "size tool") 43 | 44 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) 45 | 46 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-aarch64 -L /usr/aarch64-linux-gnu/") 47 | -------------------------------------------------------------------------------- /.github/workflows/publish_docs.yml: -------------------------------------------------------------------------------- 1 | name: Publish docs 2 | 3 | on: [push] 4 | jobs: 5 | build-docs: 6 | name: Build VOLK docs 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | with: 12 | submodules: 
"recursive" 13 | - name: Install dependencies 14 | run: sudo apt install python3-mako liborc-dev doxygen 15 | - name: Configure 16 | run: mkdir build && cd build && cmake -DCMAKE_CXX_FLAGS="-Werror" -DBUILD_EXECUTABLE=ON .. 17 | - name: Build 18 | run: cmake --build build --target volk_doc 19 | - name: Setup SSH Keys 20 | env: 21 | SSH_AUTH_SOCK: /tmp/ssh_agent.sock 22 | run: | 23 | ssh-agent -a $SSH_AUTH_SOCK > /dev/null 24 | ssh-add - <<< "${{ secrets.SSH_KEY }}" 25 | mkdir $HOME/.ssh 26 | echo -n "${{ secrets.SSH_KNOWN_HOST }}" > $HOME/.ssh/known_hosts 27 | - name: Upload via SSH 28 | env: 29 | SSH_AUTH_SOCK: /tmp/ssh_agent.sock 30 | TARGET_DIR: "${{ github.ref_type }}/${{ github.ref_name }}" 31 | run: 'tar -cz build/html/ | ssh ${{ secrets.SSH_USER }}@${{ secrets.SSH_SERVER }} "mkdir -p /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); cd /www/${{ env.TARGET_DIR }}/$(date +%Y.%m.%d); tar --strip-components=2 -xzf -; rm -f /www/${{ env.TARGET_DIR }}/live; cd /www/${{ env.TARGET_DIR }}; ln -sf $(date +%Y.%m.%d) live;"' 32 | - uses: actions/upload-artifact@v4 33 | with: 34 | name: volk_docs 35 | path: build/html/ 36 | -------------------------------------------------------------------------------- /docs/DCO.txt: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 1 Letterman Drive 6 | Suite D4700 7 | San Francisco, CA, 94129 8 | 9 | Everyone is permitted to copy and distribute verbatim copies of this 10 | license document, but changing it is not allowed. 
11 | 12 | 13 | Developer's Certificate of Origin 1.1 14 | 15 | By making a contribution to this project, I certify that: 16 | 17 | (a) The contribution was created in whole or in part by me and I 18 | have the right to submit it under the open source license 19 | indicated in the file; or 20 | 21 | (b) The contribution is based upon previous work that, to the best 22 | of my knowledge, is covered under an appropriate open source 23 | license and I have the right under that license to submit that 24 | work with modifications, whether created in whole or in part 25 | by me, under the same open source license (unless I am 26 | permitted to submit under a different license), as indicated 27 | in the file; or 28 | 29 | (c) The contribution was provided directly to me by some other 30 | person who certified (a), (b) or (c) and I have not modified 31 | it. 32 | 33 | (d) I understand and agree that this project and the contribution 34 | are public and that a record of the contribution (including all 35 | personal information I submit with it, including my sign-off) is 36 | maintained indefinitely and may be redistributed consistent with 37 | this project or the open source license(s) involved. 38 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2016, 2017, 2019, 2020, 2022 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | image: Visual Studio 2022 9 | cache: 10 | - packages -> appveyor.yml 11 | environment: 12 | environment: 13 | matrix: 14 | - job_name: VS 17 2022 / python 3.12 15 | APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2022 16 | CMAKE_GENERATOR: Visual Studio 17 2022 17 | PYTHON: "C:\Python312-x64" 18 | 19 | install: 20 | # Prepend the selected Python to the PATH of this build 21 | - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% 22 | # Display version information about selected python and pip 23 | - python --version 24 | - python -c "import sys, platform, struct; 25 | print(sys.platform, platform.machine(), struct.calcsize('P')*8)" 26 | - pip --version 27 | - pip install mako 28 | before_build: 29 | - git submodule update --init --recursive 30 | - cmake -G "%CMAKE_GENERATOR%" -A x64 -DCMAKE_BUILD_TYPE:STRING=Release -DENABLE_ORC:BOOL=OFF -DENABLE_TESTING:BOOL=ON . 31 | build_script: 32 | - cmake --build . --config Release --target INSTALL 33 | test_script: 34 | - ctest -V --output-on-failure -C Release 35 | after_test: 36 | - cd "C:\Program Files" 37 | - 7z a "C:\libvolk-x64-%VC_VERSION%.zip" volk 38 | - mkdir dlls 39 | - cd dlls 40 | - 7z a "C:\libvolk-x64-deps-%VC_VERSION%.zip" * 41 | - appveyor PushArtifact C:\libvolk-x64-%VC_VERSION%.zip 42 | - appveyor PushArtifact C:\libvolk-x64-deps-%VC_VERSION%.zip 43 | -------------------------------------------------------------------------------- /cmake/Toolchains/arm-linux-gnueabihf.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 - 2020 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | set(CMAKE_SYSTEM_NAME Linux) 10 | set(CMAKE_SYSTEM_PROCESSOR arm) 11 | 12 | if(MINGW 13 | OR CYGWIN 14 | OR WIN32) 15 | set(UTIL_SEARCH_CMD where) 16 | elseif(UNIX OR APPLE) 17 | set(UTIL_SEARCH_CMD which) 18 | endif() 19 | 20 | set(TOOLCHAIN_PREFIX arm-linux-gnueabihf-) 21 | 22 | execute_process( 23 | COMMAND ${UTIL_SEARCH_CMD} ${TOOLCHAIN_PREFIX}gcc 24 | OUTPUT_VARIABLE BINUTILS_PATH 25 | OUTPUT_STRIP_TRAILING_WHITESPACE) 26 | 27 | get_filename_component(ARM_TOOLCHAIN_DIR ${BINUTILS_PATH} DIRECTORY) 28 | 29 | # The following is not needed on debian 30 | # Without that flag CMake is not able to pass test compilation check 31 | #set(CMAKE_EXE_LINKER_FLAGS_INIT "--specs=nosys.specs") 32 | 33 | set(CMAKE_C_COMPILER ${TOOLCHAIN_PREFIX}gcc) 34 | set(CMAKE_ASM_COMPILER ${CMAKE_C_COMPILER}) 35 | set(CMAKE_CXX_COMPILER ${TOOLCHAIN_PREFIX}g++) 36 | ## the following is needed for CheckCSourceCompiles used in lib/CMakeLists.txt 37 | set(CMAKE_C_FLAGS 38 | "-mfpu=neon" 39 | CACHE STRING "" FORCE) 40 | set(CMAKE_ASM_FLAGS 41 | "${CMAKE_C_FLAGS}" 42 | CACHE STRING "" FORCE) 43 | 44 | set(CMAKE_OBJCOPY 45 | ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}objcopy 46 | CACHE INTERNAL "objcopy tool") 47 | set(CMAKE_SIZE_UTIL 48 | ${ARM_TOOLCHAIN_DIR}/${TOOLCHAIN_PREFIX}size 49 | CACHE INTERNAL "size tool") 50 | 51 | set(CMAKE_FIND_ROOT_PATH ${BINUTILS_PATH}) 52 | 53 | set(CMAKE_CROSSCOMPILING_EMULATOR "qemu-arm -L /usr/arm-linux-gnueabihf/") 54 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32f_s32f_multiply_32f_a_neonasm.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32f_s32f_multiply_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_32f_s32f_multiply_32f_a_neonasm 3 | 
volk_32f_s32f_multiply_32f_a_neonasm: 4 | @ r0 - cVector: pointer to output array 5 | @ r1 - aVector: pointer to input array 1 6 | @ r2 - bVector: pointer to input array 2 7 | @ r3 - num_points: number of items to process 8 | 9 | stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ prologue - save register states 10 | 11 | 12 | @ quarter_points = num_points / 4 13 | movs r11, r3, lsr #2 14 | beq .loop2 @ if zero into quarterPoints 15 | 16 | @ number = quarter_points 17 | mov r10, r3 18 | @ copy address of input vector 19 | mov r4, r1 20 | @ copy address of output vector 21 | mov r5, r0 22 | 23 | @ load the scalar to a quad register 24 | @ vmov.32 d2[0], r2 25 | @ The scalar might be in s0, not totally sure 26 | vdup.32 q2, d0[0] 27 | 28 | @ this is giving fits. Current theory is hf has something to do with it 29 | .loop1: 30 | @ vld1.32 {q1}, [r4:128]! @ aVal 31 | @ vmul.f32 q3, q1, q2 32 | @ vst1.32 {q3}, [r5:128]! @ cVal 33 | @ 34 | @ subs r10, r10, #1 35 | @ bne .loop1 @ first loop 36 | 37 | @ number = quarter_points * 4 38 | mov r10, r11, asl #2 39 | 40 | .loop2: 41 | @ cmp num_points, number 42 | @ bls .done 43 | @ 44 | @ vld1.32 {d0[0]}, [aVector]! 45 | @ vmul.f32 s2, s0, s4 46 | @ vst1.32 {d1[0]}, [cVector]! 
47 | @ add number, number, #1 48 | @ b .loop2 49 | 50 | .done: 51 | ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12} @ epilogue - restore register states 52 | bx lr 53 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonasm.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32f_x2_add_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_32f_x2_add_32f_a_neonasm 3 | volk_32f_x2_add_32f_a_neonasm: 4 | @ r0 - cVector: pointer to output array 5 | @ r1 - aVector: pointer to input array 1 6 | @ r2 - bVector: pointer to input array 2 7 | @ r3 - num_points: number of items to process 8 | cVector .req r0 9 | aVector .req r1 10 | bVector .req r2 11 | num_points .req r3 12 | quarterPoints .req r7 13 | number .req r8 14 | aVal .req q0 @ d0-d1 15 | bVal .req q1 @ d2-d3 16 | cVal .req q2 @ d4-d5 17 | 18 | @ AAPCS Section 5.1.1 19 | @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP 20 | stmfd sp!, {r7, r8, sl} @ prologue - save register states 21 | 22 | mov number, #0 @ number = 0; initialised before the branch so the tail loop never reads a stale r8 23 | movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4, sets Z when fewer than 4 points 24 | beq .loop2 @ no full quads: skip the vector loop and go straight to the scalar tail 25 | 26 | .loop1: 27 | pld [aVector, #128] @ pre-load hint - this is implementation specific! 28 | pld [bVector, #128] @ pre-load hint - this is implementation specific! 29 | 30 | vld1.32 {d0-d1}, [aVector:128]! @ aVal 31 | add number, number, #1 32 | vld1.32 {d2-d3}, [bVector:128]! @ bVal 33 | vadd.f32 cVal, bVal, aVal 34 | cmp number, quarterPoints 35 | vst1.32 {d4-d5}, [cVector:128]! @ cVal 36 | 37 | blt .loop1 @ first loop 38 | 39 | mov number, quarterPoints, asl #2 40 | 41 | .loop2: 42 | cmp num_points, number 43 | bls .done 44 | 45 | vld1.32 {d0[0]}, [aVector]! 46 | vld1.32 {d0[1]}, [bVector]! 47 | vadd.f32 s2, s1, s0 48 | vst1.32 {d1[0]}, [cVector]! 
49 | add number, number, #1 50 | b .loop2 51 | 52 | .done: 53 | ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states 54 | bx lr 55 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32fc_x2_multiply_32fc_a_neonasm.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32fc_x2_multiply_32fc_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_32fc_x2_multiply_32fc_a_neonasm 3 | volk_32fc_x2_multiply_32fc_a_neonasm: 4 | push {r4, r5, r6, r7, r8, r9, r14} 5 | lsrs r7, r3, #2 6 | @ r0 is c vector 7 | @ r1 is a vector 8 | @ r2 is b vector 9 | @ r3 is num_points 10 | @ r7 is quarter_points 11 | beq .smallvector 12 | mov r5, #0 13 | .mainloop: 14 | vld2.32 {d24-d27}, [r1]! @ ar=q12, ai=q13 15 | add r5, r5, #1 16 | cmp r5, r7 17 | vld2.32 {d20-d23}, [r2]! @ br=q10, bi=q11 18 | pld [r1] 19 | pld [r2] 20 | vmul.f32 q0, q12, q10 @ q15 = ar*br 21 | vmul.f32 q1, q13, q11 @ q11 = ai*bi 22 | vmul.f32 q2, q12, q11 @ q14 = ar*bi 23 | vmul.f32 q3, q13, q10 @ q12 = ai*br 24 | vsub.f32 q9, q0, q1 @ real 25 | vadd.f32 q10, q2, q3 @ imag 26 | vst2.32 {q9-q10}, [r0]! 27 | bne .mainloop 28 | 29 | .smallvector: 30 | lsl r5, r7, #2 @ r5 = quarter_points * 4 31 | cmp r3, r5 @ num_points == quarter_points? 32 | bls .done 33 | .tailcase: 34 | add r5, r5, #1 @ r5 +=1 <- number++ 35 | vld1.32 d1, [r1]! @ s2, s3 = ar, ai 36 | vld1.32 d0, [r2]! @ s0, s1 = br, bi 37 | vmul.f32 s4, s0, s2 @ s4 = ar*br 38 | vmul.f32 s5, s0, s3 @ s5 = ar*bi 39 | vmls.f32 s4, s1, s3 @ s4 = s4 - ai*bi 40 | vmla.f32 s5, s1, s2 @ s5 = s5 + ai*br 41 | @vst2.32 d2[0], [r0]! 42 | vst1.32 {d2}, [r0]! 43 | cmp r3, r5 @ r3 == r5? num_points == number? 
44 | bne .tailcase 45 | .done: 46 | pop {r4, r5, r6, r7, r8, r9, r15} 47 | bx lr 48 | -------------------------------------------------------------------------------- /apps/plot_best_vs_generic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2019 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | # This script is used to compare the generic kernels to the highest performing kernel, for each operation 10 | # Run: 11 | # ./volk_profile -j volk_results.json 12 | # Then run this script under python3 13 | 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import json 17 | 18 | filename = 'volk_results.json' 19 | 20 | operations = [] 21 | metrics = [] 22 | with open(filename) as json_file: 23 | data = json.load(json_file) 24 | for test in data['volk_tests']: 25 | if ('generic' in test['results']) or ('u_generic' in test['results']): # some dont have a generic kernel 26 | operations.append(test['name'][5:]) # remove volk_ prefix that they all have 27 | extension_performance = [] 28 | for key, val in test['results'].items(): 29 | if key not in ['generic', 'u_generic']: # exclude generic results, when trying to find fastest time 30 | extension_performance.append(val['time']) 31 | try: 32 | generic_time = test['results']['generic']['time'] 33 | except: 34 | generic_time = test['results']['u_generic']['time'] 35 | metrics.append(extension_performance[np.argmin(extension_performance)]/generic_time) 36 | 37 | 38 | plt.bar(np.arange(len(metrics)), metrics) 39 | plt.hlines(1.0, -1, len(metrics), colors='r', linestyles='dashed') 40 | plt.axis([-1, len(metrics), 0, 2]) 41 | plt.xticks(np.arange(len(operations)), operations, rotation=90) 42 | plt.ylabel('Time taken of fastest kernel relative to generic kernel') 43 | plt.tight_layout() 44 | plt.show() 45 | 
-------------------------------------------------------------------------------- /cmake/Modules/FindORC.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2014, 2019, 2020 Free Software Foundation, Inc. 2 | # 3 | # This file is part of VOLK. 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | find_package(PkgConfig) 9 | pkg_check_modules(PC_ORC "orc-0.4 > 0.4.11") 10 | 11 | include(GNUInstallDirs) 12 | 13 | find_program( 14 | ORCC_EXECUTABLE orcc 15 | HINTS ${PC_ORC_TOOLSDIR} 16 | PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin) 17 | 18 | find_path( 19 | ORC_INCLUDE_DIR 20 | NAMES orc/orc.h 21 | HINTS ${PC_ORC_INCLUDEDIR} 22 | PATHS ${ORC_ROOT}/include ${CMAKE_INSTALL_PREFIX}/include 23 | PATH_SUFFIXES orc-0.4) 24 | 25 | find_path( 26 | ORC_LIBRARY_DIR 27 | NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX} 28 | HINTS ${PC_ORC_LIBDIR} 29 | PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR} 30 | ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) 31 | 32 | find_library( 33 | ORC_LIB orc-0.4 34 | HINTS ${PC_ORC_LIBRARY_DIRS} 35 | PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR} 36 | ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) 37 | 38 | find_library( 39 | ORC_LIBRARY_STATIC liborc-0.4.a 40 | HINTS ${PC_ORC_LIBRARY_DIRS} 41 | PATHS ${ORC_ROOT}/${CMAKE_INSTALL_LIBDIR} 42 | ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) 43 | 44 | list(APPEND ORC_LIBRARY ${ORC_LIB}) 45 | 46 | set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR}) 47 | set(ORC_LIBRARIES ${ORC_LIBRARY}) 48 | set(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR}) 49 | set(ORC_LIBRARIES_STATIC ${ORC_LIBRARY_STATIC}) 50 | 51 | include(FindPackageHandleStandardArgs) 52 | find_package_handle_standard_args(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR 53 | ORCC_EXECUTABLE) 54 | 55 | mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE) 56 | -------------------------------------------------------------------------------- 
/cmake/Modules/VolkConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright 2016, 2018 - 2020 Free Software Foundation, Inc. 2 | # 3 | # This file is part of VOLK. 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | get_filename_component(VOLK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) 9 | 10 | if(NOT TARGET Volk::volk) 11 | include("${VOLK_CMAKE_DIR}/VolkTargets.cmake") 12 | endif() 13 | 14 | # set VOLK_FOUND to be set globally, for whether a compatible Volk was 15 | # found -- could be a correct enough version or any version depending 16 | # on how find_package was called. 17 | if(NOT TARGET Volk::volk) 18 | set(VOLK_FOUND FALSE) 19 | else() 20 | set(VOLK_FOUND TRUE) 21 | endif() 22 | 23 | # cache whether a compatible Volk was found for 24 | # use anywhere in the calling project 25 | set(VOLK_FOUND 26 | ${VOLK_FOUND} 27 | CACHE BOOL "Whether a compatible Volk was found" FORCE) 28 | 29 | if(VOLK_FOUND) 30 | # use the new target library, regardless of whether new or old style 31 | # we still need to set a variable with the library name so that there 32 | # is a variable to reference in the using-project's cmake scripts! 
33 | set(VOLK_LIBRARIES 34 | Volk::volk 35 | CACHE STRING "Volk Library" FORCE) 36 | 37 | # INTERFACE_INCLUDE_DIRECTORIES should always be set 38 | get_target_property(VOLK_INCLUDE_DIRS Volk::volk INTERFACE_INCLUDE_DIRECTORIES) 39 | set(VOLK_INCLUDE_DIRS 40 | ${VOLK_INCLUDE_DIRS} 41 | CACHE STRING "Volk Include Directories" FORCE) 42 | 43 | # for backward compatibility with old-CMake non-target project finding 44 | include(FindPackageHandleStandardArgs) 45 | find_package_handle_standard_args(Volk DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS) 46 | mark_as_advanced(VOLK_LIBRARIES VOLK_INCLUDE_DIRS) 47 | endif(VOLK_FOUND) 48 | -------------------------------------------------------------------------------- /cmake/cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Free Software Foundation, Inc. 2 | # 3 | # This file is part of VOLK. 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | # https://gitlab.kitware.com/cmake/community/-/wikis/FAQ#can-i-do-make-uninstall-with-cmake 9 | 10 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 11 | message( 12 | FATAL_ERROR 13 | "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"" 14 | ) 15 | endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 16 | 17 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 18 | string(REGEX REPLACE "\n" ";" files "${files}") 19 | foreach(file ${files}) 20 | message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"") 21 | if(EXISTS "$ENV{DESTDIR}${file}") 22 | exec_program( 23 | "@CMAKE_COMMAND@" ARGS 24 | "-E remove \"$ENV{DESTDIR}${file}\"" 25 | OUTPUT_VARIABLE rm_out 26 | RETURN_VALUE rm_retval) 27 | if(NOT "${rm_retval}" STREQUAL 0) 28 | message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"") 29 | endif(NOT "${rm_retval}" STREQUAL 0) 30 | elseif(IS_SYMLINK "$ENV{DESTDIR}${file}") 31 | exec_program( 32 | 
"@CMAKE_COMMAND@" ARGS 33 | "-E remove \"$ENV{DESTDIR}${file}\"" 34 | OUTPUT_VARIABLE rm_out 35 | RETURN_VALUE rm_retval) 36 | if(NOT "${rm_retval}" STREQUAL 0) 37 | message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"") 38 | endif(NOT "${rm_retval}" STREQUAL 0) 39 | else(EXISTS "$ENV{DESTDIR}${file}") 40 | message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.") 41 | endif(EXISTS "$ENV{DESTDIR}${file}") 42 | endforeach(file) 43 | -------------------------------------------------------------------------------- /gen/volk_compile_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2012 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | import argparse 10 | import volk_arch_defs 11 | import volk_machine_defs 12 | 13 | 14 | def do_arch_flags_list(compiler): 15 | output = list() 16 | for arch in volk_arch_defs.archs: 17 | if not arch.is_supported(compiler): continue 18 | fields = [arch.name] + arch.get_flags(compiler) 19 | output.append(','.join(fields)) 20 | print(';'.join(output)) 21 | 22 | 23 | def do_machines_list(arch_names): 24 | output = list() 25 | for machine in volk_machine_defs.machines: 26 | machine_arch_set = set(machine.arch_names) 27 | if set(arch_names).intersection(machine_arch_set) == machine_arch_set: 28 | output.append(machine.name) 29 | print(';'.join(output)) 30 | 31 | 32 | def do_machine_flags_list(compiler, machine_name): 33 | output = list() 34 | machine = volk_machine_defs.machine_dict[machine_name] 35 | for arch in machine.archs: 36 | output.extend(arch.get_flags(compiler)) 37 | print(' '.join(output)) 38 | 39 | 40 | def main(): 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument('--mode', type=str) 43 | parser.add_argument('--compiler', type=str) 44 | parser.add_argument('--archs', type=str) 45 | parser.add_argument('--machine', type=str) 46 | 
args = parser.parse_args() 47 | 48 | if args.mode == 'arch_flags': return do_arch_flags_list(args.compiler.lower()) 49 | if args.mode == 'machines': return do_machines_list(args.archs.split(';')) 50 | if args.mode == 'machine_flags': return do_machine_flags_list(args.compiler.lower(), args.machine) 51 | 52 | if __name__ == '__main__': 53 | main() 54 | 55 | -------------------------------------------------------------------------------- /include/volk/volk_alloc.hh: -------------------------------------------------------------------------------- 1 | /* -*- C++ -*- */ 2 | /* 3 | * Copyright 2019 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_ALLOC_H 11 | #define INCLUDED_VOLK_ALLOC_H 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | namespace volk { 21 | 22 | /*! 23 | * \brief C++11 allocator using volk_malloc and volk_free 24 | * 25 | * \details 26 | * adapted from https://en.cppreference.com/w/cpp/named_req/Alloc 27 | */ 28 | template 29 | struct alloc { 30 | typedef T value_type; 31 | 32 | alloc() = default; 33 | 34 | template 35 | constexpr alloc(alloc const&) noexcept 36 | { 37 | } 38 | 39 | T* allocate(std::size_t n) 40 | { 41 | if (n > std::numeric_limits::max() / sizeof(T)) 42 | throw std::bad_alloc(); 43 | 44 | if (auto p = static_cast(volk_malloc(n * sizeof(T), volk_get_alignment()))) 45 | return p; 46 | 47 | throw std::bad_alloc(); 48 | } 49 | 50 | void deallocate(T* p, std::size_t) noexcept { volk_free(p); } 51 | }; 52 | 53 | template 54 | bool operator==(alloc const&, alloc const&) 55 | { 56 | return true; 57 | } 58 | 59 | template 60 | bool operator!=(alloc const&, alloc const&) 61 | { 62 | return false; 63 | } 64 | 65 | 66 | /*! 
67 | * \brief type alias for std::vector using volk::alloc 68 | * 69 | * \details 70 | * example code: 71 | * volk::vector v(100); // vector using volk_malloc, volk_free 72 | */ 73 | template 74 | using vector = std::vector>; 75 | 76 | } // namespace volk 77 | #endif // INCLUDED_VOLK_ALLOC_H 78 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32f_x2_dot_prod_32f_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_32f_x2_dot_prod_32f_a_neonasm 3 | volk_32f_x2_dot_prod_32f_a_neonasm: 4 | @ r0 - cVector: pointer to output array 5 | @ r1 - aVector: pointer to input array 1 6 | @ r2 - bVector: pointer to input array 2 7 | @ r3 - num_points: number of items to process 8 | cVector .req r0 9 | aVector .req r1 10 | bVector .req r2 11 | num_points .req r3 12 | quarterPoints .req r7 13 | number .req r8 14 | aVal .req q0 @ d0-d1 15 | bVal .req q1 @ d2-d3 16 | cVal .req q2 @ d4-d5 17 | 18 | @ AAPCS Section 5.1.1 19 | @ A subroutine must preserve the contents of the registers r4-r8, r10, r11 and SP 20 | stmfd sp!, {r7, r8, sl} @ prologue - save register states 21 | 22 | veor.32 q0, q0, q0 @ clear the accumulator 23 | mov number, #0 @ number = 0; set before the branch below so the tail loop (.loop2) never compares against an uninitialized r8 24 | movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4, sets Z when fewer than 4 points 25 | beq .loop2 @ if zero into quarterPoints 26 | 27 | .loop1: 28 | pld [aVector, #128] @ pre-load hint - this is implementation specific! 29 | pld [bVector, #128] @ pre-load hint - this is implementation specific! 30 | 31 | vld1.32 {q1}, [aVector:128]! @ aVal 32 | vld1.32 {q2}, [bVector:128]!
@ bVal 33 | vmla.f32 q0, q1, q2 34 | 35 | add number, number, #1 36 | cmp number, quarterPoints 37 | blt .loop1 @ first loop 38 | 39 | @ strange order comes from trying to schedule instructions 40 | vadd.f32 s0, s0, s1 41 | vadd.f32 s2, s2, s3 42 | mov number, quarterPoints, asl #2 43 | vadd.f32 s0, s0, s2 44 | 45 | .loop2: 46 | cmp num_points, number 47 | bls .done 48 | 49 | vld1.32 {d1[0]}, [aVector]! 50 | vld1.32 {d1[1]}, [bVector]! 51 | vmla.f32 s0, s2, s3 52 | add number, number, #1 53 | b .loop2 54 | 55 | .done: 56 | vstr s0, [cVector] 57 | ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states 58 | bx lr 59 | -------------------------------------------------------------------------------- /tmpl/volk_machine_xxx.tmpl.c: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | <% this_machine = machine_dict[args[0]] %> 11 | <% arch_names = this_machine.arch_names %> 12 | 13 | %for arch in this_machine.archs: 14 | #define LV_HAVE_${arch.name.upper()} 1 15 | %endfor 16 | 17 | #include 18 | #include "volk_machines.h" 19 | #include 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif 24 | 25 | %for kern in kernels: 26 | #include 27 | %endfor 28 | 29 | struct volk_machine volk_machine_${this_machine.name} = { 30 | <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list}, 31 | <% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name}, 32 | ${this_machine.alignment}, 33 | ##//list all kernels 34 | %for kern in kernels: 35 | <% impls = kern.get_impls(arch_names) %> 36 | ##//kernel name 37 | <% kern_name = "\""+kern.name+"\"" %> ${kern_name}, 38 | ##//list of kernel implementations by name 39 | <% make_impl_name_list = "{"+', '.join(['"%s"'%i.name 
for i in impls])+"}" %> ${make_impl_name_list}, 40 | ##//list of arch dependencies per implementation 41 | <% make_impl_deps_list = "{"+', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in impls])+"}" %> ${make_impl_deps_list}, 42 | ##//alignment required? for each implementation 43 | <% make_impl_align_list = "{"+', '.join(['true' if i.is_aligned else 'false' for i in impls])+"}" %> ${make_impl_align_list}, 44 | ##//pointer to each implementation 45 | <% make_impl_fcn_list = "{"+', '.join(['%s_%s'%(kern.name, i.name) for i in impls])+"}" %> ${make_impl_fcn_list}, 46 | ##//number of implementations listed here 47 | <% len_impls = len(impls) %> ${len_impls}, 48 | %endfor 49 | }; 50 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_16i_max_star_horizontal_16i.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_16i_max_star_horizontal_16i_a_neonasm(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_16i_max_star_horizontal_16i_a_neonasm 3 | volk_16i_max_star_horizontal_16i_a_neonasm: 4 | @ r0 - cVector: pointer to output array 5 | @ r1 - aVector: pointer to input array 1 6 | @ r2 - num_points: number of items to process 7 | 8 | pld [r1, #128] 9 | push {r4, r5, r6} @ preserve register states 10 | lsrs r5, r2, #4 @ 1/16th points = num_points/16 11 | vmov.i32 q12, #0 @ q12 = [0,0,0,0] 12 | beq .smallvector @ less than 16 elements in vector 13 | mov r4, r1 @ r4 = aVector 14 | mov r12, r0 @ gcc calls this ip 15 | mov r3, #0 @ number = 0 16 | 17 | .loop1: 18 | vld2.16 {d16-d19}, [r4]! @ aVector, interleaved load 19 | pld [r4, #128] 20 | add r3, r3, #1 @ number += 1 21 | cmp r3, r5 @ number < 1/16th points 22 | vsub.i16 q10, q8, q9 @ subtraction 23 | vcge.s16 q11, q10, #0 @ result > 0? 24 | vcgt.s16 q10, q12, q10 @ result < 0? 
25 | vand.i16 q11, q8, q11 @ multiply by comparisons 26 | vand.i16 q10, q9, q10 @ multiply by other comparison 27 | vadd.i16 q10, q11, q10 @ add results to get max 28 | vst1.16 {d20-d21}, [r12]! @ store the results 29 | bne .loop1 @ at least 16 items left 30 | add r1, r1, r3, lsl #5 31 | add r0, r0, r3, lsl #4 32 | .smallvector: 33 | ands r2, r2, #15 34 | beq .end 35 | mov r3, #0 36 | .loop3: 37 | ldrh r4, [r1] 38 | bic r5, r3, #1 39 | ldrh ip, [r1, #2] 40 | add r3, r3, #2 41 | add r1, r1, #4 42 | rsb r6, ip, r4 43 | sxth r6, r6 44 | cmp r6, #0 45 | movgt ip, r4 46 | cmp r3, r2 47 | strh ip, [r0, r5] 48 | bcc .loop3 49 | .end: 50 | pop {r4, r5, r6} 51 | bx lr 52 | -------------------------------------------------------------------------------- /.github/workflows/run-tests-rvv.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2020 - 2022 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | name: Run VOLK tests on different RVV configurations 10 | 11 | on: [push, pull_request] 12 | 13 | jobs: 14 | Tests: 15 | runs-on: ubuntu-24.04 16 | steps: 17 | - uses: actions/checkout@v4 18 | with: 19 | submodules: "recursive" 20 | - name: Install packages 21 | run: | 22 | sudo apt-get update -q -y 23 | sudo apt-get install -y python3-mako cmake qemu-user-static g++-14-riscv64-linux-gnu clang-18 24 | mkdir build 25 | cd build 26 | - name: Test gcc-14 VLEN=128 27 | run: | 28 | cd build; rm -rf * 29 | CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=128 \ 30 | cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. 31 | make -j$(nproc) 32 | ARGS=-V make test 33 | - name: Test gcc-14 VLEN=256 34 | run: | 35 | cd build; rm -rf * 36 | CXX=riscv64-linux-gnu-g++-14 CC=riscv64-linux-gnu-gcc-14 VLEN=256 \ 37 | cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. 
-DCMAKE_BUILD_TYPE=Release 38 | make -j$(nproc) 39 | ARGS=-V make test 40 | - name: Test clang-18 VLEN=512 41 | run: | 42 | cd build; rm -rf * 43 | CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=512 \ 44 | cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. 45 | make -j$(nproc) 46 | ARGS=-V make test 47 | - name: Test clang-18 VLEN=1024 48 | run: | 49 | cd build; rm -rf * 50 | CXX=clang++-18 CC=clang-18 CFLAGS=--target=riscv64-linux-gnu VLEN=1024 \ 51 | cmake -DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/rv64gcv-linux-gnu.cmake .. -DCMAKE_BUILD_TYPE=Release 52 | make -j$(nproc) 53 | ARGS=-V make test 54 | 55 | 56 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32f_x2_add_32f_a_neonpipeline.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32f_x2_add_32f_a_neonpipeline(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | .global volk_32f_x2_add_32f_a_neonpipeline 3 | volk_32f_x2_add_32f_a_neonpipeline: 4 | @ r0 - cVector: pointer to output array 5 | @ r1 - aVector: pointer to input array 1 6 | @ r2 - bVector: pointer to input array 2 7 | @ r3 - num_points: number of items to process 8 | cVector .req r0 9 | aVector .req r1 10 | bVector .req r2 11 | num_points .req r3 12 | quarterPoints .req r7 13 | number .req r8 14 | aVal .req q0 @ d0-d1 15 | bVal .req q1 @ d2-d3 16 | cVal .req q2 @ d4-d5 17 | 18 | stmfd sp!, {r7, r8, sl} @ prologue - save register states 19 | 20 | pld [aVector, #128] @ pre-load hint - this is implementation specific! 21 | pld [bVector, #128] @ pre-load hint - this is implementation specific! 22 | mov number, #0 @ number = 0; the tail loop (.loop2) reads it even when the vector loop is skipped 23 | movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4, sets Z when fewer than 4 points 24 | beq .loop2 @ if zero into quarterPoints 25 | 26 | mov number, quarterPoints @ count the vector iterations down from quarterPoints 27 | 28 | @ Optimizing for pipeline 29 | vld1.32 {d0-d1}, [aVector:128]! @ aVal 30 | vld1.32 {d2-d3}, [bVector:128]!
@ bVal 31 | subs number, number, #1 32 | beq .flushpipe 33 | 34 | .loop1: 35 | pld [aVector, #128] @ pre-load hint - this is implementation specific! 36 | pld [bVector, #128] @ pre-load hint - this is implementation specific! 37 | vadd.f32 cVal, bVal, aVal 38 | vld1.32 {d0-d1}, [aVector:128]! @ aVal 39 | vld1.32 {d2-d3}, [bVector:128]! @ bVal 40 | vst1.32 {d4-d5}, [cVector:128]! @ cVal 41 | 42 | subs number, number, #1 43 | bne .loop1 @ first loop 44 | 45 | .flushpipe: 46 | @ One more time 47 | vadd.f32 cVal, bVal, aVal 48 | vst1.32 {d4-d5}, [cVector:128]! @ cVal 49 | 50 | mov number, quarterPoints, asl #2 51 | 52 | .loop2: 53 | cmp num_points, number 54 | bls .done 55 | 56 | vld1.32 {d0[0]}, [aVector]! 57 | vld1.32 {d0[1]}, [bVector]! 58 | vadd.f32 s2, s1, s0 59 | vst1.32 {d1[0]}, [cVector]! 60 | add number, number, #1 61 | b .loop2 62 | 63 | .done: 64 | ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states 65 | bx lr 66 | -------------------------------------------------------------------------------- /scripts/tools/compare_volk_profiles: -------------------------------------------------------------------------------- 1 | #!/bin/env python3 2 | # Copyright 2022, 2025 Marcus Müller 3 | # SPDX-License-Identifier: GPL-3.0 4 | # Takes in a list of volk profiles, ignores empty and identical lines, prints a table of differences. 
5 | # Hacky as hell 6 | 7 | from sys import argv 8 | 9 | 10 | def keepline(line: str) -> bool: 11 | if not line: 12 | return False 13 | line = line.strip() 14 | if line.startswith("#"): 15 | return False 16 | return True 17 | 18 | 19 | def kernel(line: str) -> str: 20 | return line.split(" ")[0] 21 | 22 | 23 | def impls(line: str) -> tuple[str, str]: 24 | return tuple(line.strip().split(" ")[1:]) 25 | 26 | 27 | machines = [ 28 | {kernel(line): impls(line) for line in open(f_name) if keepline(line)} 29 | for f_name in argv[1:] 30 | ] 31 | kernels = [set(d.keys()) for d in machines] 32 | common_kernels = [ 33 | kernel for kernel in kernels[0] if all((kernel in ks for ks in kernels[1:])) 34 | ] 35 | 36 | differing_kernels = dict() 37 | for kernel in common_kernels: 38 | first_impl = machines[0][kernel] 39 | if all(machine[kernel] == first_impl for machine in machines[1:]): 40 | continue 41 | differing_kernels[kernel] = { 42 | argv[idx + 1]: machine[kernel] for idx, machine in enumerate(machines) 43 | } 44 | 45 | max_kernel_len = max(len(kernel) for kernel in common_kernels) 46 | max_impl_len = max( 47 | max(max(len(alignment) for alignment in impl) for impl in kernel.values()) 48 | for kernel in differing_kernels.values() 49 | ) 50 | 51 | print( 52 | f"|{'Kernel':<{max_kernel_len}}|" 53 | + "|".join( 54 | f"{fname + ' a':<{max_impl_len}}|{fname + ' u':<{max_impl_len}}" 55 | for fname in argv[1:] 56 | ) 57 | + "|" 58 | ) 59 | for kernel, impls in differing_kernels.items(): 60 | print( 61 | f"|{kernel:<{max_kernel_len}}|" 62 | + "|".join( 63 | "|".join( 64 | f"{impl:<{max_impl_len}}" for impl in differing_kernels[kernel][fname] 65 | ) 66 | for fname in argv[1:] 67 | ) 68 | + "|" 69 | ) 70 | -------------------------------------------------------------------------------- /scripts/licensing/count_contrib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script to check the list of git submitters against the table of 
re-submitting 3 | # users from the AUTHORFILE. Requires the authors to be listed in 4 | # | ... | ... | email@address.com | 5 | # format. 6 | # 7 | # We can add another table of "git committers who are exempt from the need to 8 | # relicense due to their contributions being under an acceptable license 9 | # already" if we need; no changes to this script would be necessary. 10 | # 11 | # This script is part of VOLK. 12 | # 13 | # Copyright 2021 Marcus Müller 14 | # SPDX-License-Identifier: MPL-2.0 15 | 16 | rootdir=`git rev-parse --show-toplevel` 17 | if [[ "$#" -lt 1 ]] 18 | then 19 | authorfile=$rootdir/AUTHORS_RESUBMITTING_UNDER_LGPL_LICENSE.md 20 | else 21 | authorfile=$1 22 | fi 23 | if [[ ! -r $authorfile ]] 24 | then 25 | echo "$authorfile: file not readable" 26 | exit -1 27 | fi 28 | 29 | allfiles=`git ls-files $rootdir` 30 | lgplers="$(sed -ne 's/^|[^|]*|[^|]*| \([^|]*\)|/\1/ip' $authorfile)" 31 | lgplers="$lgplers 32478819+fritterhoff@users.noreply.github.com douggeiger@users.noreply.github.com" 32 | authorcounts="$(echo "$allfiles" | while read f; do git blame --line-porcelain --ignore-rev 092a59997a1e1d5f421a0a5f87ee655ad173b93f $f 2>/dev/null | sed -ne 's/^author-mail <\([^>]*\)>/\1/p'; done | sort -f | uniq -ic | sort -n)" 33 | 34 | total_loc=0 35 | missingloc=0 # same name the loop accumulates into and the summary reads; previously initialised as missing_loc, leaving missingloc empty when nothing was missing 36 | 37 | while read -r line 38 | do 39 | authoremail=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\2/g') 40 | authorlines=$(echo "$line" | sed 's/^ *\([[:digit:]]*\) *\([^, ]*\)$/\1/g') 41 | total_loc=$(( $authorlines + $total_loc )) 42 | if !
( echo "$lgplers" | grep -i "$authoremail" ) > /dev/null 43 | then 44 | echo "missing: \"$authoremail\" (${authorlines} LOC)" 45 | missingloc=$(($missingloc + $authorlines)) 46 | fi 47 | done < <(echo "$authorcounts") 48 | 49 | percentage=$(echo "scale=2; 100.0 * $missingloc/$total_loc" | bc) 50 | echo "Missing $missingloc of $total_loc LOC in total ($percentage%)" 51 | 52 | if [[ "$missingloc" -gt 0 ]] 53 | then 54 | exit -2 55 | fi 56 | exit 0 57 | -------------------------------------------------------------------------------- /cmake/msvc/sys/time.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018, 2020 Free Software Foundation, Inc. 3 | * 4 | * This file is part of VOLK 5 | * 6 | * SPDX-License-Identifier: LGPL-3.0-or-later 7 | */ 8 | 9 | #ifndef _MSC_VER // [ 10 | #error "Use this header only with Microsoft Visual C++ compilers!" 11 | #endif // _MSC_VER ] 12 | 13 | #ifndef _MSC_SYS_TIME_H_ 14 | #define _MSC_SYS_TIME_H_ 15 | 16 | // prevent windows.h from clobbering min and max functions with macros 17 | #ifndef NOMINMAX 18 | #define NOMINMAX 19 | #endif 20 | 21 | // https://learn.microsoft.com/en-us/archive/msdn-technet-forums/430449b3-f6dd-4e18-84de-eebd26a8d668 22 | #include < time.h > 23 | #include //I've omitted this line. 
24 | #if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) 25 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 26 | #else 27 | #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL 28 | #endif 29 | 30 | #if _MSC_VER < 1900 31 | struct timespec { 32 | 33 | time_t tv_sec; /* Seconds since 00:00:00 GMT, */ 34 | 35 | /* 1 January 1970 */ 36 | 37 | long tv_nsec; /* Additional nanoseconds since */ 38 | 39 | /* tv_sec */ 40 | }; 41 | #endif 42 | 43 | struct timezone { 44 | int tz_minuteswest; /* minutes W of Greenwich */ 45 | int tz_dsttime; /* type of dst correction */ 46 | }; 47 | 48 | /* Minimal gettimeofday() replacement for MSVC: fills tv with the time since the Unix epoch and tz with the CRT time zone settings; always returns 0. */ 49 | static inline int gettimeofday(struct timeval* tv, struct timezone* tz) 50 | { 51 | FILETIME ft; 52 | unsigned __int64 tmpres = 0; 53 | static int tzflag; 54 | if (NULL != tv) { 55 | GetSystemTimeAsFileTime(&ft); 56 | 57 | tmpres |= ft.dwHighDateTime; 58 | tmpres <<= 32; 59 | tmpres |= ft.dwLowDateTime; 60 | tmpres /= 10; /* FILETIME counts 100 ns ticks; convert to microseconds before applying the microsecond epoch offset */ 61 | /*converting file time to unix epoch*/ 62 | tmpres -= DELTA_EPOCH_IN_MICROSECS; 63 | tv->tv_sec = (long)(tmpres / 1000000UL); 64 | tv->tv_usec = (long)(tmpres % 1000000UL); 65 | } 66 | 67 | if (NULL != tz) { 68 | if (!tzflag) { 69 | _tzset(); 70 | tzflag++; 71 | } 72 | tz->tz_minuteswest = _timezone / 60; 73 | tz->tz_dsttime = _daylight; 74 | } 75 | 76 | return 0; 77 | } 78 | 79 | #endif //_MSC_SYS_TIME_H_ 80 | -------------------------------------------------------------------------------- /include/volk/volk_malloc.h: -------------------------------------------------------------------------------- 1 | /* -*- c -*- */ 2 | /* 3 | * Copyright 2014, 2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_MALLOC_H 11 | #define INCLUDED_VOLK_MALLOC_H 12 | 13 | #include 14 | #include 15 | 16 | __VOLK_DECL_BEGIN 17 | 18 | /*! 19 | * \brief Allocate \p size bytes of data aligned to \p alignment.
20 | * 21 | * \details 22 | * We use C11 and want to rely on C11 library features, 23 | * namely we use `aligned_alloc` to allocate aligned memory. 24 | * see: https://en.cppreference.com/w/c/memory/aligned_alloc 25 | * 26 | * Not all platforms support this feature. 27 | * For Apple Clang, we fall back to `posix_memalign`. 28 | * see: https://linux.die.net/man/3/aligned_alloc 29 | * For MSVC, we fall back to `_aligned_malloc`. 30 | * see: 31 | * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=msvc-170 32 | * 33 | * Because of the ways in which volk_malloc may allocate memory, it is 34 | * important to always free volk_malloc pointers using volk_free. 35 | * Mainly, in case MSVC is used. Consult corresponding documentation 36 | * in case you use MSVC. 37 | * 38 | * \param size The number of bytes to allocate. 39 | * \param alignment The byte alignment of the allocated memory. 40 | * \return pointer to aligned memory. 41 | */ 42 | VOLK_API void* volk_malloc(size_t size, size_t alignment); 43 | 44 | /*! 45 | * \brief Free's memory allocated by volk_malloc. 46 | * 47 | * \details 48 | * We rely on C11 syntax and compilers and just call `free` in case 49 | * memory was allocated with `aligned_alloc` or `posix_memalign`. 50 | * Thus, in this case `volk_free` inherits the same behavior `free` exhibits. 51 | * see: https://en.cppreference.com/w/c/memory/free 52 | * In case `_aligned_malloc` was used, we call `_aligned_free`. 53 | * see: 54 | * https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-free?view=msvc-170 55 | * 56 | * \param aptr The aligned pointer allocated by volk_malloc. 
57 | */ 58 | VOLK_API void volk_free(void* aptr); 59 | 60 | __VOLK_DECL_END 61 | 62 | #endif /* INCLUDED_VOLK_MALLOC_H */ 63 | -------------------------------------------------------------------------------- /apps/volk_option_helpers.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2018-2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef VOLK_VOLK_OPTION_HELPERS_H 11 | #define VOLK_VOLK_OPTION_HELPERS_H 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | typedef enum { 20 | VOID_CALLBACK, 21 | INT_CALLBACK, 22 | BOOL_CALLBACK, 23 | STRING_CALLBACK, 24 | FLOAT_CALLBACK, 25 | STRING, 26 | } VOLK_OPTYPE; 27 | 28 | class option_t 29 | { 30 | public: 31 | option_t(std::string t_longform, 32 | std::string t_shortform, 33 | std::string t_msg, 34 | void (*t_callback)()); 35 | option_t(std::string t_longform, 36 | std::string t_shortform, 37 | std::string t_msg, 38 | void (*t_callback)(int)); 39 | option_t(std::string t_longform, 40 | std::string t_shortform, 41 | std::string t_msg, 42 | void (*t_callback)(float)); 43 | option_t(std::string t_longform, 44 | std::string t_shortform, 45 | std::string t_msg, 46 | void (*t_callback)(bool)); 47 | option_t(std::string t_longform, 48 | std::string t_shortform, 49 | std::string t_msg, 50 | void (*t_callback)(std::string)); 51 | option_t(std::string t_longform, 52 | std::string t_shortform, 53 | std::string t_msg, 54 | std::string t_printval); 55 | 56 | std::string longform; 57 | std::string shortform; 58 | std::string msg; 59 | VOLK_OPTYPE option_type; 60 | std::string printval; 61 | void (*callback)(); 62 | }; 63 | 64 | class option_list 65 | { 66 | public: 67 | option_list(std::string program_name); 68 | bool present(std::string option_name); 69 | 70 | void add(option_t opt); 71 | 72 | void parse(int argc, char** argv); 
73 | 74 | void help(); 75 | 76 | private: 77 | std::string d_program_name; 78 | std::vector d_internal_list; 79 | std::map d_present_options; 80 | }; 81 | 82 | 83 | #endif // VOLK_VOLK_OPTION_HELPERS_H 84 | -------------------------------------------------------------------------------- /cmake/msvc/config.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2012, 2017, 2020 Free Software Foundation, Inc. 3 | * 4 | * This file is part of VOLK 5 | * 6 | * SPDX-License-Identifier: LGPL-3.0-or-later 7 | */ 8 | 9 | #ifndef _MSC_VER // [ 10 | #error "Use this header only with Microsoft Visual C++ compilers!" 11 | #endif // _MSC_VER ] 12 | 13 | #ifndef _MSC_CONFIG_H_ // [ 14 | #define _MSC_CONFIG_H_ 15 | 16 | //////////////////////////////////////////////////////////////////////// 17 | // enable inline functions for C code 18 | //////////////////////////////////////////////////////////////////////// 19 | #ifndef __cplusplus 20 | #define inline __inline 21 | #endif 22 | 23 | //////////////////////////////////////////////////////////////////////// 24 | // signed size_t 25 | //////////////////////////////////////////////////////////////////////// 26 | #include 27 | typedef ptrdiff_t ssize_t; 28 | 29 | //////////////////////////////////////////////////////////////////////// 30 | // rint functions 31 | //////////////////////////////////////////////////////////////////////// 32 | #if _MSC_VER < 1800 33 | #include 34 | static inline long lrint(double x) { return (long)(x > 0.0 ? x + 0.5 : x - 0.5); } 35 | static inline long lrintf(float x) { return (long)(x > 0.0f ? x + 0.5f : x - 0.5f); } 36 | static inline long long llrint(double x) 37 | { 38 | return (long long)(x > 0.0 ? x + 0.5 : x - 0.5); 39 | } 40 | static inline long long llrintf(float x) 41 | { 42 | return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f); 43 | } 44 | static inline double rint(double x) { return (x > 0.0) ? 
floor(x + 0.5) : ceil(x - 0.5); } 45 | static inline float rintf(float x) 46 | { 47 | return (x > 0.0f) ? floorf(x + 0.5f) : ceilf(x - 0.5f); 48 | } 49 | #endif 50 | 51 | //////////////////////////////////////////////////////////////////////// 52 | // math constants 53 | //////////////////////////////////////////////////////////////////////// 54 | #if _MSC_VER < 1800 55 | #include 56 | #define INFINITY HUGE_VAL 57 | #endif 58 | 59 | //////////////////////////////////////////////////////////////////////// 60 | // random and srandom 61 | //////////////////////////////////////////////////////////////////////// 62 | #include 63 | static inline long int random(void) { return rand(); } 64 | static inline void srandom(unsigned int seed) { srand(seed); } 65 | 66 | #endif // _MSC_CONFIG_H_ ] 67 | -------------------------------------------------------------------------------- /gen/volk_machine_defs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2012 Free Software Foundation, Inc. 
3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | from volk_arch_defs import arch_dict 10 | 11 | machines = list() 12 | machine_dict = dict() 13 | 14 | class machine_class(object): 15 | def __init__(self, name, archs): 16 | self.name = name 17 | self.archs = list() 18 | self.arch_names = list() 19 | for arch_name in archs: 20 | if not arch_name: continue 21 | arch = arch_dict[arch_name] 22 | self.archs.append(arch) 23 | self.arch_names.append(arch_name) 24 | self.alignment = max([a.alignment for a in self.archs]) 25 | 26 | def __repr__(self): return self.name 27 | 28 | def register_machine(name, archs): 29 | for i, arch_name in enumerate(archs): 30 | if '|' in arch_name: #handle special arch names with the '|' 31 | for arch_sub in arch_name.split('|'): 32 | if arch_sub: 33 | register_machine(name+'_'+arch_sub, archs[:i] + [arch_sub] + archs[i+1:]) 34 | else: 35 | register_machine(name, archs[:i] + archs[i+1:]) 36 | return 37 | machine = machine_class(name=name, archs=archs) 38 | machines.append(machine) 39 | machine_dict[machine.name] = machine 40 | 41 | ######################################################################## 42 | # register the machines 43 | ######################################################################## 44 | #TODO skip the XML and put it here 45 | from xml.dom import minidom 46 | import os 47 | gendir = os.path.dirname(__file__) 48 | machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine') 49 | for machine_xml in machines_xml: 50 | kwargs = dict() 51 | for attr in machine_xml.attributes.keys(): 52 | kwargs[attr] = machine_xml.attributes[attr].value 53 | for node in machine_xml.childNodes: 54 | try: 55 | name = node.tagName 56 | val = machine_xml.getElementsByTagName(name)[0].firstChild.data 57 | kwargs[name] = val 58 | except: pass 59 | kwargs['archs'] = kwargs['archs'].split() 60 | register_machine(**kwargs) 61 | 62 | if __name__ == 
'__main__': 63 | print(machines) 64 | -------------------------------------------------------------------------------- /kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /*! 11 | * \page volk_32fc_s32f_x2_power_spectral_density_32f 12 | * 13 | * \b Overview 14 | * 15 | * Calculates the log10 power value divided by the RBW for each input point. 16 | * 17 | * Dispatcher Prototype 18 | * \code 19 | * void volk_32fc_s32f_x2_power_spectral_density_32f(float* logPowerOutput, const 20 | * lv_32fc_t* complexFFTInput, const float normalizationFactor, const float rbw, unsigned 21 | * int num_points) \endcode 22 | * 23 | * \b Inputs 24 | * \li complexFFTInput The complex data output from the FFT point. 25 | * \li normalizationFactor: This value is divided against all the input values before the 26 | * power is calculated. \li rbw: The resolution bandwidth of the fft spectrum \li 27 | * num_points: The number of fft data points. 28 | * 29 | * \b Outputs 30 | * \li logPowerOutput: The 10.0 * log10((r*r + i*i)/RBW) for each data point. 
31 | * 32 | * \b Example 33 | * \code 34 | * int N = 10000; 35 | * 36 | * volk_32fc_s32f_x2_power_spectral_density_32f(); 37 | * 38 | * volk_free(x); 39 | * \endcode 40 | */ 41 | 42 | #ifndef INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H 43 | #define INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H 44 | 45 | #include 46 | #include 47 | #include 48 | 49 | #ifdef LV_HAVE_GENERIC 50 | 51 | static inline void 52 | volk_32fc_s32f_x2_power_spectral_density_32f_generic(float* logPowerOutput, 53 | const lv_32fc_t* complexFFTInput, 54 | const float normalizationFactor, 55 | const float rbw, 56 | unsigned int num_points) 57 | { 58 | if (rbw != 1.0) 59 | volk_32fc_s32f_power_spectrum_32f( 60 | logPowerOutput, complexFFTInput, normalizationFactor * sqrt(rbw), num_points); 61 | else 62 | volk_32fc_s32f_power_spectrum_32f( 63 | logPowerOutput, complexFFTInput, normalizationFactor, num_points); 64 | } 65 | 66 | #endif /* LV_HAVE_GENERIC */ 67 | 68 | #endif /* INCLUDED_volk_32fc_s32f_x2_power_spectral_density_32f_a_H */ 69 | -------------------------------------------------------------------------------- /kernels/README.md: -------------------------------------------------------------------------------- 1 | # How to create custom kernel dispatchers 2 | 3 | A kernel dispatcher is a kernel implementation that calls other kernel implementations. 4 | By default, a dispatcher is generated by the build system for every kernel such that: 5 | * the best aligned implementation is called when all pointer arguments are aligned, 6 | * and otherwise the best unaligned implementation is called. 7 | 8 | The author of a VOLK kernel may create a custom dispatcher, 9 | to be called in place of the automatically generated one. 10 | A custom dispatcher may be useful to handle head and tail cases, 11 | or to implement different alignment and bounds checking logic. 
12 | 13 | ## Code for an example dispatcher w/ tail case 14 | 15 | ```cpp 16 | #include 17 | 18 | #ifdef LV_HAVE_DISPATCHER 19 | 20 | static inline void volk_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) 21 | { 22 | const unsigned int num_points_r = num_points%4; 23 | const unsigned int num_points_x = num_points - num_points_r; 24 | 25 | if (volk_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector)))) 26 | { 27 | volk_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x); 28 | } 29 | else 30 | { 31 | volk_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x); 32 | } 33 | 34 | volk_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r); 35 | } 36 | 37 | #endif //LV_HAVE_DISPATCHER 38 | ``` 39 | 40 | ## Code for an example dispatcher w/ tail case and accumulator 41 | 42 | ```cpp 43 | #include 44 | 45 | #ifdef LV_HAVE_DISPATCHER 46 | 47 | static inline void volk_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points) 48 | { 49 | const unsigned int num_points_r = num_points%16; 50 | const unsigned int num_points_x = num_points - num_points_r; 51 | 52 | if (volk_is_aligned(VOLK_OR_PTR(input, taps))) 53 | { 54 | volk_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x); 55 | } 56 | else 57 | { 58 | volk_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x); 59 | } 60 | 61 | float result_tail = 0; 62 | volk_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r); 63 | 64 | *result += result_tail; 65 | } 66 | 67 | #endif //LV_HAVE_DISPATCHER 68 | ``` 69 | -------------------------------------------------------------------------------- /kernels/volk/volk_32f_s32f_power_32f.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /*! 11 | * \page volk_32f_s32f_power_32f 12 | * 13 | * \b Overview 14 | * 15 | * Raises each input vector value to the specified power and stores the 16 | * results in the return vector. 17 | * 18 | * Dispatcher Prototype 19 | * \code 20 | * void volk_32f_s32f_power_32f(float* cVector, const float* aVector, const float power, 21 | * unsigned int num_points) \endcode 22 | * 23 | * \b Inputs 24 | * \li aVector: The input vector of floats. 25 | * \li power: The power to raise the input value to. 26 | * \li num_points: The number of data points. 27 | * 28 | * \b Outputs 29 | * \li cVector: The output vector. 30 | * 31 | * \b Example 32 | * Square the numbers (0,9) 33 | * \code 34 | * int N = 10; 35 | * unsigned int alignment = volk_get_alignment(); 36 | * float* increasing = (float*)volk_malloc(sizeof(float)*N, alignment); 37 | * float* out = (float*)volk_malloc(sizeof(float)*N, alignment); 38 | * 39 | * 40 | * for(unsigned int ii = 0; ii < N; ++ii){ 41 | * increasing[ii] = (float)ii; 42 | * } 43 | * 44 | * // Exponent applied to every input value (2.0 squares each value) 45 | * float scale = 2.0f; 46 | * 47 | * volk_32f_s32f_power_32f(out, increasing, scale, N); 48 | * 49 | * for(unsigned int ii = 0; ii < N; ++ii){ 50 | * printf("out[%u] = %f\n", ii, out[ii]); 51 | * } 52 | * 53 | * volk_free(increasing); 54 | * volk_free(out); 55 | * \endcode 56 | */ 57 | 58 | #ifndef INCLUDED_volk_32f_s32f_power_32f_a_H 59 | #define INCLUDED_volk_32f_s32f_power_32f_a_H 60 | 61 | #include 62 | #include 63 | #include 64 | 65 | #ifdef LV_HAVE_GENERIC 66 | 67 | static inline void volk_32f_s32f_power_32f_generic(float* cVector, 68 | const float* aVector, 69 | const float power, 70 | unsigned int num_points) 71 | { 72 | float* cPtr = cVector; 73 | const float* aPtr = aVector; 74 | unsigned int number = 0; 75 | 76 | for (number = 0; number < num_points; number++) { 77 | *cPtr++ = 
powf((*aPtr++), power); 78 | } 79 | } 80 | #endif /* LV_HAVE_GENERIC */ 81 | 82 | 83 | #endif /* INCLUDED_volk_32f_s32f_power_32f_a_H */ 84 | -------------------------------------------------------------------------------- /kernels/volk/volk_32fc_s32f_power_32fc.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /*! 11 | * \page volk_32fc_s32f_power_32fc 12 | * 13 | * \b Overview 14 | * 15 | * Raises each value of the complex input vector to the specified real power 16 | * and stores the results in the return vector. The output is a vector 17 | * of complex floats (lv_32fc_t). 18 | * 19 | * Dispatcher Prototype 20 | * \code 21 | * void volk_32fc_s32f_power_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const 22 | * float power, unsigned int num_points) \endcode 23 | * 24 | * \b Inputs 25 | * \li aVector: The complex input vector. 26 | * \li power: The power value to be applied to each data point. 27 | * \li num_points: The number of samples. 28 | * 29 | * \b Outputs 30 | * \li cVector: The complex output vector. 31 | * 32 | * \b Example 33 | * \code 34 | * int N = 10000; 35 | * 36 | * volk_32fc_s32f_power_32fc(); 37 | * 38 | * volk_free(x); 39 | * \endcode 40 | */ 41 | 42 | #ifndef INCLUDED_volk_32fc_s32f_power_32fc_a_H 43 | #define INCLUDED_volk_32fc_s32f_power_32fc_a_H 44 | 45 | #include 46 | #include 47 | #include 48 | 49 | //! 
raise a complex float to a real float power: z^p = |z|^p * (cos(p*theta) + i*sin(p*theta)), with theta = atan2(imag, real) 50 | static inline lv_32fc_t __volk_s32fc_s32f_power_s32fc_a(const lv_32fc_t exp, 51 | const float power) 52 | { 53 | const float arg = power * atan2f(lv_cimag(exp), lv_creal(exp)); /* atan2f takes (y, x): polar angle of the input, scaled by the exponent */ 54 | const float mag = 55 | powf(lv_creal(exp) * lv_creal(exp) + lv_cimag(exp) * lv_cimag(exp), power / 2); /* |z|^power, computed as (|z|^2)^(power/2) */ 56 | return mag * lv_cmake(cosf(arg), sinf(arg)); /* back to rectangular form */ 57 | } 58 | 59 | #ifdef LV_HAVE_GENERIC 60 | /* Raises each of the num_points complex inputs in aVector to the real exponent power and stores the results in cVector. */ 61 | static inline void volk_32fc_s32f_power_32fc_generic(lv_32fc_t* cVector, 62 | const lv_32fc_t* aVector, 63 | const float power, 64 | unsigned int num_points) 65 | { 66 | lv_32fc_t* cPtr = cVector; 67 | const lv_32fc_t* aPtr = aVector; 68 | unsigned int number = 0; 69 | 70 | for (number = 0; number < num_points; number++) { 71 | *cPtr++ = __volk_s32fc_s32f_power_s32fc_a((*aPtr++), power); 72 | } 73 | } 74 | 75 | #endif /* LV_HAVE_GENERIC */ 76 | 77 | 78 | #endif /* INCLUDED_volk_32fc_s32f_power_32fc_a_H */ 79 | -------------------------------------------------------------------------------- /include/volk/volk_avx2_fma_intrinsics.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2023 Magnus Lundmark 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /* 11 | * This file is intended to hold AVX2 FMA intrinsics. 12 | * They should be used in VOLK kernels to avoid copy-paste. 
13 | */ 14 | 15 | #ifndef INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ 16 | #define INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ 17 | #include 18 | 19 | /* 20 | * Approximate arctan(x) via polynomial expansion 21 | * on the interval [-1, 1] 22 | * 23 | * Maximum relative error ~6.5e-7 24 | * Polynomial evaluated via Horner's method 25 | */ 26 | static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x) 27 | { 28 | const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f); 29 | const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f); 30 | const __m256 a5 = _mm256_set1_ps(+0x1.972be6p-3f); 31 | const __m256 a7 = _mm256_set1_ps(-0x1.1436ap-3f); 32 | const __m256 a9 = _mm256_set1_ps(+0x1.5785aap-4f); 33 | const __m256 a11 = _mm256_set1_ps(-0x1.2f3004p-5f); 34 | const __m256 a13 = _mm256_set1_ps(+0x1.01a37cp-7f); 35 | 36 | const __m256 x_times_x = _mm256_mul_ps(x, x); 37 | __m256 arctan; 38 | arctan = a13; 39 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a11); 40 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a9); 41 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a7); 42 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a5); 43 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a3); 44 | arctan = _mm256_fmadd_ps(x_times_x, arctan, a1); 45 | arctan = _mm256_mul_ps(x, arctan); 46 | 47 | return arctan; 48 | } 49 | 50 | /* 51 | * Approximate arcsin(x) via polynomial expansion 52 | * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5 53 | * 54 | * Maximum relative error ~1.5e-6 55 | * Polynomial evaluated via Horner's method 56 | */ 57 | static inline __m256 _mm256_arcsin_poly_avx2_fma(const __m256 x) 58 | { 59 | const __m256 c0 = _mm256_set1_ps(0x1.ffffcep-1f); 60 | const __m256 c1 = _mm256_set1_ps(0x1.55b648p-3f); 61 | const __m256 c2 = _mm256_set1_ps(0x1.24d192p-4f); 62 | const __m256 c3 = _mm256_set1_ps(0x1.0a788p-4f); 63 | 64 | const __m256 u = _mm256_mul_ps(x, x); 65 | __m256 p = c3; 66 | p = _mm256_fmadd_ps(u, p, c2); 67 | p = _mm256_fmadd_ps(u, p, c1); 68 | p = _mm256_fmadd_ps(u, p, c0); 69 | 70 
| return _mm256_mul_ps(x, p); 71 | } 72 | 73 | #endif /* INCLUDE_VOLK_VOLK_AVX2_FMA_INTRINSICS_H_ */ 74 | -------------------------------------------------------------------------------- /.github/workflows/android_build.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2022 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | name: Build on Android NDK 10 | 11 | on: [push, pull_request] 12 | 13 | jobs: 14 | build: 15 | name: Build on Android NDK ${{ matrix.arch.name }} 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | arch: 21 | - { name: armeabi-v7a, allow_fail: false } 22 | - { name: arm64-v8a, allow_fail: false } 23 | - { name: x86, allow_fail: false } 24 | - { name: x86_64, allow_fail: false } 25 | 26 | runs-on: ubuntu-latest 27 | 28 | steps: 29 | - uses: actions/checkout@v4 30 | with: 31 | submodules: 'recursive' 32 | 33 | - name: Update repositories 34 | run: sudo apt update 35 | 36 | # All dependencies 37 | - name: Install dependencies 38 | run: sudo apt install -y cmake python3-mako 39 | 40 | # Setup Java 41 | - uses: actions/setup-java@v4 42 | with: 43 | distribution: 'temurin' 44 | java-version: '17' 45 | 46 | # Setup Android SDK, and auto-accept licenses 47 | - name: Install Android SDK 48 | run: wget --quiet --output-document=android-sdk.zip https://dl.google.com/android/repository/commandlinetools-linux-11076708_latest.zip && mkdir android-sdk-linux && unzip -qq android-sdk.zip -d android-sdk-linux && export ANDROID_HOME=./android-sdk-linux && echo y | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --update && (echo y; echo y; echo y; echo y; echo y; echo y; echo y; echo y) | $ANDROID_HOME/cmdline-tools/bin/sdkmanager --sdk_root=android-sdk-linux --licenses 49 | 50 | # Call SDKManager to install the Android NDK 51 | - name: Install Android NDK 52 | run: 
$GITHUB_WORKSPACE/android-sdk-linux/cmdline-tools/bin/sdkmanager --sdk_root=$GITHUB_WORKSPACE/android-sdk-linux --install "ndk;27.2.12479018" --channel=3 53 | 54 | # Setup build directory 55 | - name: Setup ${{ matrix.arch.name }} 56 | shell: bash 57 | run: cd $GITHUB_WORKSPACE/ && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-sdk-linux/ndk/27.2.12479018/build/cmake/android.toolchain.cmake -DANDROID_ABI=${{ matrix.arch.name }} -DANDROID_PLATFORM=android-34 .. 58 | 59 | # Build 60 | - name: Build ${{ matrix.arch.name }} 61 | shell: bash 62 | run: cd $GITHUB_WORKSPACE/build && make 63 | continue-on-error: ${{ matrix.arch.allow_fail }} 64 | -------------------------------------------------------------------------------- /lib/volk_malloc.c: -------------------------------------------------------------------------------- 1 | /* -*- c -*- */ 2 | /* 3 | * Copyright 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | /* 17 | * C11 features: 18 | * see: https://en.cppreference.com/w/c/memory/aligned_alloc 19 | * 20 | * MSVC is broken 21 | * see: 22 | * https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170 23 | * This section: 24 | * C11 The Universal CRT implemented the parts of the 25 | * C11 Standard Library that are required by C++17, 26 | * with the exception of C99 strftime() E/O alternative 27 | * conversion specifiers, C11 fopen() exclusive mode, 28 | * and C11 aligned_alloc(). The latter is unlikely to 29 | * be implemented, because C11 specified aligned_alloc() 30 | * in a way that's incompatible with the Microsoft 31 | * implementation of free(): 32 | * namely, that free() must be able to handle highly aligned allocations. 33 | * 34 | * We must work around this problem because MSVC is non-compliant! 
35 | */ 36 | 37 | /* Allocates size bytes, rounded up to a multiple of alignment, aligned to alignment bytes. Returns NULL when size or alignment is 0 or when the underlying allocator fails. */ 38 | void* volk_malloc(size_t size, size_t alignment) 39 | { 40 | if ((size == 0) || (alignment == 0)) { 41 | return NULL; 42 | } 43 | // Tweak size to satisfy ASAN (the GCC address sanitizer). 44 | // Calling 'volk_malloc' might therefore result in the allocation of more memory than 45 | // requested for correct alignment. Any allocation size change here will in general 46 | // not impact the end result since initial size alignment is required either way. 47 | if (size % alignment) { 48 | size += alignment - (size % alignment); 49 | } 50 | #if HAVE_POSIX_MEMALIGN 51 | // quoting posix_memalign() man page: 52 | // "alignment must be a power of two and a multiple of sizeof(void *)" 53 | // volk_get_alignment() could return 1 for some machines (e.g. generic_orc) 54 | if (alignment == 1) { 55 | return malloc(size); 56 | } 57 | void* ptr; 58 | int err = posix_memalign(&ptr, alignment, size); 59 | if (err != 0) { 60 | ptr = NULL; 61 | } 62 | #elif defined(_MSC_VER) || defined(__MINGW32__) 63 | void* ptr = _aligned_malloc(size, alignment); 64 | #else 65 | void* ptr = aligned_alloc(alignment, size); 66 | #endif 67 | return ptr; 68 | } 69 | /* Releases memory returned by volk_malloc: _aligned_free on MSVC/MinGW, free elsewhere. */ 70 | void volk_free(void* ptr) 71 | { 72 | #if defined(_MSC_VER) || defined(__MINGW32__) 73 | _aligned_free(ptr); 74 | #else 75 | free(ptr); 76 | #endif 77 | } 78 | -------------------------------------------------------------------------------- /kernels/volk/volk_32u_popcntpuppet_32u.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2014 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_volk_32u_popcntpuppet_32u_H 11 | #define INCLUDED_volk_32u_popcntpuppet_32u_H 12 | 13 | #include 14 | #include 15 | /* Puppet wrappers: each variant below calls the matching single-value volk_32u_popcnt implementation once per input, storing the result for inVector[i] at outVector[i]. */ 16 | #ifdef LV_HAVE_GENERIC 17 | static inline void volk_32u_popcntpuppet_32u_generic(uint32_t* outVector, 18 | const uint32_t* inVector, 19 | unsigned int num_points) 20 | { 21 | for (size_t i = 0; i < num_points; ++i) { 22 | volk_32u_popcnt_generic(outVector + i, inVector[i]); 23 | } 24 | } 25 | #endif /* LV_HAVE_GENERIC */ 26 | 27 | #ifdef LV_HAVE_SSE4_2 28 | static inline void volk_32u_popcntpuppet_32u_a_sse4_2(uint32_t* outVector, 29 | const uint32_t* inVector, 30 | unsigned int num_points) 31 | { 32 | for (size_t i = 0; i < num_points; ++i) { 33 | volk_32u_popcnt_a_sse4_2(outVector + i, inVector[i]); 34 | } 35 | } 36 | #endif /* LV_HAVE_SSE4_2 */ 37 | 38 | #ifdef LV_HAVE_NEON 39 | static inline void volk_32u_popcntpuppet_32u_neon(uint32_t* outVector, 40 | const uint32_t* inVector, 41 | unsigned int num_points) 42 | { 43 | for (size_t i = 0; i < num_points; ++i) { 44 | volk_32u_popcnt_neon(outVector + i, inVector[i]); 45 | } 46 | } 47 | #endif /* LV_HAVE_NEON */ 48 | 49 | #ifdef LV_HAVE_RVV 50 | static inline void volk_32u_popcntpuppet_32u_rvv(uint32_t* outVector, 51 | const uint32_t* inVector, 52 | unsigned int num_points) 53 | { 54 | for (size_t i = 0; i < num_points; ++i) { 55 | volk_32u_popcnt_rvv(outVector + i, inVector[i]); 56 | } 57 | } 58 | #endif /* LV_HAVE_RVV */ 59 | 60 | #ifdef LV_HAVE_RVA22V 61 | static inline void volk_32u_popcntpuppet_32u_rva22(uint32_t* outVector, 62 | const uint32_t* inVector, 63 | unsigned int num_points) 64 | { 65 | for (size_t i = 0; i < num_points; ++i) { 66 | volk_32u_popcnt_rva22(outVector + i, inVector[i]); 67 | } 68 | } 69 | #endif /* LV_HAVE_RVA22V */ 70 | 71 | #endif /* INCLUDED_volk_32u_popcntpuppet_32u_H */ 72 | 
-------------------------------------------------------------------------------- /kernels/volk/volk_64u_popcntpuppet_64u.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_volk_64u_popcntpuppet_64u_H 11 | #define INCLUDED_volk_64u_popcntpuppet_64u_H 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #ifdef LV_HAVE_GENERIC 18 | static inline void volk_64u_popcntpuppet_64u_generic(uint64_t* outVector, 19 | const uint64_t* inVector, 20 | unsigned int num_points) 21 | { 22 | for (size_t i = 0; i < num_points; ++i) { 23 | volk_64u_popcnt_generic(outVector + i, inVector[i]); 24 | } 25 | } 26 | #endif /* LV_HAVE_GENERIC */ 27 | 28 | #if LV_HAVE_SSE4_2 && LV_HAVE_64 29 | static inline void volk_64u_popcntpuppet_64u_a_sse4_2(uint64_t* outVector, 30 | const uint64_t* inVector, 31 | unsigned int num_points) 32 | { 33 | for (size_t i = 0; i < num_points; ++i) { 34 | volk_64u_popcnt_a_sse4_2(outVector + i, inVector[i]); 35 | } 36 | } 37 | #endif /* LV_HAVE_SSE4_2 */ 38 | 39 | #ifdef LV_HAVE_NEON 40 | static inline void volk_64u_popcntpuppet_64u_neon(uint64_t* outVector, 41 | const uint64_t* inVector, 42 | unsigned int num_points) 43 | { 44 | for (size_t i = 0; i < num_points; ++i) { 45 | volk_64u_popcnt_neon(outVector + i, inVector[i]); 46 | } 47 | } 48 | #endif /* LV_HAVE_NEON */ 49 | 50 | #ifdef LV_HAVE_RVV 51 | static inline void volk_64u_popcntpuppet_64u_rvv(uint64_t* outVector, 52 | const uint64_t* inVector, 53 | unsigned int num_points) 54 | { 55 | for (size_t i = 0; i < num_points; ++i) { 56 | volk_64u_popcnt_rvv(outVector + i, inVector[i]); 57 | } 58 | } 59 | #endif /* LV_HAVE_RVV */ 60 | 61 | #ifdef LV_HAVE_RVA22V 62 | static inline void volk_64u_popcntpuppet_64u_rva22(uint64_t* outVector, 63 | const uint64_t* inVector, 64 | 
unsigned int num_points) 65 | { 66 | for (size_t i = 0; i < num_points; ++i) { 67 | volk_64u_popcnt_rva22(outVector + i, inVector[i]); 68 | } 69 | } 70 | #endif /* LV_HAVE_RVA22V */ 71 | 72 | #endif /* INCLUDED_volk_64u_popcntpuppet_64u_H */ 73 | -------------------------------------------------------------------------------- /gen/machines.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | generic orc| 5 | 6 | 7 | 8 | generic neon orc| 9 | 10 | 11 | 12 | generic neon neonv7 softfp|hardfp orc| 13 | 14 | 15 | 16 | generic neon neonv8 orc| 17 | 18 | 19 | 20 | 21 | generic 32|64| mmx| sse sse2 orc| 22 | 23 | 24 | 25 | generic 32|64| mmx| sse sse2 sse3 orc| 26 | 27 | 28 | 29 | generic 32|64| mmx| sse sse2 sse3 ssse3 orc| 30 | 31 | 32 | 33 | generic riscv64 orc| 34 | 35 | 36 | 37 | generic riscv64 rvv rvvseg orc| 38 | 39 | 40 | 43 | 44 | 47 | 48 | 49 | generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc| 50 | 51 | 52 | 53 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 orc| 54 | 55 | 56 | 57 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc| 58 | 59 | 60 | 61 | 62 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc| 63 | 64 | 65 | 66 | 67 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 orc| 68 | 69 | 70 | 71 | 72 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f orc| 73 | 74 | 75 | 76 | 77 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc| 78 | 79 | 80 | 81 | 82 | generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc| 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /kernels/volk/asm/riscv/volk_32f_s32f_multiply_32f_sifive_u74.s: -------------------------------------------------------------------------------- 1 | .text 2 | .align 2 3 | .type 
volk_32f_s32f_multiply_32f_sifive_u74, @function 4 | .global volk_32f_s32f_multiply_32f_sifive_u74 5 | 6 | volk_32f_s32f_multiply_32f_sifive_u74: 7 | # Input: 8 | # a0 out 9 | # a1 in 10 | # fa0 scalar 11 | # a2 size 12 | 13 | # Main loop is 8x unrolled. 14 | 15 | # Split counter into main and final loop. 16 | # a5 main loop counter 17 | # a2 closing loop counter 18 | srli a5,a2,3 # a5 = size / 8: number of full 8-element groups 19 | andi a2,a2,7 # a2 = size % 8: elements left for the closing loop 20 | slli a5,a5,5 # a5 = bytes handled by the main loop (32 per group) 21 | beqz a5,.dolastloop # no full group: go straight to the closing loop 22 | add a5,a0,a5 # a5 = output address at which the main loop stops 23 | 24 | .align 2 25 | .loop: 26 | flw fa1,0(a1) 27 | addi a0,a0,32 # increment output (free, running on pipeline A) 28 | 29 | flw fa2,4(a1) 30 | flw fa3,8(a1) 31 | flw fa4,12(a1) 32 | flw fa5,16(a1) 33 | flw fa6,20(a1) 34 | flw fa7,24(a1) 35 | flw ft8,28(a1) 36 | addi a1,a1,32 # increment input (free, running on pipeline A) 37 | 38 | fmul.s fa1,fa1,fa0 39 | fmul.s fa2,fa2,fa0 40 | fmul.s fa3,fa3,fa0 41 | fmul.s fa4,fa4,fa0 42 | fmul.s fa5,fa5,fa0 43 | fmul.s fa6,fa6,fa0 44 | fmul.s fa7,fa7,fa0 45 | fmul.s ft8,ft8,fa0 46 | 47 | fsw fa1,-32(a0) # a0 was already advanced above, hence the negative offsets 48 | fsw fa2,-28(a0) 49 | fsw fa3,-24(a0) 50 | fsw fa4,-20(a0) 51 | fsw fa5,-16(a0) 52 | fsw fa6,-12(a0) 53 | fsw fa7,-8(a0) 54 | fsw ft8,-4(a0) 55 | 56 | bne a5,a0,.loop # repeat until the output pointer reaches a5 57 | 58 | .align 2 59 | .dolastloop: 60 | # TODO: is branch assumed to be taken or not? 61 | beqz a2,.done 62 | 63 | # Everything below is less optimized. In theory we could split 64 | # this into more partial unrolled loops, but it's at most 7 65 | # iterations, so not clear that it's worth it. 66 | 67 | # make a2 the output address just past the last entry (a0 + 4 * leftover count). 68 | slli a2,a2,2 69 | add a2,a0,a2 # Stall! 70 | 71 | .align 2 72 | .lastloop: 73 | flw fa5,0(a1) # Latency: 2 74 | addi a0,a0,4 # Increment out 75 | fmul.s fa5,fa5,fa0 # Stalled for a cycle or two. Latency: 5 76 | addi a1,a1,4 # Increment in 77 | fsw fa5,-4(a0) # Stalled for a couple of cycles waiting for mul. 
78 | bne a2,a0,.lastloop 79 | 80 | .align 2 81 | .done: 82 | ret 83 | .size volk_32f_s32f_multiply_32f_sifive_u74, .-volk_32f_s32f_multiply_32f_sifive_u74 84 | -------------------------------------------------------------------------------- /gen/volk_arch_defs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2012 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | 10 | archs = list() 11 | arch_dict = dict() 12 | 13 | class arch_class(object): 14 | def __init__(self, flags, checks, **kwargs): 15 | for key, cast, failval in ( 16 | ('name', str, None), 17 | ('environment', str, None), 18 | ('include', str, None), 19 | ('alignment', int, 1) 20 | ): 21 | try: setattr(self, key, cast(kwargs[key])) 22 | except: setattr(self, key, failval) 23 | self.checks = checks 24 | assert(self.name) 25 | self._flags = flags 26 | 27 | def is_supported(self, compiler): 28 | if not self._flags.keys(): return True 29 | return compiler in self._flags.keys() 30 | 31 | def get_flags(self, compiler): 32 | try: return self._flags[compiler] 33 | except KeyError: return list() 34 | 35 | def __repr__(self): return self.name 36 | 37 | def register_arch(**kwargs): 38 | arch = arch_class(**kwargs) 39 | archs.append(arch) 40 | arch_dict[arch.name] = arch 41 | 42 | ######################################################################## 43 | # register the arches 44 | ######################################################################## 45 | #TODO skip the XML and put it here 46 | from xml.dom import minidom 47 | import os 48 | gendir = os.path.dirname(__file__) 49 | archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch') 50 | for arch_xml in archs_xml: 51 | kwargs = dict() 52 | for attr in arch_xml.attributes.keys(): 53 | kwargs[attr] = arch_xml.attributes[attr].value 54 | for node in 
arch_xml.childNodes: 55 | try: 56 | name = node.tagName 57 | val = arch_xml.getElementsByTagName(name)[0].firstChild.data 58 | kwargs[name] = val 59 | except: pass 60 | checks = list() 61 | for check_xml in arch_xml.getElementsByTagName("check"): 62 | name = check_xml.attributes["name"].value 63 | params = list() 64 | for param_xml in check_xml.getElementsByTagName("param"): 65 | params.append(param_xml.firstChild.data) 66 | checks.append([name, params]) 67 | flags = dict() 68 | for flag_xml in arch_xml.getElementsByTagName("flag"): 69 | name = flag_xml.attributes["compiler"].value 70 | if name not in flags: flags[name] = list() 71 | flags[name].append(flag_xml.firstChild.data) 72 | register_arch(flags=flags, checks=checks, **kwargs) 73 | 74 | if __name__ == '__main__': 75 | print(archs) 76 | 77 | -------------------------------------------------------------------------------- /include/volk/volk_sse3_intrinsics.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2015 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /* 11 | * This file is intended to hold helper functions composed of SSE3 intrinsics. 12 | * They should be used in VOLK kernels to avoid copy-paste duplication. 
13 | */ 14 | 15 | #ifndef INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ 16 | #define INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ 17 | #include 18 | 19 | static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) 20 | { 21 | __m128 yl, yh, tmp1, tmp2; 22 | yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr 23 | yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di 24 | tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr 25 | x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br 26 | tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di 27 | return _mm_addsub_ps(tmp1, 28 | tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di 29 | } 30 | 31 | static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) 32 | { 33 | const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); 34 | y = _mm_xor_ps(y, conjugator); // conjugate y 35 | return _mm_complexmul_ps(x, y); 36 | } 37 | 38 | static inline __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) 39 | { 40 | cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values 41 | cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values 42 | return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values 43 | } 44 | 45 | static inline __m128 _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) 46 | { 47 | return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); 48 | } 49 | 50 | static inline __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, 51 | const __m128 symbols1, 52 | const __m128 points0, 53 | const __m128 points1, 54 | const __m128 scalar) 55 | { 56 | /* 57 | * Calculate: |y - x|^2 * SNR_lin 58 | * Consider 'symbolsX' and 'pointsX' to be complex float 59 | * 'symbolsX' are 'y' and 'pointsX' are 'x' 60 | */ 61 | const __m128 diff0 = _mm_sub_ps(symbols0, points0); 62 | const __m128 diff1 = _mm_sub_ps(symbols1, points1); 63 | const __m128 norms = _mm_magnitudesquared_ps_sse3(diff0, diff1); 64 | return _mm_mul_ps(norms, 
scalar); 65 | } 66 | 67 | #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ 68 | -------------------------------------------------------------------------------- /apps/volk-config-info.cc: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2013, 2016, 2018 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #if HAVE_CONFIG_H 11 | #include 12 | #endif 13 | 14 | #include // for volk_available_machines, volk_c_com... 15 | #include // for operator<<, endl, cout, ostream 16 | #include // for string 17 | 18 | #include "volk/volk.h" // for volk_get_alignment, volk_get_machine 19 | #include "volk_option_helpers.h" // for option_list, option_t 20 | 21 | void print_alignment() 22 | { 23 | std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl; 24 | } 25 | 26 | void print_malloc() 27 | { 28 | // You don't want to change the volk_malloc code, so just copy the if/else 29 | // structure from there and give an explanation for the implementations 30 | std::cout << "Used malloc implementation: "; 31 | #if HAVE_POSIX_MEMALIGN 32 | std::cout << "posix_memalign" << std::endl; 33 | #elif defined(_MSC_VER) 34 | std::cout << "_aligned_malloc" << std::endl; 35 | #else 36 | std::cout << "C11 aligned_alloc" << std::endl; 37 | #endif 38 | } 39 | 40 | 41 | int main(int argc, char** argv) 42 | { 43 | 44 | option_list our_options("volk-config-info"); 45 | our_options.add( 46 | option_t("prefix", "", "print the VOLK installation prefix", volk_prefix())); 47 | our_options.add( 48 | option_t("cc", "", "print the VOLK C compiler version", volk_c_compiler())); 49 | our_options.add( 50 | option_t("cflags", "", "print the VOLK CFLAGS", volk_compiler_flags())); 51 | our_options.add(option_t( 52 | "all-machines", "", "print VOLK machines built", volk_available_machines())); 53 | our_options.add(option_t("avail-machines", 54 | 
"", 55 | "print VOLK machines on the current " 56 | "platform", 57 | volk_list_machines)); 58 | our_options.add(option_t("machine", 59 | "", 60 | "print the current VOLK machine that will be used", 61 | volk_get_machine())); 62 | our_options.add( 63 | option_t("alignment", "", "print the memory alignment", print_alignment)); 64 | our_options.add(option_t("malloc", 65 | "", 66 | "print the malloc implementation used in volk_malloc", 67 | print_malloc)); 68 | our_options.add(option_t("version", "v", "print the VOLK version", volk_version())); 69 | 70 | our_options.parse(argc, argv); 71 | 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /cmake/Modules/VolkVersion.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Free Software Foundation, Inc. 2 | # 3 | # This file is part of VOLK. 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | # 7 | 8 | if(DEFINED __INCLUDED_VOLK_VERSION_CMAKE) 9 | return() 10 | endif() 11 | set(__INCLUDED_VOLK_VERSION_CMAKE TRUE) 12 | 13 | #eventually, replace version.sh and fill in the variables below 14 | set(MAJOR_VERSION ${VERSION_INFO_MAJOR_VERSION}) 15 | set(MINOR_VERSION ${VERSION_INFO_MINOR_VERSION}) 16 | set(MAINT_VERSION ${VERSION_INFO_MAINT_VERSION}) 17 | 18 | ######################################################################## 19 | # Extract the version string from git describe. 
20 | ######################################################################## 21 | find_package(Git) 22 | 23 | if(GIT_FOUND AND EXISTS ${CMAKE_SOURCE_DIR}/.git) 24 | message(STATUS "Extracting version information from git describe...") 25 | execute_process( 26 | COMMAND ${GIT_EXECUTABLE} describe --always --abbrev=8 --long 27 | OUTPUT_VARIABLE GIT_DESCRIBE 28 | OUTPUT_STRIP_TRAILING_WHITESPACE 29 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 30 | else() 31 | if(NOT VOLK_GIT_COUNT) 32 | set(VOLK_GIT_COUNT "0") 33 | endif() 34 | 35 | if(NOT VOLK_GIT_HASH) 36 | set(VOLK_GIT_HASH "unknown") 37 | endif() 38 | 39 | set(GIT_DESCRIBE 40 | "v${MAJOR_VERSION}.${MINOR_VERSION}-${VOLK_GIT_COUNT}-${VOLK_GIT_HASH}") 41 | endif() 42 | 43 | ######################################################################## 44 | # Use the logic below to set the version constants 45 | ######################################################################## 46 | if("${MINOR_VERSION}" STREQUAL "git") 47 | # VERSION: 1.0git-xxx-gxxxxxxxx 48 | # DOCVER: 1.0git 49 | # SOVERSION: 1.0git 50 | set(VERSION "${GIT_DESCRIBE}") 51 | set(DOCVER "${MAJOR_VERSION}.0${MINOR_VERSION}") 52 | set(SOVERSION "${MAJOR_VERSION}.0${MINOR_VERSION}") 53 | set(RC_MINOR_VERSION "0") 54 | set(RC_MAINT_VERSION "0") 55 | elseif("${MAINT_VERSION}" STREQUAL "git") 56 | # VERSION: 1.xgit-xxx-gxxxxxxxx 57 | # DOCVER: 1.xgit 58 | # SOVERSION: 1.xgit 59 | set(VERSION "${GIT_DESCRIBE}") 60 | set(DOCVER "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}") 61 | set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}${MAINT_VERSION}") 62 | math(EXPR RC_MINOR_VERSION "${MINOR_VERSION} - 1") 63 | set(RC_MAINT_VERSION "0") 64 | else() 65 | # This is a numbered release. 
66 | # VERSION: 1.1.x 67 | # DOCVER: 1.1.x 68 | # SOVERSION: 1.1 69 | set(VERSION "${MAJOR_VERSION}.${MINOR_VERSION}.${MAINT_VERSION}") 70 | set(DOCVER "${VERSION}") 71 | set(SOVERSION "${MAJOR_VERSION}.${MINOR_VERSION}") 72 | set(RC_MINOR_VERSION ${MINOR_VERSION}) 73 | set(RC_MAINT_VERSION ${MAINT_VERSION}) 74 | endif() 75 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasmvmla.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasmvmla ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) 2 | .global volk_32fc_32f_dot_prod_32fc_a_neonasmvmla 3 | volk_32fc_32f_dot_prod_32fc_a_neonasmvmla: 4 | @ r0 - result: pointer to output array (32fc) 5 | @ r1 - input: pointer to input array 1 (32fc) 6 | @ r2 - taps: pointer to input array 2 (32f) 7 | @ r3 - num_points: number of items to process 8 | 9 | result .req r0 10 | input .req r1 11 | taps .req r2 12 | num_points .req r3 13 | quarterPoints .req r7 14 | number .req r8 15 | @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: 16 | @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; 17 | @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved 18 | @ registers d16-d31 (q8-q15), if present, do not need to be preserved. 
19 | realAccQ .req q0 @ d0-d1/s0-s3 20 | compAccQ .req q1 @ d2-d3/s4-s7 21 | realAccS .req s0 @ d0[0] 22 | compAccS .req s4 @ d2[0] 23 | tapsVal .req q2 @ d4-d5 24 | outVal .req q3 @ d6-d7 25 | realMul .req q8 @ d16-d17 26 | compMul .req q9 @ d18-d19 27 | inRealVal .req q10 @ d20-d21 28 | inCompVal .req q11 @ d22-d23 29 | 30 | stmfd sp!, {r7, r8, sl} @ prologue - save register states 31 | 32 | veor realAccQ, realAccQ @ zero out accumulators 33 | veor compAccQ, compAccQ @ zero out accumulators 34 | movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4; Z flag is set when that is zero 35 | mov number, quarterPoints @ initialize the counter BEFORE branching: .loop2 compares against it, and a plain mov leaves the flags from movs untouched 36 | beq .loop2 @ if zero into quarterPoints 37 | 38 | 39 | .loop1: 40 | @ do work here 41 | pld [taps, #128] @ pre-load hint - this is implementation specific! 42 | pld [input, #128] @ pre-load hint - this is implementation specific! 43 | vld1.32 {tapsVal}, [taps:128]! @ tapsVal 44 | vld2.32 {inRealVal-inCompVal}, [input:128]! @ inRealVal, inCompVal 45 | vmla.f32 realAccQ, tapsVal, inRealVal 46 | vmla.f32 compAccQ, tapsVal, inCompVal 47 | subs number, number, #1 48 | bne .loop1 @ first loop 49 | 50 | @ Sum up across realAccQ and compAccQ 51 | vadd.f32 d0, d0, d1 @ realAccQ +-> d0 52 | vadd.f32 d2, d2, d3 @ compAccQ +-> d2 53 | vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) 54 | vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) 55 | @ critical values are now in s0 (realAccS), s4 (compAccS) 56 | mov number, quarterPoints, asl #2 @ number = points already consumed by .loop1 57 | .loop2: 58 | cmp num_points, number 59 | bls .done @ stop once number >= num_points 60 | 61 | vld1.32 {d4[0]}, [taps]! @ s8 62 | vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 63 | vmla.f32 realAccS, s8, s10 @ d0[0] 64 | vmla.f32 compAccS, s8, s12 @ d2[0] 65 | 66 | add number, number, #1 67 | b .loop2 68 | 69 | .done: 70 | vst1.32 {d0[0]}, [result]! 
@ realAccS 71 | vst1.32 {d2[0]}, [result] @ compAccS 72 | 73 | ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states 74 | bx lr 75 | -------------------------------------------------------------------------------- /apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2011-2013 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | 9 | ######################################################################## 10 | # Setup profiler 11 | ######################################################################## 12 | 13 | # POSIX_MEMALIGN: If we have to fall back to `posix_memalign`. 14 | if(HAVE_POSIX_MEMALIGN) 15 | message(STATUS "Use `posix_memalign` for aligned malloc!") 16 | add_definitions(-DHAVE_POSIX_MEMALIGN) 17 | endif(HAVE_POSIX_MEMALIGN) 18 | 19 | # MAKE volk_profile 20 | add_executable( 21 | volk_profile 22 | ${CMAKE_CURRENT_SOURCE_DIR}/volk_profile.cc ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc 23 | ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc) 24 | target_compile_features(volk_profile PUBLIC cxx_std_17) 25 | 26 | if(MSVC) 27 | target_include_directories( 28 | volk_profile PRIVATE $) 29 | endif(MSVC) 30 | 31 | target_include_directories( 32 | volk_profile 33 | PRIVATE $ 34 | PRIVATE $ 35 | PRIVATE $ 36 | PRIVATE $ 37 | PRIVATE ${CMAKE_CURRENT_BINARY_DIR} 38 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 39 | 40 | if(ENABLE_STATIC_LIBS) 41 | target_link_libraries(volk_profile PRIVATE volk_static fmt::fmt) 42 | set_target_properties(volk_profile PROPERTIES LINK_FLAGS "-static") 43 | else() 44 | target_link_libraries(volk_profile PRIVATE volk fmt::fmt) 45 | endif() 46 | 47 | install( 48 | TARGETS volk_profile 49 | DESTINATION bin 50 | COMPONENT "volk") 51 | 52 | # MAKE volk-config-info 53 | add_executable(volk-config-info volk-config-info.cc 54 | ${CMAKE_CURRENT_SOURCE_DIR}/volk_option_helpers.cc) 55 | 
target_compile_features(volk-config-info PUBLIC cxx_std_17) 56 | 57 | if(ENABLE_STATIC_LIBS) 58 | target_link_libraries(volk-config-info volk_static) 59 | set_target_properties(volk-config-info PROPERTIES LINK_FLAGS "-static") 60 | else() 61 | target_link_libraries(volk-config-info volk) 62 | endif() 63 | 64 | install( 65 | TARGETS volk-config-info 66 | DESTINATION bin 67 | COMPONENT "volk") 68 | 69 | # Launch volk_profile if requested to do so 70 | if(ENABLE_PROFILING) 71 | if(DEFINED VOLK_CONFIGPATH) 72 | set(VOLK_CONFIG_ARG "-p${VOLK_CONFIGPATH}") 73 | set(VOLK_CONFIG "${VOLK_CONFIGPATH}/volk_config") 74 | endif() 75 | 76 | add_custom_command( 77 | OUTPUT ${VOLK_CONFIG} 78 | COMMAND volk_profile "${VOLK_CONFIG_ARG}" 79 | DEPENDS volk_profile 80 | COMMENT "Launching profiler, this may take a few minutes...") 81 | add_custom_target(volk-profile-run ALL DEPENDS ${VOLK_CONFIG}) 82 | 83 | endif() 84 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 - 2022 Free Software Foundation, Inc. 3 | # 4 | # This file is part of VOLK 5 | # 6 | # SPDX-License-Identifier: LGPL-3.0-or-later 7 | # 8 | language: cpp 9 | 10 | os: linux 11 | dist: bionic 12 | 13 | addons: 14 | apt: 15 | packages: &common_packages 16 | - python3-mako 17 | - liborc-dev 18 | 19 | env: 20 | global: 21 | - SDE_VERSION=sde-external-8.50.0-2020-03-26-lin 22 | - SDE_URL=http://software.intel.com/content/dam/develop/external/us/en/protected/ 23 | 24 | matrix: 25 | include: 26 | # Job 1 ... 
gcc-7 with Intel SDE 27 | - name: Linux x86 Intel SDE GCC 7 28 | env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7 CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/intel-sde.cmake" 29 | addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages]}} 30 | cache: 31 | directories: 32 | - ${TRAVIS_BUILD_DIR}/cache 33 | before_script: 34 | - cd ${TRAVIS_BUILD_DIR} && ./scripts/ci/download_intel_sde.sh 35 | 36 | # Job 4 ... gcc-6 37 | - name: Linux x86 GCC 6 38 | env: MATRIX_EVAL="CC=gcc-6 && CXX=g++-6" 39 | addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-6]}} 40 | 41 | # Job 7 ... ARMv7 cross compile 42 | - name: Linux ARMv7 Qemu GCC 7 43 | env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/arm-linux-gnueabihf.cmake" 44 | addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-arm-linux-gnueabihf, qemu-user]}} 45 | 46 | # Job 8 ... ARMv8 (aarch64) cross compile 47 | - name: Linux ARMv8 (aarch64) Qemu GCC 7 48 | env: MATRIX_EVAL="CMAKE_ARG=-DCMAKE_TOOLCHAIN_FILE=../cmake/Toolchains/aarch64-linux-gnu.cmake" 49 | addons: {apt: {sources: "ubuntu-toolchain-r-test", packages: [*common_packages, g++-aarch64-linux-gnu, qemu-user]}} 50 | 51 | # Job 9 ... 
clang 52 | - name: Linux x86 Clang 6 53 | env: MATRIX_EVAL="CC=\"clang -fprofile-instr-generate -fcoverage-mapping\" && CXX=\"clang++ -fprofile-instr-generate -fcoverage-mapping\"" 54 | addons: {apt: {packages: [*common_packages, ]}} 55 | 56 | - name: Linux ARMv8 (aarch64) GCC 7 57 | arch: arm64 58 | env: MATRIX_EVAL="CC=gcc-7 && CXX=g++-7" 59 | addons: {apt: {packages: [*common_packages, ]}} 60 | 61 | - name: Linux ARMv8 (aarch64) Clang 6 62 | arch: arm64 63 | env: MATRIX_EVAL="CC=clang && CXX=clang++" 64 | addons: {apt: {packages: [*common_packages, ]}} 65 | 66 | script: 67 | - eval "${MATRIX_EVAL}" 68 | - lscpu 69 | - git submodule update --init --recursive 70 | - mkdir build && cd build 71 | - cmake ${CMAKE_ARG} ../ 72 | - make 73 | - echo $(./apps/volk-config-info --malloc) && echo $(./apps/volk-config-info --alignment) && echo "All compiled VOLK machines:" $(./apps/volk-config-info --all-machines) && echo "Available VOLK machines:" $(./apps/volk-config-info --avail-machines) 74 | - ctest -V 75 | -------------------------------------------------------------------------------- /tmpl/volk_cpu.tmpl.c: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | 16 | #if defined(VOLK_CPU_FEATURES) 17 | #include "cpu_features_macros.h" 18 | #if defined(CPU_FEATURES_ARCH_X86) 19 | #include "cpuinfo_x86.h" 20 | #elif defined(CPU_FEATURES_ARCH_ARM) 21 | #include "cpuinfo_arm.h" 22 | #elif defined(CPU_FEATURES_ARCH_AARCH64) 23 | #include "cpuinfo_aarch64.h" 24 | #elif defined(CPU_FEATURES_ARCH_MIPS) 25 | #include "cpuinfo_mips.h" 26 | #elif defined(CPU_FEATURES_ARCH_PPC) 27 | #include "cpuinfo_ppc.h" 28 | #elif defined(CPU_FEATURES_ARCH_RISCV) 29 | #include "cpuinfo_riscv.h" 30 | #endif 31 | 32 | // This is required for MSVC 33 | #if defined(__cplusplus) 34 | using namespace cpu_features; 35 | #endif 36 | #endif 37 | 38 | 39 | struct VOLK_CPU volk_cpu; 40 | /* One probe is generated per arch: it returns 0 as soon as one of the arch's listed cpu_features flags reads 0 for the matching CPU family, and 1 otherwise. NOTE(review): the neon branch is only compiled under CPU_FEATURES_ARCH_ARM, so no feature test is emitted for CPU_FEATURES_ARCH_AARCH64 builds - confirm this is intended. */ 41 | %for arch in archs: 42 | static int i_can_has_${arch.name} (void) { 43 | %for check, params in arch.checks: 44 | %if "neon" in arch.name: 45 | #if defined(CPU_FEATURES_ARCH_ARM) 46 | if (GetArmInfo().features.${check} == 0){ return 0; } 47 | #endif 48 | %elif "mips" in arch.name: 49 | #if defined(CPU_FEATURES_ARCH_MIPS) 50 | if (GetMipsInfo().features.${check} == 0){ return 0; } 51 | #endif 52 | %elif "riscv" in arch.name or arch.name[:2] == "rv": 53 | #if defined(CPU_FEATURES_ARCH_RISCV) 54 | if (GetRiscvInfo().features.${check} == 0){ return 0; } 55 | #endif 56 | %else: 57 | #if defined(CPU_FEATURES_ARCH_X86) 58 | if (GetX86Info().features.${check} == 0){ return 0; } 59 | #endif 60 | %endif 61 | %endfor 62 | return 1; 63 | } 64 | 65 | %endfor 66 | /* NOTE(review): FE_TONEAREST is tested before the #include that follows it, so the fesetround() variant is only selected when an earlier header already defines FE_TONEAREST - confirm. */ 67 | #if defined(HAVE_FENV_H) 68 | #if defined(FE_TONEAREST) 69 | #include 70 | static inline void set_float_rounding(void){ 71 | fesetround(FE_TONEAREST); 72 | } 73 | #else 74 | static inline void set_float_rounding(void){ 75 | //do nothing 76 | } 77 | #endif 78 | #elif defined(_MSC_VER) 79 | #include 80 | static inline void set_float_rounding(void){ 81 | 
unsigned int cwrd; 82 | _controlfp_s(&cwrd, 0, 0); 83 | _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); 84 | } 85 | #else 86 | static inline void set_float_rounding(void){ 87 | //do nothing 88 | } 89 | #endif 90 | 91 | 92 | void volk_cpu_init() { 93 | %for arch in archs: 94 | volk_cpu.has_${arch.name} = &i_can_has_${arch.name}; 95 | %endfor 96 | set_float_rounding(); 97 | } 98 | 99 | unsigned int volk_get_lvarch() { 100 | unsigned int retval = 0; 101 | volk_cpu_init(); 102 | %for arch in archs: 103 | retval += volk_cpu.has_${arch.name}() << LV_${arch.name.upper()}; 104 | %endfor 105 | return retval; 106 | } 107 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)@ 2 | .global volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests 3 | volk_32fc_x2_dot_prod_32fc_a_neonasm_opttests: 4 | push {r4, r5, r6, r7, r8, r9, sl, fp, lr} 5 | vpush {d8-d15} 6 | lsrs fp, r3, #3 7 | sub sp, sp, #52 @ 0x34 8 | mov r9, r3 9 | mov sl, r0 10 | mov r7, r1 11 | mov r8, r2 12 | vorr q0, q7, q7 13 | vorr q1, q7, q7 14 | vorr q2, q7, q7 15 | vorr q3, q7, q7 16 | vorr q4, q7, q7 17 | vorr q5, q7, q7 18 | veor q6, q7, q7 19 | vorr q7, q7, q7 20 | beq .smallvector 21 | mov r4, r1 22 | mov ip, r2 23 | mov r3, #0 24 | .mainloop: 25 | @mov r6, ip 26 | @mov r5, r4 27 | vld4.32 {d24,d26,d28,d30}, [r6]! 28 | @add ip, ip, #64 @ 0x40 29 | @add r4, r4, #64 @ 0x40 30 | vld4.32 {d16,d18,d20,d22}, [r5]! 31 | add r3, r3, #1 32 | vld4.32 {d25,d27,d29,d31}, [r6]! 33 | vld4.32 {d17,d19,d21,d23}, [r5]! 
34 | vmla.f32 q6, q8, q12 35 | vmla.f32 q0, q9, q12 36 | cmp r3, fp 37 | vmls.f32 q5, q13, q9 38 | vmla.f32 q2, q13, q8 39 | vmla.f32 q7, q10, q14 40 | vmla.f32 q1, q11, q14 41 | vmls.f32 q4, q15, q11 42 | vmla.f32 q3, q15, q10 43 | bne .mainloop 44 | lsl r3, fp, #6 45 | add r8, r8, r3 46 | add r7, r7, r3 47 | .smallvector: 48 | vadd.f32 q3, q2, q3 49 | add r3, sp, #16 50 | lsl r4, fp, #3 51 | vadd.f32 q4, q5, q4 52 | cmp r9, r4 53 | vadd.f32 q6, q6, q7 54 | vadd.f32 q1, q0, q1 55 | vadd.f32 q8, q6, q4 56 | vadd.f32 q9, q1, q3 57 | vst2.32 {d16-d19}, [r3 :64] 58 | vldr s15, [sp, #24] 59 | vldr s16, [sp, #16] 60 | vldr s17, [sp, #20] 61 | vadd.f32 s16, s16, s15 62 | vldr s11, [sp, #28] 63 | vldr s12, [sp, #40] @ 0x28 64 | vldr s13, [sp, #44] @ 0x2c 65 | vldr s14, [sp, #32] 66 | vldr s15, [sp, #36] @ 0x24 67 | vadd.f32 s17, s17, s11 68 | vadd.f32 s16, s16, s12 69 | vadd.f32 s17, s17, s13 70 | vadd.f32 s16, s16, s14 71 | vadd.f32 s17, s17, s15 72 | vstr s16, [sl] 73 | vstr s17, [sl, #4] 74 | bls .epilog 75 | add r5, sp, #8 76 | .tailcase: 77 | ldr r3, [r7], #8 78 | mov r0, r5 79 | ldr r1, [r8], #8 80 | add r4, r4, #1 81 | ldr ip, [r7, #-4] 82 | ldr r2, [r8, #-4] 83 | str ip, [sp] 84 | bl __mulsc3 85 | vldr s14, [sp, #8] 86 | vldr s15, [sp, #12] 87 | vadd.f32 s16, s16, s14 88 | cmp r4, r9 89 | vadd.f32 s17, s17, s15 90 | vstr s16, [sl] 91 | vstr s17, [sl, #4] 92 | bne .tailcase 93 | .epilog: 94 | add sp, sp, #52 @ 0x34 95 | vpop {d8-d15} 96 | pop {r4, r5, r6, r7, r8, r9, sl, fp, pc} 97 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32fc_32f_dot_prod_32fc_a_neonasm.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32fc_32f_dot_prod_32fc_a_neonasm ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) { 2 | .global volk_32fc_32f_dot_prod_32fc_a_neonasm 3 | volk_32fc_32f_dot_prod_32fc_a_neonasm: 4 | @ r0 - 
result: pointer to output array (32fc) 5 | @ r1 - input: pointer to input array 1 (32fc) 6 | @ r2 - taps: pointer to input array 2 (32f) 7 | @ r3 - num_points: number of items to process 8 | 9 | result .req r0 10 | input .req r1 11 | taps .req r2 12 | num_points .req r3 13 | quarterPoints .req r7 14 | number .req r8 15 | @ Note that according to the ARM EABI (AAPCS) Section 5.1.1: 16 | @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls; 17 | @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved 18 | @ registers d16-d31 (q8-q15), if present, do not need to be preserved. 19 | realAccQ .req q0 @ d0-d1/s0-s3 20 | compAccQ .req q1 @ d2-d3/s4-s7 21 | realAccS .req s0 @ d0[0] 22 | compAccS .req s4 @ d2[0] 23 | tapsVal .req q2 @ d4-d5 24 | outVal .req q3 @ d6-d7 25 | realMul .req q8 @ d16-d17 26 | compMul .req q9 @ d18-d19 27 | inRealVal .req q10 @ d20-d21 28 | inCompVal .req q11 @ d22-d23 29 | 30 | stmfd sp!, {r7, r8, sl} @ prologue - save register states 31 | 32 | veor realAccQ, realAccQ @ zero out accumulators 33 | veor compAccQ, compAccQ @ zero out accumulators 34 | movs quarterPoints, num_points, lsr #2 @ quarterPoints = num_points / 4; Z flag is set when that is zero 35 | mov number, quarterPoints @ initialize the counter BEFORE branching: .loop2 compares against it, and a plain mov leaves the flags from movs untouched 36 | beq .loop2 @ if zero into quarterPoints 37 | 38 | 39 | .loop1: 40 | @ do work here 41 | @pld [taps, #128] @ pre-load hint - this is implementation specific! 42 | @pld [input, #128] @ pre-load hint - this is implementation specific! 43 | vld1.32 {d4-d5}, [taps:128]! @ tapsVal 44 | vld2.32 {d20-d23}, [input:128]! 
@ inRealVal, inCompVal 45 | vmul.f32 realMul, tapsVal, inRealVal 46 | vmul.f32 compMul, tapsVal, inCompVal 47 | vadd.f32 realAccQ, realAccQ, realMul 48 | vadd.f32 compAccQ, compAccQ, compMul 49 | subs number, number, #1 50 | bne .loop1 @ first loop 51 | 52 | @ Sum up across realAccQ and compAccQ 53 | vpadd.f32 d0, d0, d1 @ realAccQ +-> d0 54 | vpadd.f32 d2, d2, d3 @ compAccQ +-> d2 55 | vadd.f32 realAccS, s0, s1 @ sum the contents of d0 together (realAccQ) 56 | vadd.f32 compAccS, s4, s5 @ sum the contents of d2 together (compAccQ) 57 | @ critical values are now in s0 (realAccS), s4 (realAccQ) 58 | mov number, quarterPoints, asl #2 59 | 60 | .loop2: 61 | cmp num_points, number 62 | bls .done 63 | 64 | vld1.32 {d4[0]}, [taps]! @ s8 65 | vld2.32 {d5[0],d6[0]}, [input]! @ s10, s12 66 | vmul.f32 s5, s8, s10 67 | vmul.f32 s6, s8, s12 68 | vadd.f32 realAccS, realAccS, s5 69 | vadd.f32 compAccS, compAccS, s6 70 | 71 | add number, number, #1 72 | b .loop2 73 | 74 | .done: 75 | vst1.32 {d0[0]}, [result]! @ realAccS 76 | vst1.32 {d2[0]}, [result] @ compAccS 77 | 78 | ldmfd sp!, {r7, r8, sl} @ epilogue - restore register states 79 | bx lr 80 | -------------------------------------------------------------------------------- /include/volk/volk_complex.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2010, 2011, 2015, 2018, 2020, 2021 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_COMPLEX_H 11 | #define INCLUDED_VOLK_COMPLEX_H 12 | 13 | /*! 14 | * \brief Provide typedefs and operators for all complex types in C and C++. 15 | * 16 | * The typedefs encompass all signed integer and floating point types. 17 | * Each operator function is intended to work across all data types. 18 | * Under C++, these operators are defined as inline templates. 
19 | * Under C, these operators are defined as preprocessor macros. 20 | * The use of macros makes the operators agnostic to the type. 21 | * 22 | * The following operator functions are defined: 23 | * - lv_cmake - make a complex type from components 24 | * - lv_creal - get the real part of the complex number 25 | * - lv_cimag - get the imaginary part of the complex number 26 | * - lv_conj - take the conjugate of the complex number 27 | */ 28 | 29 | #ifdef __cplusplus 30 | 31 | #include 32 | #include 33 | 34 | typedef std::complex lv_8sc_t; 35 | typedef std::complex lv_16sc_t; 36 | typedef std::complex lv_32sc_t; 37 | typedef std::complex lv_64sc_t; 38 | typedef std::complex lv_32fc_t; 39 | typedef std::complex lv_64fc_t; 40 | 41 | template 42 | inline std::complex lv_cmake(const T& r, const T& i) 43 | { 44 | return std::complex(r, i); 45 | } 46 | 47 | template 48 | inline typename T::value_type lv_creal(const T& x) 49 | { 50 | return x.real(); 51 | } 52 | 53 | template 54 | inline typename T::value_type lv_cimag(const T& x) 55 | { 56 | return x.imag(); 57 | } 58 | 59 | template 60 | inline T lv_conj(const T& x) 61 | { 62 | return std::conj(x); 63 | } 64 | 65 | #else /* __cplusplus */ 66 | 67 | #include 68 | #include 69 | 70 | typedef char complex lv_8sc_t; 71 | typedef short complex lv_16sc_t; 72 | typedef long complex lv_32sc_t; 73 | typedef long long complex lv_64sc_t; 74 | typedef float complex lv_32fc_t; 75 | typedef double complex lv_64fc_t; 76 | 77 | #define lv_cmake(r, i) ((r) + _Complex_I * (i)) 78 | 79 | // When GNUC is available, use the complex extensions. 80 | // The extensions always return the correct value type. 
class volk_modtool_config(object):
    """Configuration for volk_modtool, backed by a ``RawConfigParser``.

    The ``config`` section holds the keys listed in ``config_defaults``
    (name, destination, base).  Each key has a *remap* template and a
    *verify* template: small Python expressions in which ``$N`` is replaced
    by the N-th value of the section and ``$kN`` by the N-th key.  The
    expanded expressions are run with ``eval``.
    """

    def key_val_sub(self, num, stuff, section):
        """Expand the ``$<num>`` and ``$k<num>`` placeholders of one template.

        :param num: index of the template in ``section[1]`` and of the
            (key, value) pair in ``stuff``.
        :param stuff: list of (key, value) pairs, as returned by
            ``RawConfigParser.items``.
        :param section: ``(section_name, [template, ...])`` tuple.
        :returns: the template with the value and key substituted in.
        """
        key, value = stuff[num][0], stuff[num][1]
        # Callables are used as replacements so that backslashes in the
        # configured values (e.g. Windows paths) are inserted literally
        # instead of being parsed as regex replacement escapes.
        with_value = re.sub(r'\$' + str(num), lambda _match: value, section[1][num])
        return re.sub(r'\$' + 'k' + str(num), lambda _match: key, with_value)

    def verify(self):
        """Run every verification template; raises if one of them fails."""
        for i in self.verification:
            self.verify_section(i)

    def remap(self):
        """Run every remap template (these rewrite values in ``self.cfg``)."""
        for i in self.remapification:
            self.verify_section(i)

    def verify_section(self, section):
        """Evaluate each template of ``section`` exactly once.

        :param section: ``(section_name, [template, ...])`` tuple.
        :raises ValueError: a template evaluated to ``False`` (or raised
            ``ValueError`` itself).
        :raises IOError: a template raised any other exception.
        """
        stuff = self.cfg.items(section[0])
        for i in range(len(section[1])):
            try:
                # SECURITY NOTE: the expression embeds values read from the
                # config file / user prompt and is passed to eval(); a crafted
                # value can execute arbitrary code.  Only use trusted configs.
                # `self`, `os` and `re` must stay visible here because the
                # templates reference them.
                val = eval(self.key_val_sub(i, stuff, section))
                # NOTE(review): only a literal False fails.  re.match()
                # returning None (no match) passes, so the name check in
                # config_defaults_verify never rejects anything.  The remap
                # templates rely on None passing (cfg.set() returns None), so
                # this is left unchanged - confirm the intended behaviour.
                if val == False:
                    raise ValueError
            except ValueError:
                raise ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))
            except Exception:
                raise IOError('bad configuration... key:%s, val:%s'%(stuff[i][0], stuff[i][1]))

    def __init__(self, cfg=None):
        """Load the configuration, or prompt for it if no file exists.

        :param cfg: path of a config file.  When omitted,
            ``./volk_modtool.cfg`` is used if it exists; otherwise the user
            is prompted for each default key.
        """
        self.config_name = 'config'
        self.config_defaults = ['name', 'destination', 'base']
        # One template per default key; '1' is a no-op for `name`.
        self.config_defaults_remap = ['1',
                                      'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
                                      'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']

        self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')',
                                       'os.path.exists(\'$1\')',
                                       'os.path.exists(\'$2\')']
        self.remapification = [(self.config_name, self.config_defaults_remap)]
        self.verification = [(self.config_name, self.config_defaults_verify)]
        default = os.path.join(os.getcwd(), 'volk_modtool.cfg')
        icfg = configparser.RawConfigParser()
        if cfg:
            icfg.read(cfg)
        elif os.path.exists(default):
            icfg.read(default)
        else:
            print("Initializing config file...")
            icfg.add_section(self.config_name)
            for kn in self.config_defaults:
                rv = input("%s: "%(kn))
                icfg.set(self.config_name, kn, rv)
        self.cfg = icfg
        self.remap()
        self.verify()

    def read_map(self, name, inp):
        """Replace section ``name`` with the key/value pairs of ``inp``."""
        if self.cfg.has_section(name):
            self.cfg.remove_section(name)
        self.cfg.add_section(name)
        for i in inp:
            self.cfg.set(name, i, inp[i])

    def get_map(self, name):
        """Return section ``name`` as a plain ``dict``."""
        retval = {}
        for key, value in self.cfg.items(name):
            retval[key] = value
        return retval
/*
 * Write dir followed by suffix into path.
 *
 * path must point to at least 512 bytes; volk_get_config_path has always
 * assumed that capacity. Returns true when the joined string (including the
 * terminating NUL) fits. When it does not fit, path is set to the empty
 * string and false is returned, so nothing is ever written past the buffer.
 */
static bool volk_join_config_path(char* path, const char* dir, const char* suffix)
{
    enum { path_capacity = 512 };
    const int needed = snprintf(path, path_capacity, "%s%s", dir, suffix);
    if (needed < 0 || needed >= path_capacity) {
        path[0] = 0;
        return false;
    }
    return true;
}

/*
 * Locate the VOLK config file.
 *
 * Candidates are tried in this order:
 *   1. $VOLK_CONFIGPATH/volk/volk_config
 *   2. $HOME/.volk/volk_config
 *   3. $APPDATA/.volk/volk_config  (Windows)
 *   4. /etc/volk/volk_config       (only when that file exists)
 *
 * path: caller-provided buffer of at least 512 bytes that receives the
 *       result; a NULL pointer is ignored.
 * read: when true, a candidate is accepted only if the file exists; when
 *       false, the first candidate whose base directory is known is returned.
 *
 * If no candidate qualifies, path is set to the empty string. A candidate
 * that does not fit into the buffer is skipped.
 */
void volk_get_config_path(char* path, bool read)
{
    if (!path)
        return;
    const char* suffix = "/.volk/volk_config";
    const char* suffix2 = "/volk/volk_config"; // non-hidden
    const char* home = NULL;

    // allows config redirection via env variable
    home = getenv("VOLK_CONFIGPATH");
    if (home != NULL && volk_join_config_path(path, home, suffix2)) {
        if (!read || access(path, F_OK) != -1) {
            return;
        }
    }

    // check for user-local config file
    home = getenv("HOME");
    if (home != NULL && volk_join_config_path(path, home, suffix)) {
        if (!read || (access(path, F_OK) != -1)) {
            return;
        }
    }

    // check for config file in APPDATA (Windows)
    home = getenv("APPDATA");
    if (home != NULL && volk_join_config_path(path, home, suffix)) {
        if (!read || (access(path, F_OK) != -1)) {
            return;
        }
    }

    // check for system-wide config file
    if (access("/etc/volk/volk_config", F_OK) != -1) {
        if (volk_join_config_path(path, "/etc", suffix2) &&
            (!read || (access(path, F_OK) != -1))) {
            return;
        }
    }

    // If still no path was found, return the empty string
    path[0] = 0;
    return;
}
(!config_file) 88 | return n_arch_prefs; // no prefs found 89 | 90 | // reset the file pointer and write the prefs into volk_arch_prefs 91 | while (fgets(line, sizeof(line), config_file) != NULL) { 92 | void* new_prefs = realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); 93 | if (!new_prefs) { 94 | printf("volk_load_preferences: bad malloc\n"); 95 | break; 96 | } 97 | prefs = (volk_arch_pref_t*)new_prefs; 98 | volk_arch_pref_t* p = prefs + n_arch_prefs; 99 | if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && 100 | !strncmp(p->name, "volk_", 5)) { 101 | n_arch_prefs++; 102 | } 103 | } 104 | fclose(config_file); 105 | *prefs_res = prefs; 106 | return n_arch_prefs; 107 | } 108 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlinesLeft: true 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: false 16 | AllowShortLoopsOnASingleLine: false 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: false 20 | AlwaysBreakTemplateDeclarations: true 21 | BinPackArguments: false 22 | BinPackParameters: false 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterClass: true 26 | AfterControlStatement: false 27 | AfterEnum: false 28 | AfterFunction: true 29 | AfterNamespace: false 30 | AfterObjCDeclaration: false 31 | AfterStruct: false 32 | AfterUnion: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: 
false 36 | BreakBeforeBinaryOperators: None 37 | BreakBeforeTernaryOperators: true 38 | BreakConstructorInitializersBeforeComma: false 39 | BreakAfterJavaFieldAnnotations: false 40 | BreakStringLiterals: true 41 | ColumnLimit: 90 42 | CommentPragmas: '^ IWYU pragma:' 43 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 44 | ConstructorInitializerIndentWidth: 4 45 | ContinuationIndentWidth: 4 46 | Cpp11BracedListStyle: false 47 | DerivePointerAlignment: false 48 | DisableFormat: false 49 | ExperimentalAutoDetectBinPacking: false 50 | ForEachMacros: 51 | - foreach 52 | - Q_FOREACH 53 | - BOOST_FOREACH 54 | IncludeCategories: 55 | - Regex: '^"(gnuradio)/' 56 | Priority: 1 57 | - Regex: '^<(gnuradio)/' 58 | Priority: 2 59 | - Regex: '^<(boost)/' 60 | Priority: 98 61 | - Regex: '^<[a-z]*>$' 62 | Priority: 99 63 | - Regex: '^".*"$' 64 | Priority: 0 65 | - Regex: '.*' 66 | Priority: 10 67 | 68 | IncludeIsMainRegex: '(Test)?$' 69 | IndentCaseLabels: false 70 | IndentWidth: 4 71 | IndentWrappedFunctionNames: false 72 | JavaScriptQuotes: Leave 73 | JavaScriptWrapImports: true 74 | KeepEmptyLinesAtTheStartOfBlocks: true 75 | MacroBlockBegin: '' 76 | MacroBlockEnd: '' 77 | MaxEmptyLinesToKeep: 2 78 | NamespaceIndentation: None 79 | ObjCBlockIndentWidth: 2 80 | ObjCSpaceAfterProperty: false 81 | ObjCSpaceBeforeProtocolList: true 82 | PenaltyBreakBeforeFirstCallParameter: 19 83 | PenaltyBreakComment: 300 84 | PenaltyBreakFirstLessLess: 120 85 | PenaltyBreakString: 1000 86 | PenaltyExcessCharacter: 1000000 87 | PenaltyReturnTypeOnItsOwnLine: 60 88 | PointerAlignment: Left 89 | ReflowComments: true 90 | SortIncludes: true 91 | SpaceAfterCStyleCast: false 92 | SpaceAfterTemplateKeyword: true 93 | SpaceBeforeAssignmentOperators: true 94 | SpaceBeforeParens: ControlStatements 95 | SpaceInEmptyParentheses: false 96 | SpacesBeforeTrailingComments: 1 97 | SpacesInAngles: false 98 | SpacesInContainerLiterals: true 99 | SpacesInCStyleCastParentheses: false 100 | SpacesInParentheses: 
false 101 | SpacesInSquareBrackets: false 102 | Standard: Cpp11 103 | TabWidth: 8 104 | UseTab: Never 105 | 106 | 107 | -------------------------------------------------------------------------------- /kernels/volk/volk_32u_popcnt.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /*! 11 | * \page volk_32u_popcnt 12 | * 13 | * \b Overview 14 | * 15 | * Computes the population count (popcnt), or Hamming distance of a 16 | * binary string. This kernel takes in a single unsigned 32-bit value 17 | * and returns the count of 1's that the value contains. 18 | * 19 | * Dispatcher Prototype 20 | * \code 21 | * void volk_32u_popcnt(uint32_t* ret, const uint32_t value) 22 | * \endcode 23 | * 24 | * \b Inputs 25 | * \li value: The input value. 26 | * 27 | * \b Outputs 28 | * \li ret: The return value containing the popcnt. 
/*
 * Portable population count: stores the number of set bits of `value`
 * in `*ret`.
 *
 * Works by summing adjacent bit fields in parallel: first 1-bit fields
 * into 2-bit sums, then 2-bit into 4-bit, then 4-bit into per-byte counts,
 * and finally the four byte counts into the low six bits.
 */
static inline void volk_32u_popcnt_generic(uint32_t* ret, const uint32_t value)
{
    const uint32_t even_bits = 0x55555555;
    const uint32_t even_pairs = 0x33333333;
    const uint32_t low_nibbles = 0x0F0F0F0F;

    // 16 two-bit fields, each holding the count of its two source bits
    const uint32_t pair_counts = (value & even_bits) + ((value >> 1) & even_bits);

    // 8 four-bit fields, each holding a count in 0..4
    const uint32_t nibble_counts =
        (pair_counts & even_pairs) + ((pair_counts >> 2) & even_pairs);

    // 4 bytes, each holding a count in 0..8
    const uint32_t byte_counts = (nibble_counts + (nibble_counts >> 4)) & low_nibbles;

    // fold the bytes together; only the low 6 bits of the total are meaningful
    const uint32_t half_sums = byte_counts + (byte_counts >> 8);
    *ret = (half_sums + (half_sums >> 16)) & 0x0000003F;
}
#include 100 | 101 | static inline void volk_32u_popcnt_rvv(uint32_t* ret, const uint32_t value) 102 | { 103 | *ret = __riscv_vcpop(__riscv_vreinterpret_b4(__riscv_vmv_s_x_u64m1(value, 1)), 32); 104 | } 105 | #endif /*LV_HAVE_RVV*/ 106 | 107 | #ifdef LV_HAVE_RVA22V 108 | #include 109 | 110 | static inline void volk_32u_popcnt_rva22(uint32_t* ret, const uint32_t value) 111 | { 112 | *ret = __riscv_cpop_32(value); 113 | } 114 | #endif /*LV_HAVE_RVA22V*/ 115 | 116 | #endif /*INCLUDED_VOLK_32u_POPCNT_A16_H*/ 117 | -------------------------------------------------------------------------------- /kernels/volk/volk_16i_max_star_16i.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2012, 2014 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /*! 11 | * \page volk_16i_max_star_16i 12 | * 13 | * \b Deprecation 14 | * 15 | * This kernel is deprecated. 16 | * 17 | * \b Overview 18 | * 19 | * 20 | * 21 | * Dispatcher Prototype 22 | * \code 23 | * void volk_16i_max_star_16i(short* target, short* src0, unsigned int num_points); 24 | * \endcode 25 | * 26 | * \b Inputs 27 | * \li src0: The input vector. 28 | * \li num_points: The number of complex data points. 29 | * 30 | * \b Outputs 31 | * \li target: The output value of the max* operation. 
/*
 * Aligned SIMD implementation of the (deprecated) max* kernel.
 *
 * target:     receives the result in target[0].
 * src0:       input samples; must be 16-byte aligned (aligned vector loads)
 *             and contain at least one element, as src0[0] seeds the search.
 * num_points: number of samples in src0.
 *
 * Full groups of eight samples are reduced lane-wise with a signed 16-bit
 * maximum. The eight lane results and the remaining (num_points % 8)
 * samples are then folded with the same comparison the generic
 * implementation uses.
 *
 * The running maximum is seeded with src0[0]. It used to be seeded with
 * zero, which made the kernel return 0 whenever every sample was negative
 * or fewer than eight samples were supplied.
 *
 * NOTE(review): the vector part uses a plain signed maximum, whereas the
 * scalar comparison `(short)(a - b) > 0` wraps around. The two can disagree
 * when samples differ by more than 32767; this was already the case before
 * this change - confirm which semantics are intended.
 */
static inline void
volk_16i_max_star_16i_a_ssse3(short* target, short* src0, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    const unsigned int leftovers = num_points % 8;
    const __m128i* p_src0 = (const __m128i*)src0;

    short candidate = src0[0];
    short cands[8];
    unsigned int i;

    // every lane starts at a real sample, so no phantom value can win
    __m128i running_max = _mm_set1_epi16(candidate);

    for (i = 0; i < eighth_points; ++i) {
        running_max = _mm_max_epi16(running_max, _mm_load_si128(p_src0));
        p_src0 += 1;
    }

    // cands is a plain stack array without an alignment guarantee
    _mm_storeu_si128((__m128i*)cands, running_max);

    for (i = 0; i < 8; ++i) {
        candidate = ((short)(candidate - cands[i]) > 0) ? candidate : cands[i];
    }

    for (i = 0; i < leftovers; ++i) {
        const short sample = src0[(eighth_points << 3) + i];
        candidate = ((short)(candidate - sample) > 0) ? candidate : sample;
    }

    target[0] = candidate;
}
44 | with section("markup"): 45 | first_comment_is_literal = True 46 | enable_markup = False 47 | 48 | with section("format"): 49 | # Disable formatting entirely, making cmake-format a no-op 50 | disable = False 51 | 52 | # How wide to allow formatted cmake files 53 | line_width = int(_clang_format["ColumnLimit"]) 54 | 55 | # How many spaces to tab for indent 56 | tab_size = int(_clang_format["IndentWidth"]) 57 | 58 | # If true, lines are indented using tab characters (utf-8 0x09) instead of 59 | # space characters (utf-8 0x20). In cases where the layout would 60 | # require a fractional tab character, the behavior of the fractional 61 | # indentation is governed by 62 | use_tabchars = _clang_format["UseTab"] in ("ForIndentation", 63 | "ForContinuationAndIndentation", 64 | "Always") 65 | 66 | # If true, separate flow control names from their parentheses with a space 67 | separate_ctrl_name_with_space = False 68 | 69 | # If true, separate function names from parentheses with a space 70 | separate_fn_name_with_space = False 71 | 72 | # If a statement is wrapped to more than one line, than dangle the closing 73 | # parenthesis on its own line. 74 | dangle_parens = False 75 | 76 | # If the statement spelling length (including space and parenthesis) is 77 | # smaller than this amount, then force reject nested layouts. 78 | min_prefix_chars = tab_size 79 | 80 | # If the statement spelling length (including space and parenthesis) is larger 81 | # than the tab width by more than this amount, then force reject un-nested 82 | # layouts. 83 | max_prefix_chars = 3 * tab_size 84 | 85 | # What style line endings to use in the output. 
86 | line_ending = "unix" 87 | 88 | # Format command names consistently as 'lower' or 'upper' case 89 | command_case = "canonical" 90 | 91 | # Format keywords consistently as 'lower' or 'upper' case 92 | keyword_case = "upper" 93 | 94 | 95 | with section("lint"): 96 | max_arguments = 6 97 | max_localvars = 20 98 | max_statements = 75 99 | -------------------------------------------------------------------------------- /tests/test_volk_32fc_x2_multiply_32fc.cc: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2022 Johannes Demel 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include "volk_test.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | class volk_32fc_x2_multiply_32fc_test : public VolkTest 21 | { 22 | protected: 23 | void SetUp() override 24 | { 25 | initialize_test(GetParam()); 26 | initialize_data(vector_length); 27 | } 28 | 29 | void initialize_data(const size_t length) 30 | { 31 | vector_length = length; 32 | vec0 = volk::vector(length); 33 | vec1 = volk::vector(length); 34 | result = volk::vector(length); 35 | for (size_t i = 0; i < length; ++i) { 36 | vec0[i] = std::complex(i * 3.14, i * 0.45); 37 | vec1[i] = std::complex(i * -2.78, i * 5.44); 38 | } 39 | 40 | expected = volk::vector(length); 41 | for (size_t i = 0; i < length; ++i) { 42 | expected[i] = vec0[i] * vec1[i]; 43 | } 44 | 45 | // This is a hacky solution to have unaligned tests. 
@ static inline void volk_32fc_32f_dot_prod_32fc_a_neonpipeline ( lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
@
@ Dot product of an interleaved complex float vector with a real float vector.
@ Same computation as the plain neonasm variant, but the loads for the next
@ group of four points are issued before the current group's result is
@ needed (software pipelining).
@ The vector loads use :128 alignment hints, so `input` and `taps` must be
@ 16-byte aligned.
    .global volk_32fc_32f_dot_prod_32fc_a_neonpipeline
volk_32fc_32f_dot_prod_32fc_a_neonpipeline:
    @ r0 - result: pointer to output array (32fc)
    @ r1 - input: pointer to input array 1 (32fc)
    @ r2 - taps: pointer to input array 2 (32f)
    @ r3 - num_points: number of items to process

    result        .req r0
    input         .req r1
    taps          .req r2
    num_points    .req r3
    quarterPoints .req r7
    number        .req r8
    @ Note that according to the ARM EABI (AAPCS) Section 5.1.1:
    @ registers s16-s31 (d8-d15, q4-q7) must be preserved across subroutine calls;
    @ registers s0-s15 (d0-d7, q0-q3) do not need to be preserved
    @ registers d16-d31 (q8-q15), if present, do not need to be preserved.
    realAccQ   .req q0  @ d0-d1/s0-s3
    compAccQ   .req q1  @ d2-d3/s4-s7
    realAccS   .req s0  @ d0[0]
    compAccS   .req s4  @ d2[0]
    tapsVal    .req q2  @ d4-d5
    outVal     .req q3  @ d6-d7 (alias is not referenced below)
    realMul    .req q8  @ d16-d17
    compMul    .req q9  @ d18-d19
    inRealVal  .req q10 @ d20-d21
    inCompVal  .req q11 @ d22-d23

    stmfd   sp!, {r7, r8, sl}       @ prologue - save register states

    pld     [taps, #128]            @ pre-load hint - this is implementation specific!
    pld     [input, #128]           @ pre-load hint - this is implementation specific!

    veor    realAccQ, realAccQ      @ zero out accumulators
    veor    compAccQ, compAccQ      @ zero out accumulators
    @ `number` must be 0 when the vector part is skipped; it used to be left
    @ uninitialised on that path. `mov` does not touch the flags set by movs.
    mov     number, #0
    movs    quarterPoints, num_points, lsr #2
    beq     .tailcheck              @ fewer than 4 points: scalar tail only

    @ Optimizing for pipeline: load the first group before entering the loop
    vld1.32 {tapsVal}, [taps:128]!                  @ tapsVal
    vld2.32 {inRealVal-inCompVal}, [input:128]!     @ inRealVal, inCompVal
    mov     number, #1              @ number of groups loaded so far
    cmp     number, quarterPoints
    @ With a single group there is nothing left to prefetch. Entering the loop
    @ anyway would load and accumulate one group too many.
    bhs     .drain

.loop1:
    @ do work here
    pld     [taps, #128]            @ pre-load hint - this is implementation specific!
    pld     [input, #128]           @ pre-load hint - this is implementation specific!
    vmul.f32 realMul, tapsVal, inRealVal
    vmul.f32 compMul, tapsVal, inCompVal
    vadd.f32 realAccQ, realAccQ, realMul
    vadd.f32 compAccQ, compAccQ, compMul
    vld1.32 {tapsVal}, [taps:128]!                  @ tapsVal (next group)
    vld2.32 {inRealVal-inCompVal}, [input:128]!     @ inRealVal, inCompVal (next group)

    add     number, number, #1
    cmp     number, quarterPoints
    blo     .loop1                  @ counts are unsigned

.drain:
    @ accumulate the group that was loaded last
    vmul.f32 realMul, tapsVal, inRealVal
    vmul.f32 compMul, tapsVal, inCompVal
    vadd.f32 realAccQ, realAccQ, realMul
    vadd.f32 compAccQ, compAccQ, compMul

    @ Sum up across realAccQ and compAccQ
    vadd.f32 d0, d0, d1             @ realAccQ +-> d0
    vadd.f32 d2, d2, d3             @ compAccQ +-> d2
    vadd.f32 realAccS, s0, s1       @ sum the contents of d0 together (realAccQ)
    vadd.f32 compAccS, s4, s5       @ sum the contents of d2 together (compAccQ)

    @ critical values are now in s0 (realAccS), s4 (compAccS)
    mov     number, quarterPoints, asl #2   @ index of the first unprocessed point

.tailcheck:
    cmp     number, num_points
    bhs     .done                   @ nothing left (also covers num_points == 0)

.loop2:
    vld1.32 {d4[0]}, [taps]!            @ s8
    vld2.32 {d5[0],d6[0]}, [input]!     @ s10, s12
    vmul.f32 s5, s8, s10                @ s5/s6 are free scratch after the reduction
    vmul.f32 s6, s8, s12
    vadd.f32 realAccS, realAccS, s5
    vadd.f32 compAccS, compAccS, s6

    add     number, number, #1
    cmp     number, num_points
    blo     .loop2

.done:
    vst1.32 {d0[0]}, [result]!      @ realAccS
    vst1.32 {d2[0]}, [result]       @ compAccS

    ldmfd   sp!, {r7, r8, sl}       @ epilogue - restore register states
    bx      lr
@ q4-q7 33 | @pld [r4, #64] 34 | vmla.f32 q12, q4, q0 @ real (re*re) 35 | vmla.f32 q14, q4, q1 @ imag (re*im) 36 | vmls.f32 q15, q5, q1 @ real (im*im) 37 | vmla.f32 q13, q5, q0 @ imag (im*re) 38 | 39 | vmla.f32 q8, q2, q6 @ real (re*re) 40 | vmla.f32 q9, q2, q7 @ imag (re*im) 41 | vmls.f32 q10, q3, q7 @ real (im*im) 42 | vmla.f32 q11, q3, q6 @ imag (im*re) 43 | bne .loop1 44 | lsl r2, r3, #3 @ r2 = eighth_points * 8 45 | add r6, r6, r2 @ bVec = bVec + eighth_points -- whyyyyy gcc?!? 46 | add r5, r5, r2 @ aVec = aVec + eighth_points 47 | @ q12-q13 were original real accumulators 48 | @ q14-q15 were original imag accumulators 49 | @ reduce 8 accumulators down to 2 (1 real, 1 imag) 50 | vadd.f32 q8, q10, q8 @ real + real 51 | vadd.f32 q11, q11, q9 @ imag + imag 52 | vadd.f32 q12, q12, q15 @ real + real 53 | vadd.f32 q14, q14, q13 @ imag + imag 54 | vadd.f32 q8, q8, q12 55 | vadd.f32 q9, q9, q14 56 | .smallvector: 57 | lsl r4, r3, #3 58 | cmp r8, r4 59 | vst2.32 {d16-d19}, [sp :64] @ whaaaaat? no way this is necessary! 60 | vldr s15, [sp, #8] 61 | vldr s17, [sp] 62 | vldr s16, [sp, #4] 63 | vadd.f32 s17, s17, s15 64 | vldr s11, [sp, #12] 65 | vldr s12, [sp, #24] 66 | vldr s13, [sp, #28] 67 | vldr s14, [sp, #16] 68 | vldr s15, [sp, #20] 69 | vadd.f32 s16, s16, s11 70 | vadd.f32 s17, s17, s12 71 | vadd.f32 s16, s16, s13 72 | vadd.f32 s17, s17, s14 73 | vadd.f32 s16, s16, s15 74 | vstr s17, [r7] 75 | vstr s16, [r7, #4] 76 | bls .done 77 | .loop2: 78 | mov r3, r6 79 | add r6, r6, #8 80 | vldr s0, [r3] 81 | vldr s1, [r6, #-4] 82 | mov r3, r5 83 | add r5, r5, #8 84 | vldr s2, [r3] 85 | vldr s3, [r5, #-4] 86 | bl __mulsc3 @ GCC/Clang built-in. Portability? 
87 | add r4, r4, #1 88 | cmp r4, r8 89 | vadd.f32 s17, s17, s0 90 | vadd.f32 s16, s16, s1 91 | vstr s17, [r7] 92 | vstr s16, [r7, #4] 93 | bne .loop2 94 | .done: 95 | vpop {q8-q15} 96 | vpop {q0-q7} 97 | pop {r4, r5, r6, r7, r8, pc} 98 | 99 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to VOLK 2 | 3 | Welcome! You are reading about how to contribute code to VOLK. First of 4 | all, we are very happy that you're about to contribute, and welcome your 5 | submissions! We hope many more will come. 6 | 7 | In this document, we will explain the main things to consider when submitting 8 | pull requests against VOLK. Reading this first will help a lot with 9 | streamlining the process of getting your code merged. 10 | 11 | There is also a [wiki-based version of this file][wikicontrib], which contains 12 | more detail. VOLK is part of the GNU Radio project and as such, it follows the 13 | same contribution guidelines. This file is an [adopted GNU Radio checklist][gnuradiocontrib]. 14 | 15 | ## What about non-code contributions? 16 | 17 | Those are at least as important as code contributions: Emails to the mailing 18 | list, answers on Stack Overflow, Wiki page edits, examples... We very much 19 | appreciate those. However, this document is specifically about contributing 20 | code. 21 | 22 | ## DCO Signed? 23 | 24 | Any code contributions going into VOLK will become part of an LGPL-licensed 25 | (former contributions are GPL-licensed), open source repository. It is therefore 26 | imperative that code submissions belong to the authors, and that submitters have 27 | the authority to merge that code into the public VOLK codebase. 28 | 29 | For that purpose, we use the [Developer's Certificate of Origin](DCO.txt). It 30 | is the same document used by other projects. 
Signing the DCO states that there 31 | are no legal reasons to not merge your code. 32 | 33 | To sign the DCO, suffix your git commits with a "Signed-off-by" line. When 34 | using the command line, you can use `git commit -s` to automatically add this 35 | line. If there were multiple authors of the code, or other types of 36 | stakeholders, make sure that all are listed, each with a separate Signed-off-by 37 | line. 38 | 39 | ## Coding Guidelines 40 | 41 | We have codified our coding guidelines in [GNU Radio GREP1][grep1]. Please read them, 42 | and stick to them. For C/C++ code, use clang-format. For Python, PEP8 is your friend 43 | (but again, check the actual coding guidelines). 44 | 45 | ## Git commit messages are very important 46 | 47 | We follow standard git commit message guidelines, similar to many other open 48 | source projects. See the [coding guidelines][grep1] for more details. In a 49 | nutshell: 50 | - Keep the lines below 72 characters 51 | - Subject line has the component prepended (e.g., `kernelname:`) 52 | - Avoid empty git commit messages 53 | - The git commit message explains the change, the code only explains the current 54 | state 55 | 56 | ## Unit Tests 57 | 58 | VOLK unit tests compare the results of each kernel version to the generic version. 59 | Keep the generic kernel version as simple as possible and verify your optimized 60 | kernels against the generic version. 61 | 62 | ## The Buddy Principle: Submit One, Review One 63 | 64 | When you've submitted a pull request, please take the time to review another 65 | one. This helps make sure that there are always a number of reviews at least 66 | equal to the number of pull requests, which means the maintainers don't get 67 | overwhelmed when a lot is being contributed. 68 | 69 | ## Standard command line options 70 | 71 | When writing programs that are executable from the command line, 72 | please follow existing examples regarding their command line arguments, and 73 | reuse them. 
74 | 75 | [grep1]: https://github.com/gnuradio/greps/blob/master/grep-0001-coding-guidelines.md 76 | [wikicontrib]: https://wiki.gnuradio.org/index.php/Development 77 | [gr-devs]: https://github.com/orgs/gnuradio/teams/gr-devs 78 | [gnuradiocontrib]: https://github.com/gnuradio/gnuradio/blob/master/CONTRIBUTING.md 79 | -------------------------------------------------------------------------------- /tests/volk_test.cc: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2022 Johannes Demel 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | template 19 | ::testing::AssertionResult AreComplexFloatingPointArraysAlmostEqual(const T& expected, 20 | const T& actual) 21 | { 22 | ::testing::AssertionResult result = ::testing::AssertionFailure(); 23 | if (expected.size() != actual.size()) { 24 | return result << "expected result size=" << expected.size() 25 | << " differs from actual size=" << actual.size(); 26 | } 27 | const unsigned long length = expected.size(); 28 | 29 | int errorsFound = 0; 30 | const char* separator = " "; 31 | for (unsigned long index = 0; index < length; index++) { 32 | auto expected_real = ::testing::internal::FloatingPoint(expected[index].real()); 33 | auto expected_imag = ::testing::internal::FloatingPoint(expected[index].imag()); 34 | auto actual_real = ::testing::internal::FloatingPoint(actual[index].real()); 35 | auto actual_imag = ::testing::internal::FloatingPoint(actual[index].imag()); 36 | if (not expected_real.AlmostEquals(actual_real) or 37 | not expected_imag.AlmostEquals(actual_imag)) 38 | 39 | { 40 | if (errorsFound == 0) { 41 | result << "Differences found:"; 42 | } 43 | if (errorsFound < 3) { 44 | result << separator << expected[index] << " != " << actual[index] << " @ " 45 | << index; 46 | separator = ",\n"; 
47 | } 48 | errorsFound++; 49 | } 50 | } 51 | if (errorsFound > 0) { 52 | result << separator << errorsFound << " differences in total"; 53 | return result; 54 | } 55 | return ::testing::AssertionSuccess(); 56 | } 57 | 58 | std::vector get_kernel_implementation_name_list(const volk_func_desc_t desc) 59 | { 60 | std::vector names; 61 | for (size_t i = 0; i < desc.n_impls; i++) { 62 | names.push_back(std::string(desc.impl_names[i])); 63 | } 64 | std::sort(names.begin(), names.end()); 65 | return names; 66 | } 67 | 68 | bool is_aligned_implementation_name(const std::string& name) 69 | { 70 | return name.rfind("a_", 0) == 0; 71 | } 72 | 73 | std::tuple, std::vector> 74 | separate_implementations_by_alignment(const std::vector& names) 75 | { 76 | std::vector aligned; 77 | std::vector unaligned; 78 | for (auto name : names) { 79 | if (is_aligned_implementation_name(name)) { 80 | aligned.push_back(name); 81 | } else { 82 | unaligned.push_back(name); 83 | } 84 | } 85 | return { aligned, unaligned }; 86 | } 87 | 88 | std::vector 89 | get_aligned_kernel_implementation_names(const volk_func_desc_t desc) 90 | { 91 | auto impls = get_kernel_implementation_name_list(desc); 92 | auto [aligned, unaligned] = separate_implementations_by_alignment(impls); 93 | return aligned; 94 | } 95 | 96 | std::vector 97 | get_unaligned_kernel_implementation_names(const volk_func_desc_t desc) 98 | { 99 | auto impls = get_kernel_implementation_name_list(desc); 100 | auto [aligned, unaligned] = separate_implementations_by_alignment(impls); 101 | return unaligned; 102 | } 103 | -------------------------------------------------------------------------------- /.mailmap: -------------------------------------------------------------------------------- 1 | Alexandre Rouma AlexandreRouma 2 | Alexey Slokva alesha72003 3 | Andrey Rodionov dernasherbrezon 4 | 5 | 6 | 7 | Christoph Mayer cmayer 8 | Christoph Mayer hcab14 9 | 10 | 11 | Douglas Geiger Doug 12 | Douglas Geiger Doug Geiger 13 | Douglas Geiger 
Douglas Geiger 14 | 15 | Federico Larroca git-artes 16 | Geof Nieboer gnieboer 17 | Jam M. Hernandez Quiceno Jam Quiceno 18 | 19 | Johannes Demel jdemel 20 | Johannes Demel jdemel 21 | Johannes Demel jdemel 22 | Johannes Demel jdemel 23 | Johannes Demel Johannes Demel 24 | Johannes Demel Johannes Demel 25 | Johannes Demel Johannes Demel 26 | John Sallay jsallay <31416796+jsallay@users.noreply.github.com> 27 | 28 | Marc Lichtman Marc L 29 | 30 | 31 | Marcus Müller Marcus Mueller 32 | <157892+michaelld@users.noreply.github.com> 33 | 34 | Michael Dickens Michael L Dickens 35 | Michael Dickens Micheal Dickens 36 | Mike Piscopo ghostop14 37 | 38 | 39 | 40 | 41 | Nathan West Nathan West 42 | 43 | Nicholas McCarthy namccart 44 | Nicholas McCarthy Nick McCarthy 45 | Nick Foster Nick Foster 46 | Nick Foster Nick Foster 47 | Olaf Bernstein 48 | Pascal Giard Pascal Giard 49 | 50 | Philip Balister root 51 | Philip Balister root 52 | Rick Farina Rick Farina (Zero_Chaos) 53 | <32478819+fritterhoff@users.noreply.github.com> 54 | Ryan Volz Ryan Volz 55 | Sam Lane Sam Lane <9569766+SJ-Innovation@users.noreply.github.com> 56 | 57 | -------------------------------------------------------------------------------- /lib/volk_rank_archs.c: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2012 Free Software Foundation, Inc. 
4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | int volk_get_index(const char* impl_names[], // list of implementations by name 19 | const size_t n_impls, // number of implementations available 20 | const char* impl_name // the implementation name to find 21 | ) 22 | { 23 | unsigned int i; 24 | for (i = 0; i < n_impls; i++) { 25 | if (!strncmp(impl_names[i], impl_name, 20)) { 26 | return i; 27 | } 28 | } 29 | // TODO return -1; 30 | // something terrible should happen here (NOTE: the fallback below recurses without end if no implementation is named "generic") 31 | fprintf(stderr, "Volk warning: no arch found, returning generic impl\n"); 32 | return volk_get_index(impl_names, n_impls, "generic"); // but we'll fake it for now 33 | } 34 | 35 | int volk_rank_archs(const char* kern_name, // name of the kernel to rank 36 | const char* impl_names[], // list of implementations by name 37 | const int* impl_deps, // requirement mask per implementation 38 | const bool* alignment, // alignment status of each implementation 39 | size_t n_impls, // number of implementations available 40 | const bool align // if false, filter aligned implementations 41 | ) 42 | { 43 | size_t i; 44 | static volk_arch_pref_t* volk_arch_prefs; 45 | static size_t n_arch_prefs = 0; 46 | static int prefs_loaded = 0; 47 | if (!prefs_loaded) { 48 | n_arch_prefs = volk_load_preferences(&volk_arch_prefs); 49 | prefs_loaded = 1; 50 | } 51 | 52 | // If we've defined VOLK_GENERIC to be anything, always return the 53 | // 'generic' kernel. Used in GR's QA code. 54 | char* gen_env = getenv("VOLK_GENERIC"); 55 | if (gen_env) { 56 | return volk_get_index(impl_names, n_impls, "generic"); 57 | } 58 | 59 | // If we've defined the kernel name as an environment variable, always return 60 | // the 'overridden' kernel. Used for manually overriding config kernels at runtime. 
61 | char* override_env = getenv(kern_name); 62 | if (override_env) { 63 | return volk_get_index(impl_names, n_impls, override_env); 64 | } 65 | 66 | // now look for the function name in the prefs list 67 | for (i = 0; i < n_arch_prefs; i++) { 68 | if (!strncmp(kern_name, 69 | volk_arch_prefs[i].name, 70 | sizeof(volk_arch_prefs[i].name))) // found it 71 | { 72 | const char* impl_name = 73 | align ? volk_arch_prefs[i].impl_a : volk_arch_prefs[i].impl_u; 74 | return volk_get_index(impl_names, n_impls, impl_name); 75 | } 76 | } 77 | 78 | // return the best index with the largest deps 79 | size_t best_index_a = 0; 80 | size_t best_index_u = 0; 81 | int best_value_a = -1; 82 | int best_value_u = -1; 83 | for (i = 0; i < n_impls; i++) { 84 | const signed val = impl_deps[i]; 85 | if (alignment[i] && val > best_value_a) { 86 | best_index_a = i; 87 | best_value_a = val; 88 | } 89 | if (!alignment[i] && val > best_value_u) { 90 | best_index_u = i; 91 | best_value_u = val; 92 | } 93 | } 94 | 95 | // when align and we found a best aligned, use it 96 | if (align && best_value_a != -1) 97 | return best_index_a; 98 | 99 | // otherwise return the best unaligned 100 | return best_index_u; 101 | } 102 | -------------------------------------------------------------------------------- /kernels/volk/asm/riscv/volk_32fc_x2_dot_prod_32fc_sifive_u74.s: -------------------------------------------------------------------------------- 1 | .text 2 | .align 2 3 | .type volk_32fc_x2_dot_prod_32fc_sifive_u74, @function 4 | .global volk_32fc_x2_dot_prod_32fc_sifive_u74 5 | 6 | # 7 | # RISC-V implementation using only I and F sets. 8 | # About 41% less CPU use than GCC, measured with volk_profile, 9 | # and a test gnuradio graph using Freq XLAT FIR filter. 10 | # 11 | # The generic C code is also 2x unrolled, but its main flaw 12 | # seems to be not properly fusing into fmadd and fnmsub. 13 | # 14 | # Focus of this hand coded assembly: 15 | # * Better use of fused multiply. 
16 | # * Try to maximize space between write and read. 17 | # 18 | # Instruction order has been done manually and benchmarked, 19 | # and may not be optimal. 20 | # 21 | volk_32fc_x2_dot_prod_32fc_sifive_u74: 22 | # a0: out 23 | # a1: in 24 | # a2: taps 25 | # a3: number of points 26 | 27 | # Calculate end of main loop. 28 | and a4,a3,1 29 | xor a4,a3,a4 30 | slli a5,a4,3 31 | add a5,a5,a1 32 | 33 | # Output regs. 34 | fmv.w.x ft0,zero 35 | fmv.w.x ft1,zero 36 | fmv.w.x ft2,zero 37 | fmv.w.x ft3,zero 38 | fmv.w.x ft4,zero 39 | fmv.w.x ft5,zero 40 | fmv.w.x ft6,zero 41 | fmv.w.x ft7,zero 42 | beq a1,a5,.endloop 43 | 44 | # Main loop two complexes at a time. 45 | .loop: 46 | # Load input in order of when it'll be used. 47 | # flw has 2 cycle latency, 1 cycle repeat. 48 | flw ft8,0(a1) # in0 49 | flw ft9,0(a2) # tp0 50 | flw ft10,4(a2) # tp1 51 | flw ft11,4(a1) # in1 52 | 53 | # None of the fused multiple-adds have a write-read stall. 54 | # FMA, like mul and add, have 5 cycle latency, 1 cycle repeat. 55 | fmadd.s ft0,ft8, ft9, ft0 # in0*tp0 56 | flw fa0,8(a1) # in0 57 | fmadd.s ft1,ft8, ft10,ft1 # in0*tp1 58 | flw fa1,8(a2) # tp0 59 | fnmsub.s ft2,ft11,ft10,ft2 # -in1*tp1 60 | flw fa2,12(a2) # tp1 61 | fmadd.s ft3,ft11,ft9, ft3 # in1*tp0 62 | flw fa3,12(a1) # in1 63 | 64 | fmadd.s ft4,fa0,fa1,ft4 # in0*tp0 65 | addi a1,a1,16 # free ride in pipeline A. 66 | fmadd.s ft5,fa0,fa2,ft5 # in0*tp1 67 | addi a2,a2,16 # free ride in pipeline A. 68 | fnmsub.s ft6,fa3,fa2,ft6 # -in1*tp1 69 | fmadd.s ft7,fa3,fa1,ft7 # in1*tp0 70 | bne a1,a5,.loop 71 | 72 | .endloop: 73 | # Check if odd number of inputs. 74 | andi a3,a3,1 75 | beqz a3,.done 76 | 77 | # Do odd one complex. 78 | flw fa0,0(a1) # in0 79 | flw fa1,0(a2) # tp0 80 | flw fa2,4(a2) # tp1 81 | flw fa3,4(a1) # in1 82 | 83 | fmadd.s ft4,fa0,fa1,ft4 # in0*tp0 84 | fmadd.s ft5,fa0,fa2,ft5 # in0*tp1 85 | fnmsub.s ft6,fa3,fa2,ft6 # -in1*tp1 86 | fmadd.s ft7,fa3,fa1,ft7 # in1*tp0 87 | .done: 88 | # Some one-time stalling here. 
89 | # Latency 5, repeat 1. 90 | fadd.s ft0,ft0,ft2 91 | fadd.s ft1,ft1,ft3 92 | fadd.s ft0,ft0,ft4 93 | fadd.s ft1,ft1,ft5 94 | fadd.s ft0,ft0,ft6 95 | fadd.s ft1,ft1,ft7 96 | # fsw has latency 4, repeat 1. 97 | fsw ft0,0(a0) 98 | fsw ft1,4(a0) 99 | ret 100 | 101 | .size volk_32fc_x2_dot_prod_32fc_sifive_u74, .-volk_32fc_x2_dot_prod_32fc_sifive_u74 102 | -------------------------------------------------------------------------------- /kernels/volk/volk_32f_s32f_clamppuppet_32f.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2023 Magnus Lundmark 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_volk_32f_s32f_clamppuppet_32f_H 11 | #define INCLUDED_volk_32f_s32f_clamppuppet_32f_H 12 | 13 | #include 14 | 15 | #ifdef LV_HAVE_GENERIC 16 | static inline void volk_32f_s32f_clamppuppet_32f_generic(float* out, 17 | const float* in, 18 | const float min, 19 | unsigned int num_points) 20 | { 21 | volk_32f_s32f_x2_clamp_32f_generic(out, in, min, -min, num_points); 22 | } 23 | #endif 24 | 25 | #ifdef LV_HAVE_AVX2 26 | static inline void volk_32f_s32f_clamppuppet_32f_a_avx2(float* out, 27 | const float* in, 28 | const float min, 29 | unsigned int num_points) 30 | { 31 | volk_32f_s32f_x2_clamp_32f_a_avx2(out, in, min, -min, num_points); 32 | } 33 | #endif 34 | 35 | #ifdef LV_HAVE_SSE4_1 36 | static inline void volk_32f_s32f_clamppuppet_32f_a_sse4_1(float* out, 37 | const float* in, 38 | const float min, 39 | unsigned int num_points) 40 | { 41 | volk_32f_s32f_x2_clamp_32f_a_sse4_1(out, in, min, -min, num_points); 42 | } 43 | #endif 44 | 45 | #ifdef LV_HAVE_AVX2 46 | static inline void volk_32f_s32f_clamppuppet_32f_u_avx2(float* out, 47 | const float* in, 48 | const float min, 49 | unsigned int num_points) 50 | { 51 | volk_32f_s32f_x2_clamp_32f_u_avx2(out, in, min, -min, num_points); 52 | } 53 | #endif 54 | 55 | #ifdef 
LV_HAVE_SSE4_1 56 | static inline void volk_32f_s32f_clamppuppet_32f_u_sse4_1(float* out, 57 | const float* in, 58 | const float min, 59 | unsigned int num_points) 60 | { 61 | volk_32f_s32f_x2_clamp_32f_u_sse4_1(out, in, min, -min, num_points); 62 | } 63 | #endif 64 | 65 | #ifdef LV_HAVE_NEON 66 | static inline void volk_32f_s32f_clamppuppet_32f_neon(float* out, 67 | const float* in, 68 | const float min, 69 | unsigned int num_points) 70 | { 71 | volk_32f_s32f_x2_clamp_32f_neon(out, in, min, -min, num_points); 72 | } 73 | #endif 74 | 75 | #ifdef LV_HAVE_NEONV8 76 | static inline void volk_32f_s32f_clamppuppet_32f_neonv8(float* out, 77 | const float* in, 78 | const float min, 79 | unsigned int num_points) 80 | { 81 | volk_32f_s32f_x2_clamp_32f_neonv8(out, in, min, -min, num_points); 82 | } 83 | #endif 84 | 85 | #ifdef LV_HAVE_RVV 86 | static inline void volk_32f_s32f_clamppuppet_32f_rvv(float* out, 87 | const float* in, 88 | const float min, 89 | unsigned int num_points) 90 | { 91 | volk_32f_s32f_x2_clamp_32f_rvv(out, in, min, -min, num_points); 92 | } 93 | #endif 94 | 95 | #endif /* INCLUDED_volk_32f_s32f_clamppuppet_32f_H */ 96 | -------------------------------------------------------------------------------- /include/volk/volk_rvv_intrinsics.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2024 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | /* 11 | * This file is intended to hold RVV intrinsics of intrinsics. 12 | * They should be used in VOLK kernels to avoid copy-paste. 
13 | */ 14 | 15 | #ifndef INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ 16 | #define INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ 17 | #include 18 | 19 | #define RISCV_SHRINK2(op, T, S, v) \ 20 | __riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ 21 | __riscv_vget_##T##S##m1(v, 1), \ 22 | __riscv_vsetvlmax_e##S##m1()) 23 | 24 | #define RISCV_SHRINK4(op, T, S, v) \ 25 | __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ 26 | __riscv_vget_##T##S##m1(v, 1), \ 27 | __riscv_vsetvlmax_e##S##m1()), \ 28 | __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ 29 | __riscv_vget_##T##S##m1(v, 3), \ 30 | __riscv_vsetvlmax_e##S##m1()), \ 31 | __riscv_vsetvlmax_e##S##m1()) 32 | 33 | #define RISCV_SHRINK8(op, T, S, v) \ 34 | __riscv_##op(__riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 0), \ 35 | __riscv_vget_##T##S##m1(v, 1), \ 36 | __riscv_vsetvlmax_e##S##m1()), \ 37 | __riscv_##op(__riscv_vget_##T##S##m1(v, 2), \ 38 | __riscv_vget_##T##S##m1(v, 3), \ 39 | __riscv_vsetvlmax_e##S##m1()), \ 40 | __riscv_vsetvlmax_e##S##m1()), \ 41 | __riscv_##op(__riscv_##op(__riscv_vget_##T##S##m1(v, 4), \ 42 | __riscv_vget_##T##S##m1(v, 5), \ 43 | __riscv_vsetvlmax_e##S##m1()), \ 44 | __riscv_##op(__riscv_vget_##T##S##m1(v, 6), \ 45 | __riscv_vget_##T##S##m1(v, 7), \ 46 | __riscv_vsetvlmax_e##S##m1()), \ 47 | __riscv_vsetvlmax_e##S##m1()), \ 48 | __riscv_vsetvlmax_e##S##m1()) 49 | 50 | #define RISCV_PERM4(f, v, vidx) \ 51 | __riscv_vcreate_v_u8m1_u8m4( \ 52 | f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ 53 | f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ 54 | f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ 55 | f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1())) 56 | 57 | #define RISCV_LUT4(f, vtbl, v) \ 58 | __riscv_vcreate_v_u8m1_u8m4( \ 59 | f(vtbl, __riscv_vget_u8m1(v, 0), __riscv_vsetvlmax_e8m1()), \ 60 | f(vtbl, __riscv_vget_u8m1(v, 1), __riscv_vsetvlmax_e8m1()), \ 61 | f(vtbl, __riscv_vget_u8m1(v, 2), __riscv_vsetvlmax_e8m1()), \ 62 | f(vtbl, 
__riscv_vget_u8m1(v, 3), __riscv_vsetvlmax_e8m1())) 63 | 64 | #define RISCV_PERM8(f, v, vidx) \ 65 | __riscv_vcreate_v_u8m1_u8m8( \ 66 | f(__riscv_vget_u8m1(v, 0), vidx, __riscv_vsetvlmax_e8m1()), \ 67 | f(__riscv_vget_u8m1(v, 1), vidx, __riscv_vsetvlmax_e8m1()), \ 68 | f(__riscv_vget_u8m1(v, 2), vidx, __riscv_vsetvlmax_e8m1()), \ 69 | f(__riscv_vget_u8m1(v, 3), vidx, __riscv_vsetvlmax_e8m1()), \ 70 | f(__riscv_vget_u8m1(v, 4), vidx, __riscv_vsetvlmax_e8m1()), \ 71 | f(__riscv_vget_u8m1(v, 5), vidx, __riscv_vsetvlmax_e8m1()), \ 72 | f(__riscv_vget_u8m1(v, 6), vidx, __riscv_vsetvlmax_e8m1()), \ 73 | f(__riscv_vget_u8m1(v, 7), vidx, __riscv_vsetvlmax_e8m1())) 74 | 75 | #define RISCV_VMFLTZ(T, v, vl) __riscv_vmslt(__riscv_vreinterpret_i##T(v), 0, vl) 76 | 77 | #endif /* INCLUDE_VOLK_VOLK_RVV_INTRINSICS_H_ */ 78 | -------------------------------------------------------------------------------- /tmpl/volk.tmpl.h: -------------------------------------------------------------------------------- 1 | /* -*- c++ -*- */ 2 | /* 3 | * Copyright 2011-2020 Free Software Foundation, Inc. 4 | * 5 | * This file is part of VOLK 6 | * 7 | * SPDX-License-Identifier: LGPL-3.0-or-later 8 | */ 9 | 10 | #ifndef INCLUDED_VOLK_RUNTIME 11 | #define INCLUDED_VOLK_RUNTIME 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | #include 22 | 23 | __VOLK_DECL_BEGIN 24 | 25 | typedef struct volk_func_desc 26 | { 27 | const char **impl_names; 28 | const int *impl_deps; 29 | const bool *impl_alignment; 30 | size_t n_impls; 31 | } volk_func_desc_t; 32 | 33 | //! Prints a list of machines available 34 | VOLK_API void volk_list_machines(void); 35 | 36 | //! Returns the name of the machine this instance will use 37 | VOLK_API const char* volk_get_machine(void); 38 | 39 | //! Get the machine alignment in bytes 40 | VOLK_API size_t volk_get_alignment(void); 41 | 42 | /*! 
43 | * The VOLK_OR_PTR macro is a convenience macro 44 | * for checking the alignment of a set of pointers. 45 | * Example usage: 46 | * volk_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2)) 47 | */ 48 | #define VOLK_OR_PTR(ptr0, ptr1) \ 49 | (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1))) 50 | 51 | /*! 52 | * Is the pointer on a machine alignment boundary? 53 | * 54 | * Note: for performance reasons, this function 55 | * is not usable until another volk API call is made 56 | * which will perform certain initialization tasks. 57 | * 58 | * \param ptr the pointer to some memory buffer 59 | * \return 1 for alignment boundary, else 0 60 | */ 61 | VOLK_API bool volk_is_aligned(const void *ptr); 62 | 63 | // Just drop the deprecated attribute in case we are on Windows. Clang and GCC support `__attribute__`. 64 | // We just assume the compiler and the system are tied together as far as Mako templates are concerned. 65 | <% 66 | deprecated_kernels = ('volk_16i_x5_add_quad_16i_x4', 'volk_16i_branch_4_state_8', 67 | 'volk_16i_max_star_16i', 'volk_16i_max_star_horizontal_16i', 68 | 'volk_16i_permute_and_scalar_add', 'volk_16i_x4_quad_max_star_16i', 69 | 'volk_32fc_s32fc_multiply_32fc', 'volk_32fc_s32fc_x2_rotator_32fc', 70 | 'volk_32fc_x2_s32fc_multiply_conjugate_add_32fc') 71 | from platform import system 72 | if system() == 'Windows': 73 | deprecated_kernels = () 74 | %> 75 | %for kern in kernels: 76 | 77 | % if kern.name in deprecated_kernels: 78 | //! A function pointer to the dispatcher implementation 79 | extern VOLK_API ${kern.pname} ${kern.name} __attribute__((deprecated)); 80 | 81 | //! A function pointer to the fastest aligned implementation 82 | extern VOLK_API ${kern.pname} ${kern.name}_a __attribute__((deprecated)); 83 | 84 | //! A function pointer to the fastest unaligned implementation 85 | extern VOLK_API ${kern.pname} ${kern.name}_u __attribute__((deprecated)); 86 | 87 | //! 
Call into a specific implementation given by name 88 | extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) __attribute__((deprecated)); 89 | 90 | //! Get description parameters for this kernel 91 | extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void) __attribute__((deprecated)); 92 | % else: 93 | //! A function pointer to the dispatcher implementation 94 | extern VOLK_API ${kern.pname} ${kern.name}; 95 | 96 | //! A function pointer to the fastest aligned implementation 97 | extern VOLK_API ${kern.pname} ${kern.name}_a; 98 | 99 | //! A function pointer to the fastest unaligned implementation 100 | extern VOLK_API ${kern.pname} ${kern.name}_u; 101 | 102 | //! Call into a specific implementation given by name 103 | extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name); 104 | 105 | //! Get description parameters for this kernel 106 | extern VOLK_API volk_func_desc_t ${kern.name}_get_func_desc(void); 107 | % endif 108 | 109 | %endfor 110 | 111 | __VOLK_DECL_END 112 | 113 | #endif /*INCLUDED_VOLK_RUNTIME*/ 114 | -------------------------------------------------------------------------------- /kernels/volk/asm/neon/volk_32f_x2_dot_prod_32f_a_neonasm_opts.s: -------------------------------------------------------------------------------- 1 | @ static inline void volk_32f_x2_dot_prod_32f_a_neonasm_opts(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); 2 | @ r0 = cVector 3 | @ r1 = aVector 4 | @ r2 = bVector 5 | @ r3 = num_points 6 | .global volk_32f_x2_dot_prod_32f_a_neonasm_opts 7 | volk_32f_x2_dot_prod_32f_a_neonasm_opts: 8 | push {r4, r5, r6, r7, r8, r9, r10, r11} 9 | @ sixteenth_points = num_points / 16 10 | lsrs r8, r3, #4 11 | sub r13, r13, #16 @ subtracting 16 from stack pointer?, wat? 
12 | @ 0 out neon accumulators 13 | veor q0, q3, q3 14 | veor q1, q3, q3 15 | veor q2, q3, q3 16 | veor q3, q3, q3 17 | beq .smallvector @ if less than 16 points skip main loop 18 | mov r7, r2 @ copy input ptrs 19 | mov r6, r1 @ copy input ptrs 20 | mov r5, #0 @ loop counter 21 | .mainloop: 22 | vld4.32 {d16,d18,d20,d22}, [r6]! 23 | add r5, r5, #1 @ inc loop counter 24 | cmp r5, r8 @ loop counter < sixteenth_points? 25 | vld4.32 {d24,d26,d28,d30}, [r7]! 26 | vld4.32 {d17,d19,d21,d23}, [r6]! 27 | vld4.32 {d25,d27,d29,d31}, [r7]! 28 | vmla.f32 q3, q8, q12 29 | vmla.f32 q0, q13, q9 30 | vmla.f32 q1, q14, q10 31 | vmla.f32 q2, q15, q11 32 | bne .mainloop 33 | lsl r12, r8, #6 @ r12 = r8 * 64 (byte offset consumed by the main loop) 34 | add r1, r1, r12 35 | add r2, r2, r12 36 | .smallvector: @ actually this can be skipped for small vectors 37 | vadd.f32 q3, q3, q0 38 | lsl r8, r8, #4 @ sixteenth_points * 16 39 | cmp r3, r8 @ num_points < sixteenth_points*16? 40 | vadd.f32 q2, q1, q2 41 | vadd.f32 q3, q2, q3 @ sum of 4 accumulators in to q3 42 | vadd.f32 s15, s12, s15 @ q3 is s12-s15, so reduce to a single float 43 | vadd.f32 s15, s15, s13 44 | vadd.f32 s15, s15, s14 45 | bls .done @ if vector is multiple of 16 then finish 46 | sbfx r11, r1, #2, #1 @ check alignment 47 | rsb r9, r8, r3 48 | and r11, r11, #3 49 | mov r6, r1 50 | cmp r11, r9 51 | movcs r11, r9 52 | cmp r9, #3 53 | movls r11, r9 54 | cmp r11, #0 55 | beq .nothingtodo 56 | mov r5, r2 57 | mov r12, r8 58 | .dlabel5: 59 | add r12, r12, #1 60 | vldmia r6!, {s14} 61 | rsb r4, r8, r12 62 | vldmia r5!, {s13} 63 | cmp r4, r11 64 | vmla.f32 s15, s13, s14 65 | mov r7, r6 66 | mov r4, r5 67 | bcc .dlabel5 68 | cmp r9, r11 69 | beq .done 70 | .dlabel8: 71 | rsb r9, r11, r9 72 | lsr r8, r9, #2 73 | lsls r10, r8, #2 74 | beq .dlabel6 75 | lsl r6, r11, #2 76 | veor q8, q8, q8 77 | add r1, r1, r6 78 | add r6, r2, r6 79 | mov r5, #0 80 | .dlabel9: 81 | add r5, r5, #1 82 | vld1.32 {d20-d21}, [r6]! 
83 | cmp r5, r8 84 | vld1.64 {d18-d19}, [r1 :64]! 
85 | vmla.f32 q8, q10, q9 86 | bcc .dlabel9 87 | vadd.f32 d16, d16, d17 88 | lsl r2, r10, #2 89 | veor q9, q9, q9 90 | add r7, r7, r2 91 | vpadd.f32 d6, d16, d16 92 | add r4, r4, r2 93 | cmp r9, r10 94 | add r12, r12, r10 95 | vadd.f32 s15, s15, s12 96 | beq .done 97 | .dlabel6: 98 | mov r2, r7 99 | .dlabel7: 100 | add r12, r12, #1 101 | vldmia r2!, {s13} 102 | cmp r3, r12 103 | vldmia r4!, {s14} 104 | vmla.f32 s15, s13, s14 105 | bhi .dlabel7 106 | .done: 107 | vstr s15, [r0] 108 | add r13, r13, #16 109 | pop {r4, r5, r6, r7, r8, r9, r10, r11} 110 | bx lr @ lr is the return address 111 | .nothingtodo: 112 | mov r12, r8 113 | mov r4, r2 114 | mov r7, r1 115 | b .dlabel8 116 | 117 | --------------------------------------------------------------------------------